diff --git a/src/ConvNetSharp.Core/ConvNetSharp.Core.Nuget.csproj b/src/ConvNetSharp.Core/ConvNetSharp.Core.Nuget.csproj
index 0ccb0329..e9858f42 100644
--- a/src/ConvNetSharp.Core/ConvNetSharp.Core.Nuget.csproj
+++ b/src/ConvNetSharp.Core/ConvNetSharp.Core.Nuget.csproj
@@ -1,8 +1,9 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
-
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
+    <Version>0.4.14</Version>
diff --git a/src/ConvNetSharp.Core/ConvNetSharp.Core.csproj b/src/ConvNetSharp.Core/ConvNetSharp.Core.csproj
index 9e4cff38..e1ab2e78 100644
--- a/src/ConvNetSharp.Core/ConvNetSharp.Core.csproj
+++ b/src/ConvNetSharp.Core/ConvNetSharp.Core.csproj
@@ -1,7 +1,8 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
diff --git a/src/ConvNetSharp.Core/Net.cs b/src/ConvNetSharp.Core/Net.cs
index 449336a5..1f25ba9b 100644
--- a/src/ConvNetSharp.Core/Net.cs
+++ b/src/ConvNetSharp.Core/Net.cs
@@ -27,7 +27,8 @@ public T GetCostLoss(Volume<T> input, Volume<T> y)
         {
             this.Forward(input);

-            if (this.Layers[^1] is ILastLayer lastLayer)
+            var n = this.Layers.Count;
+            if (this.Layers[n - 1] is ILastLayer lastLayer)
             {
                 lastLayer.Backward(y, out var loss);
                 return loss;
@@ -58,7 +59,8 @@ public int[] GetPrediction()
         {
             // this is a convenience function for returning the argmax
             // prediction, assuming the last layer of the net is a softmax
-            if (!(this.Layers[^1] is SoftmaxLayer softmaxLayer))
+            var ln = this.Layers.Count;
+            if (!(this.Layers[ln - 1] is SoftmaxLayer softmaxLayer))
             {
                 throw new Exception("GetPrediction function assumes softmax as last layer of the net!");
             }
@@ -109,10 +111,11 @@ public void AddLayer(LayerBase layer)

             if (this.Layers.Count > 0)
             {
-                inputWidth = this.Layers[^1].OutputWidth;
-                inputHeight = this.Layers[^1].OutputHeight;
-                inputDepth = this.Layers[^1].OutputDepth;
-                lastLayer = this.Layers[^1];
+                var n = this.Layers.Count;
+                inputWidth = this.Layers[n - 1].OutputWidth;
+                inputHeight = this.Layers[n - 1].OutputHeight;
+                inputDepth = this.Layers[n - 1].OutputDepth;
+                lastLayer = this.Layers[n - 1];
             }
             else if (!(layer is InputLayer))
             {
diff --git a/src/ConvNetSharp.Flow/ConvNetSharp.Flow.Nuget.csproj b/src/ConvNetSharp.Flow/ConvNetSharp.Flow.Nuget.csproj
index 75d59359..15f23971 100644
--- a/src/ConvNetSharp.Flow/ConvNetSharp.Flow.Nuget.csproj
+++ b/src/ConvNetSharp.Flow/ConvNetSharp.Flow.Nuget.csproj
@@ -1,8 +1,9 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
-
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
+    <Version>0.4.14</Version>
diff --git a/src/ConvNetSharp.Flow/ConvNetSharp.Flow.csproj b/src/ConvNetSharp.Flow/ConvNetSharp.Flow.csproj
index b8e2222e..05ff697e 100644
--- a/src/ConvNetSharp.Flow/ConvNetSharp.Flow.csproj
+++ b/src/ConvNetSharp.Flow/ConvNetSharp.Flow.csproj
@@ -1,8 +1,9 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
-
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
+
diff --git a/src/ConvNetSharp.Utils/ConvNetSharp.Utils.Nuget.csproj b/src/ConvNetSharp.Utils/ConvNetSharp.Utils.Nuget.csproj
index 3bfc323a..2c32f840 100644
--- a/src/ConvNetSharp.Utils/ConvNetSharp.Utils.Nuget.csproj
+++ b/src/ConvNetSharp.Utils/ConvNetSharp.Utils.Nuget.csproj
@@ -1,7 +1,8 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
diff --git a/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.Nuget.csproj b/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.Nuget.csproj
index 4cf7fc84..432df6d4 100644
--- a/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.Nuget.csproj
+++ b/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.Nuget.csproj
@@ -1,7 +1,8 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
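Note on the Net.cs hunks above: they replace the C# 8 index-from-end operator `[^1]` with an explicit `Count - 1` index. The operator is a language feature, but it lowers to `System.Index`, which ships with netstandard2.1 and not netstandard2.0, so after the retarget it no longer compiles even with `<LangVersion>8.0</LangVersion>`. A minimal standalone sketch of the two spellings (illustrative list, not ConvNetSharp's actual types):

```csharp
using System;
using System.Collections.Generic;

static class LastLayerAccess
{
    static void Main()
    {
        var layers = new List<string> { "input", "conv", "softmax" };

        // netstandard2.0-friendly form, as introduced by the diff:
        var n = layers.Count;
        Console.WriteLine(layers[n - 1]); // softmax

        // The netstandard2.1 equivalent would be layers[^1]; the compiler
        // rewrites it in terms of System.Index, which netstandard2.0 lacks.
    }
}
```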
diff --git a/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.csproj b/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.csproj
index 15e8bb3f..c6fbf541 100644
--- a/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.csproj
+++ b/src/ConvNetSharp.Volume.GPU/ConvNetSharp.Volume.GPU.csproj
@@ -1,6 +1,7 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
     true
diff --git a/src/ConvNetSharp.Volume.GPU/Double/Volume.cs b/src/ConvNetSharp.Volume.GPU/Double/Volume.cs
index 548ccfb2..0e98e70e 100644
--- a/src/ConvNetSharp.Volume.GPU/Double/Volume.cs
+++ b/src/ConvNetSharp.Volume.GPU/Double/Volume.cs
@@ -288,10 +288,13 @@ public override void Convolution(Volume<double> filters, int xpad, int ypad, int
                 result.Shape.Dimensions[1],
                 result.Shape.Dimensions[0]);

-            var algo = this._context.CudnnContext.GetConvolutionForwardAlgorithm(
-                dataDesc, filterDesc,
-                convolutionDesc, outputDesc,
-                cudnnConvolutionFwdPreference.PreferFastest, IntPtr.Zero);
+            var algo = this._context.CudnnContext.FindConvolutionForwardAlgorithm(
+                dataDesc,
+                filterDesc,
+                convolutionDesc,
+                outputDesc,
+                1
+            ).First().algo;

             var workspaceSize = this._context.CudnnContext.GetConvolutionForwardWorkspaceSize(
                 dataDesc, filterDesc,
@@ -373,14 +376,24 @@ public override void ConvolutionGradient(Volume<double> filters, Volume
                 filters.Shape.Dimensions[1],
                 filters.Shape.Dimensions[0]);

-            var filterAlgo = this._context.CudnnContext.GetConvolutionBackwardFilterAlgorithm(dataDesc, dOutputDesc,
-                convolutionDesc, dfilterDesc, cudnnConvolutionBwdFilterPreference.PreferFastest, IntPtr.Zero);
+            var filterAlgo = this._context.CudnnContext.FindConvolutionBackwardFilterAlgorithm(
+                dataDesc,
+                dOutputDesc,
+                convolutionDesc,
+                dfilterDesc,
+                1
+            ).First().algo;

             var filterWorkspaceSize = this._context.CudnnContext.GetConvolutionBackwardFilterWorkspaceSize(dataDesc, dOutputDesc,
                 convolutionDesc, dfilterDesc, filterAlgo);
             filterWorkspaceSize = filterWorkspaceSize == 0 ? new SizeT(1) : filterWorkspaceSize;

-            var dataAlgo = this._context.CudnnContext.GetConvolutionBackwardDataAlgorithm(filterDesc, dOutputDesc,
-                convolutionDesc, dDataDesc, cudnnConvolutionBwdDataPreference.PreferFastest, IntPtr.Zero);
+            var dataAlgo = this._context.CudnnContext.FindConvolutionBackwardDataAlgorithm(
+                filterDesc,
+                dOutputDesc,
+                convolutionDesc,
+                dDataDesc,
+                1
+            ).First().algo;

             var dataWorkspaceSize = this._context.CudnnContext.GetConvolutionBackwardDataWorkspaceSize(dfilterDesc, dOutputDesc,
                 convolutionDesc, dDataDesc, dataAlgo);
             dataWorkspaceSize = dataWorkspaceSize == 0 ? new SizeT(1) : dataWorkspaceSize;
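Note on the cuDNN hunks: the deprecated `GetConvolution*Algorithm` heuristics (removed from cuDNN's API in version 8) are swapped for the `FindConvolution*Algorithm` calls, which benchmark the candidate algorithms and return their perf results ranked fastest-first; the diff requests a single result and takes its `algo` field. The same change is mirrored in `Single/Volume.cs` below. A self-contained sketch of the selection pattern (stand-in tuples, not ManagedCuda's real perf structs):

```csharp
using System;
using System.Linq;

// cuDNN's Find*Algorithm entry points return perf results ordered
// fastest-first; the diff simply takes the first entry via .First().algo.
static class AlgoSelection
{
    static void Main()
    {
        var ranked = new[]
        {
            (algo: 2, ok: false, timeMs: 0.9), // fastest, but unsupported
            (algo: 0, ok: true,  timeMs: 1.3),
            (algo: 1, ok: true,  timeMs: 2.7),
        };

        // A plain .First().algo (what the diff does) assumes the top entry
        // succeeded; filtering on the status flag first is the defensive
        // variant when more than one result is requested.
        var best = ranked.First(r => r.ok).algo;
        Console.WriteLine($"selected algo {best}");
    }
}
```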
diff --git a/src/ConvNetSharp.Volume.GPU/Single/Volume.cs b/src/ConvNetSharp.Volume.GPU/Single/Volume.cs
index 29ef7999..754a8ba4 100644
--- a/src/ConvNetSharp.Volume.GPU/Single/Volume.cs
+++ b/src/ConvNetSharp.Volume.GPU/Single/Volume.cs
@@ -287,10 +287,13 @@ public override void Convolution(Volume<float> filters, int xpad, int ypad, int
                 result.Shape.Dimensions[1],
                 result.Shape.Dimensions[0]);

-            var algo = this._context.CudnnContext.GetConvolutionForwardAlgorithm(
-                dataDesc, filterDesc,
-                convolutionDesc, outputDesc,
-                cudnnConvolutionFwdPreference.PreferFastest, IntPtr.Zero);
+            var algo = this._context.CudnnContext.FindConvolutionForwardAlgorithm(
+                dataDesc,
+                filterDesc,
+                convolutionDesc,
+                outputDesc,
+                1
+            ).First().algo;

             var workspaceSize = this._context.CudnnContext.GetConvolutionForwardWorkspaceSize(
                 dataDesc, filterDesc,
@@ -379,14 +382,24 @@ public override void ConvolutionGradient(Volume<float> filters, Volume<float> ou
                 filters.Shape.Dimensions[1],
                 filters.Shape.Dimensions[0]);

-            var filterAlgo = this._context.CudnnContext.GetConvolutionBackwardFilterAlgorithm(dataDesc, dOutputDesc,
-                convolutionDesc, dfilterDesc, cudnnConvolutionBwdFilterPreference.PreferFastest, IntPtr.Zero);
+            var filterAlgo = this._context.CudnnContext.FindConvolutionBackwardFilterAlgorithm(
+                dataDesc,
+                dOutputDesc,
+                convolutionDesc,
+                dfilterDesc,
+                1
+            ).First().algo;

             var filterWorkspaceSize = this._context.CudnnContext.GetConvolutionBackwardFilterWorkspaceSize(dataDesc, dOutputDesc,
                 convolutionDesc, dfilterDesc, filterAlgo);
             filterWorkspaceSize = filterWorkspaceSize == 0 ? new SizeT(1) : filterWorkspaceSize;

-            var dataAlgo = this._context.CudnnContext.GetConvolutionBackwardDataAlgorithm(filterDesc, dOutputDesc,
-                convolutionDesc, dDataDesc, cudnnConvolutionBwdDataPreference.PreferFastest, IntPtr.Zero);
+            var dataAlgo = this._context.CudnnContext.FindConvolutionBackwardDataAlgorithm(
+                filterDesc,
+                dOutputDesc,
+                convolutionDesc,
+                dDataDesc,
+                1
+            ).First().algo;

             var dataWorkspaceSize = this._context.CudnnContext.GetConvolutionBackwardDataWorkspaceSize(dfilterDesc, dOutputDesc,
                 convolutionDesc, dDataDesc, dataAlgo);
             dataWorkspaceSize = dataWorkspaceSize == 0 ? new SizeT(1) : dataWorkspaceSize;
diff --git a/src/ConvNetSharp.Volume/ConvNetSharp.Volume.csproj b/src/ConvNetSharp.Volume/ConvNetSharp.Volume.csproj
index 1f4da1ab..efa484f9 100644
--- a/src/ConvNetSharp.Volume/ConvNetSharp.Volume.csproj
+++ b/src/ConvNetSharp.Volume/ConvNetSharp.Volume.csproj
@@ -1,7 +1,8 @@
 <Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
-    <TargetFramework>netstandard2.1</TargetFramework>
+    <TargetFramework>netstandard2.0</TargetFramework>
+    <LangVersion>8.0</LangVersion>
diff --git a/src/external/ManagedCuda/CudaBlas.XML b/src/external/ManagedCuda/CudaBlas.XML
index 14b922bc..87f7ac60 100644
--- a/src/external/ManagedCuda/CudaBlas.XML
+++ b/src/external/ManagedCuda/CudaBlas.XML
Elements in both vectors are assumed to have a + size of elemSize bytes. Storage spacing between consecutive elements + is incx for the source vector x and incy for the destination vector + y. In general, y points to an object, or part of an object, allocated + via cublasAlloc(). Column major format for two-dimensional matrices + is assumed throughout CUBLAS. Therefore, if the increment for a vector + is equal to 1, this access a column vector while using an increment + equal to the leading dimension of the respective matrix accesses a + row vector. + + CudaBlas Error Codes: , , + , . + - + + copies n elements from a vector x in GPU memory space to a vector y + in CPU memory space. Elements in both vectors are assumed to have a + size of elemSize bytes. Storage spacing between consecutive elements + is incx for the source vector x and incy for the destination vector + y. In general, x points to an object, or part of an object, allocated + via cublasAlloc(). Column major format for two-dimensional matrices + is assumed throughout CUBLAS. Therefore, if the increment for a vector + is equal to 1, this access a column vector while using an increment + equal to the leading dimension of the respective matrix accesses a + row vector. + + CudaBlas Error Codes: , , + , . + - + + copies a tile of rows x cols elements from a matrix A in CPU memory + space to a matrix B in GPU memory space. Each element requires storage + of elemSize bytes. Both matrices are assumed to be stored in column + major format, with the leading dimension (i.e. number of rows) of + source matrix A provided in lda, and the leading dimension of matrix B + provided in ldb. In general, B points to an object, or part of an + object, that was allocated via cublasAlloc(). + + CudaBlas Error Codes: , , + , . + - + - An CudaBlasException is thrown, if any wrapped call to the CUBLAS-library does not return . + copies a tile of rows x cols elements from a matrix A in GPU memory + space to a matrix B in CPU memory space. Each element requires storage + of elemSize bytes. Both matrices are assumed to be stored in column + major format, with the leading dimension (i.e. number of rows) of + source matrix A provided in lda, and the leading dimension of matrix B + provided in ldb. In general, A points to an object, or part of an + object, that was allocated via cublasAlloc(). + + CudaBlas Error Codes: , , + , . + - + - + cublasSetVectorAsync has the same functionnality as cublasSetVector + but the transfer is done asynchronously within the CUDA stream passed + in parameter. + + CudaBlas Error Codes: , , + , . + - + - + cublasGetVectorAsync has the same functionnality as cublasGetVector + but the transfer is done asynchronously within the CUDA stream passed + in parameter. - - + + CudaBlas Error Codes: , , + , . + - + - + cublasSetMatrixAsync has the same functionnality as cublasSetMatrix + but the transfer is done asynchronously within the CUDA stream passed + in parameter. - + + CudaBlas Error Codes: , , + , . + - + - + cublasGetMatrixAsync has the same functionnality as cublasGetMatrix + but the transfer is done asynchronously within the CUDA stream passed + in parameter. - + + CudaBlas Error Codes: , , + , . 
+ - + - - - - + - - - - - + - - - + - - - - + - - + - Wrapper for CUBLAS - + - Creates a new cudaBlas handler - + - Creates a new cudaBlas handler - + - Creates a new cudaBlas handler - + - Creates a new cudaBlas handler - + - Creates a new cudaBlas handler - + - Creates a new cudaBlas handler - + - Creates a new cudaBlas handler - + - Creates a new cudaBlas handler - + - For dispose - + - Dispose - + - For IDisposable - - + - Returns the wrapped cublas handle - + - - + - - + - - + - - + - - + - This function copies the vector x into the vector y. - - - - - + - This function copies the vector x into the vector y. - - - - - + - This function copies the vector x into the vector y. - - - - - + - This function copies the vector x into the vector y. - - - - - + - This function copies the vector x into the vector y. - - - - - + - This function copies the vector x into the vector y. - - - - - + - This function copies the vector x into the vector y. - - - - - + - This function copies the vector x into the vector y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function interchanges the elements of vector x and y. - - - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the Euclidean norm of the vector x. - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the Euclidean norm of the vector x. - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the Euclidean norm of the vector x. - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the Euclidean norm of the vector x. - - - + - This function computes the Euclidean norm of the vector x. - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - - - - - + - This function computes the dot product of vectors x and y. - - - - - - + - This function computes the dot product of vectors x and y. - Notice that the conjugate of the element of vector x should be used. 
- - - - - - + - This function computes the dot product of vectors x and y. - Notice that the conjugate of the element of vector x should be used. - - - - - + - This function computes the dot product of vectors x and y. - Notice that the conjugate of the element of vector x should be used. - - - - - - + - This function computes the dot product of vectors x and y. - Notice that the conjugate of the element of vector x should be used. - - - - - - + - This function computes the dot product of vectors x and y. - Notice that the conjugate of the element of vector x should be used. - - - - - + - This function computes the dot product of vectors x and y. - Notice that the conjugate of the element of vector x should be used. - - - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function scales the vector x by the scalar and overwrites it with the result. - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function multiplies the vector x by the scalar and adds it to the vector y overwriting - the latest vector with the result. - - - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. 
- First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the minimum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - + - This function finds the (smallest) index of the element of the maximum magnitude. - First index starts at 1 (Fortran notation) - - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - - + - This function computes the sum of the absolute values of the elements of vector x. 
- - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - + - This function computes the sum of the absolute values of the elements of vector x. - - - - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function applies Givens rotation matrix G = |c s; -s c| to vectors x and y. - - - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)T - - - Cosine component - Sine component - + - This function applies the modified Givens transformation H = |h11 h12; h21 h22| to vectors x and y. 
- The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function applies the modified Givens transformation H = |h11 h12; h21 h22| to vectors x and y. - The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function applies the modified Givens transformation H = |h11 h12; h21 h22| to vectors x and y. - The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function applies the modified Givens transformation H = |h11 h12; h21 h22| to vectors x and y. - The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function constructs the modified Givens transformation H = |h11 h12; h21 h22| that zeros out the second entry of a 2x1 vector - [sqrt(d1)*x1; sqrt(d2)*y1]. - The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function constructs the modified Givens transformation H = |h11 h12; h21 h22| that zeros out the second entry of a 2x1 vector - [sqrt(d1)*x1; sqrt(d2)*y1]. - The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function constructs the modified Givens transformation H = |h11 h12; h21 h22| that zeros out the second entry of a 2x1 vector - [sqrt(d1)*x1; sqrt(d2)*y1]. 
- The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function constructs the modified Givens transformation H = |h11 h12; h21 h22| that zeros out the second entry of a 2x1 vector - [sqrt(d1)*x1; sqrt(d2)*y1]. - The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively. - The flag = param[0] defines the following predefined values for the matrix H entries: - flag=-1.0: H = |h11 h12; h21 h22| - flag= 0.0: H = |1.0 h12; h21 1.0| - flag= 1.0: H = |h11 1.0; -1.0 h22| - flag=-2.0: H = |1.0 0.0; 0.0 1.0| - Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param. - - - - - - + - This function performs the triangular matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in - lower or upper mode with or without the main diagonal, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in - lower or upper mode with or without the main diagonal, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in - lower or upper mode with or without the main diagonal, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in - lower or upper mode with or without the main diagonal, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. 
- operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular banded matrix-vector multiplication x= Op(A) x where A is a triangular banded matrix, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular banded matrix-vector multiplication x= Op(A) x where A is a triangular banded matrix, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular banded matrix-vector multiplication x= Op(A) x where A is a triangular banded matrix, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular banded matrix-vector multiplication x= Op(A) x where A is a triangular banded matrix, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. 
- indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or - upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or - upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. 
- + - This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or - upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or - upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. 
- operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - array of dimensions lda * n, with lda >= max(1,n). - vector with n elements. - stride between consecutive elements of x. - + - This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b is a vector. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b is a vector. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b is a vector. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. 
- stride between consecutive elements of x. - + - This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b is a vector. - The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function. - indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. - operation op(A) that is non- or (conj.) transpose. - indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. - number of sub- and super-diagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. 
- scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - number of subdiagonals of matrix A. - number of superdiagonals of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. 
- stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - number of subdiagonals of matrix A. - number of superdiagonals of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - scalar used for multiplication. - number of subdiagonals of matrix A. - number of superdiagonals of matrix A. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - number of subdiagonals of matrix A. - number of superdiagonals of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. - operation op(A) that is non- or (conj.) transpose. - number of rows of matrix A. - number of columns of matrix A. - number of subdiagonals of matrix A. - number of superdiagonals of matrix A. - scalar used for multiplication. - array of dimensions lda * n, with lda >= max(1,n). - leading dimension of two-dimensional array used to store matrix A. - vector with n elements. - stride between consecutive elements of x. - scalar used for multiplication, if beta==0 then y does not have to be a valid input. - vector with n elements. - stride between consecutive elements of y. - + - This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, - x and y are vectors, and alpha and beta are scalars. 
- /// <summary>
- /// This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y, where A is an n*n
- /// symmetric matrix stored in lower or upper mode, x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= max(1,n).</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="beta">scalar used for multiplication; if beta == 0 then y does not have to be a valid input.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
-
- /// <summary>
- /// This function performs the Hermitian matrix-vector multiplication y = alpha * A * x + beta * y, where A is an n*n
- /// Hermitian matrix stored in lower or upper mode, x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= max(1,n).</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="beta">scalar used for multiplication; if beta == 0 then y does not have to be a valid input.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
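A short sketch of the uplo convention described above, assuming the standard meaning of lower/upper mode: only the stored triangle of A is ever read, and the mirrored index supplies the other half by symmetry. Names and signature are hypothetical.

```csharp
static class SymvReference
{
    // y = alpha * A * x + beta * y for a symmetric A stored in one triangle.
    public static void Symv(bool lower, int n, double alpha,
                            double[] a, int lda, double[] x, int incx,
                            double beta, double[] y, int incy)
    {
        for (int i = 0; i < n; i++)
        {
            double sum = 0.0;
            for (int j = 0; j < n; j++)
            {
                // Read only the stored triangle; mirror indices for the rest.
                bool stored = lower ? i >= j : i <= j;
                double aij = stored ? a[i + j * lda] : a[j + i * lda];
                sum += aij * x[j * incx];
            }
            y[i * incy] = alpha * sum + (beta == 0.0 ? 0.0 : beta * y[i * incy]);
        }
    }
}
```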
- /// <summary>
- /// This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y, where A is an n*n
- /// symmetric matrix with k subdiagonals and superdiagonals, x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="k">number of sub- and super-diagonals of matrix A.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= k + 1.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="beta">scalar used for multiplication; if beta == 0 then y does not have to be a valid input.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
-
- /// <summary>
- /// This function performs the Hermitian banded matrix-vector multiplication y = alpha * A * x + beta * y, where A is an n*n
- /// Hermitian matrix with k subdiagonals and superdiagonals, x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="k">number of sub- and super-diagonals of matrix A.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= k + 1.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="beta">scalar used for multiplication; if beta == 0 then y does not have to be a valid input.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
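The comments above do not spell out the banded layout itself. The sketch below assumes the standard BLAS banded convention (an assumption, not taken from this diff): an n*n symmetric matrix with k sub- and super-diagonals is packed into a (k+1)-row column-major array with lda >= k + 1, which is why the lda bound differs from the dense case.

```csharp
using System;

static class SbmvReference
{
    // y = alpha * A * x + beta * y, upper-mode banded storage:
    // A(i,j) with i <= j and j - i <= k is stored at a[(k + i - j) + j * lda].
    public static void SbmvUpper(int n, int k, double alpha, double[] a, int lda,
                                 double[] x, double beta, double[] y)
    {
        for (int i = 0; i < n; i++)
        {
            double sum = 0.0;
            // Only entries within the band contribute.
            for (int j = Math.Max(0, i - k); j <= Math.Min(n - 1, i + k); j++)
            {
                int row = Math.Min(i, j), col = Math.Max(i, j); // stored triangle
                sum += a[(k + row - col) + col * lda] * x[j];
            }
            y[i] = alpha * sum + (beta == 0.0 ? 0.0 : beta * y[i]);
        }
    }
}
```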
- /// <summary>
- /// This function performs the symmetric packed matrix-vector multiplication y = alpha * A * x + beta * y, where A is an n*n
- /// symmetric matrix stored in packed format, x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="AP">array with A stored in packed format.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="beta">scalar used for multiplication; if beta == 0 then y does not have to be a valid input.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
-
- /// <summary>
- /// This function performs the Hermitian packed matrix-vector multiplication y = alpha * A * x + beta * y, where A is an n*n
- /// Hermitian matrix stored in packed format, x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="AP">array with A stored in packed format.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="beta">scalar used for multiplication; if beta == 0 then y does not have to be a valid input.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
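Packed routines drop the lda parameter because only one triangle is stored contiguously, n*(n+1)/2 elements in total. This sketch assumes the standard packed layout (upper mode: A(i,j) with i <= j lives at AP[i + j*(j+1)/2]); the helper and class names are hypothetical.

```csharp
static class SpmvReference
{
    // Upper-mode packed access with symmetric mirroring.
    static double UpperPacked(double[] ap, int i, int j)
    {
        if (i > j) (i, j) = (j, i);          // symmetry: read the mirrored entry
        return ap[i + j * (j + 1) / 2];
    }

    public static void Spmv(int n, double alpha, double[] ap,
                            double[] x, double beta, double[] y)
    {
        for (int i = 0; i < n; i++)
        {
            double sum = 0.0;
            for (int j = 0; j < n; j++) sum += UpperPacked(ap, i, j) * x[j];
            y[i] = alpha * sum + (beta == 0.0 ? 0.0 : beta * y[i]);
        }
    }
}
```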
- /// <summary>
- /// This function performs the rank-1 update A = alpha * x * y^T + A, where A is an m*n matrix stored in column-major format,
- /// x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
- /// </summary>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with m elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= max(1,m).</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
-
- /// <summary>
- /// This function performs the rank-1 update A = alpha * x * y^H + A, where A is an m*n matrix stored in column-major format,
- /// x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
- /// </summary>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with m elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= max(1,m).</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
-
- /// <summary>
- /// This function performs the symmetric rank-1 update A = alpha * x * x^T + A, where A is an n*n symmetric matrix stored in
- /// column-major format, x is a vector, and alpha is a scalar. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other symmetric part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= max(1,n).</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
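The rank-1 update is why x is documented with m elements and y with n: every element of A(i,j) receives exactly one product x(i)*y(j). A minimal CPU sketch (illustrative names):

```csharp
static class GerReference
{
    // A = alpha * x * y^T + A, with A m*n column-major, x length m, y length n.
    public static void Ger(int m, int n, double alpha,
                           double[] x, int incx, double[] y, int incy,
                           double[] a, int lda)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i < m; i++)
                a[i + j * lda] += alpha * x[i * incx] * y[j * incy];
    }
}
```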
- /// <summary>
- /// This function performs the Hermitian rank-1 update A = alpha * x * x^H + A, where A is an n*n Hermitian matrix stored in
- /// column-major format, x is a vector, and alpha is a scalar. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= max(1,n).</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
-
- /// <summary>
- /// This function performs the packed symmetric rank-1 update A = alpha * x * x^T + A, where A is an n*n symmetric matrix
- /// stored in packed format, x is a vector, and alpha is a scalar. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other symmetric part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="AP">array with A stored in packed format.</param>
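Combining the packed layout with the rank-1 update gives spr. The sketch below (upper mode, hypothetical names) writes only the stored triangle, which is the whole point of the uplo parameter:

```csharp
static class SprReference
{
    // A = alpha * x * x^T + A with only the upper triangle stored:
    // A(i,j) for i <= j lives at ap[i + j * (j + 1) / 2].
    public static void SprUpper(int n, double alpha, double[] x, int incx, double[] ap)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i <= j; i++)
                ap[i + j * (j + 1) / 2] += alpha * x[i * incx] * x[j * incx];
    }
}
```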
- /// <summary>
- /// This function performs the packed Hermitian rank-1 update A = alpha * x * x^H + A, where A is an n*n Hermitian matrix
- /// stored in packed format, x is a vector, and alpha is a scalar. n is given by x.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="AP">array with A stored in packed format.</param>
-
- /// <summary>
- /// This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A, where A is an n*n symmetric matrix
- /// stored in column-major format, x and y are vectors, and alpha is a scalar. n is given by x.Size = y.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other symmetric part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
- /// <param name="A">array of dimensions lda * n, with lda >= max(1,n).</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
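A sketch of the rank-2 formula as corrected above, A = alpha * (x * y^T + y * x^T) + A, writing only the stored (here lower) triangle; names are illustrative:

```csharp
static class Syr2Reference
{
    public static void Syr2Lower(int n, double alpha, double[] x, double[] y,
                                 double[] a, int lda)
    {
        for (int j = 0; j < n; j++)
            for (int i = j; i < n; i++)   // lower triangle only
                a[i + j * lda] += alpha * (x[i] * y[j] + y[i] * x[j]);
    }
}
```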
- /// <summary>
- /// This function performs the packed symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A, where A is an n*n symmetric
- /// matrix stored in packed format, x and y are vectors, and alpha is a scalar. n is given by x.Size = y.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other symmetric part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
- /// <param name="AP">array with A stored in packed format.</param>
-
- /// <summary>
- /// This function performs the packed Hermitian rank-2 update A = alpha * x * y^H + conj(alpha) * y * x^H + A, where A is an
- /// n*n Hermitian matrix stored in packed format, x and y are vectors, and alpha is a scalar. n is given by x.Size = y.Size.
- /// </summary>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="x">vector with n elements.</param>
- /// <param name="incx">stride between consecutive elements of x.</param>
- /// <param name="y">vector with n elements.</param>
- /// <param name="incy">stride between consecutive elements of y.</param>
- /// <param name="AP">array with A stored in packed format.</param>
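For the Hermitian packed rank-2 update, note the conj(alpha) on the second term (without it A would not stay Hermitian for complex alpha) and the real diagonal. This sketch uses System.Numerics.Complex and hypothetical names:

```csharp
using System.Numerics;

static class Hpr2Reference
{
    // A = alpha * x * y^H + conj(alpha) * y * x^H + A, upper-mode packed.
    public static void Hpr2Upper(int n, Complex alpha, Complex[] x, Complex[] y,
                                 Complex[] ap)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i <= j; i++)
            {
                int idx = i + j * (j + 1) / 2;   // packed upper index
                ap[idx] += alpha * x[i] * Complex.Conjugate(y[j])
                         + Complex.Conjugate(alpha) * y[i] * Complex.Conjugate(x[j]);
                // The diagonal of a Hermitian matrix is real by definition.
                if (i == j) ap[idx] = new Complex(ap[idx].Real, 0.0);
            }
    }
}
```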
- /// <summary>
- /// This function performs the matrix-matrix multiplication C = alpha * op(A) * op(B) + beta * C, where alpha and beta are
- /// scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) m*k, op(B) k*n and C m*n, respectively.
- /// </summary>
- /// <param name="transa">operation op(A) that is non- or (conj.) transpose.</param>
- /// <param name="transb">operation op(B) that is non- or (conj.) transpose.</param>
- /// <param name="m">number of rows of matrix op(A) and C.</param>
- /// <param name="n">number of columns of matrix op(B) and C.</param>
- /// <param name="k">number of columns of op(A) and rows of op(B).</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * k.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimensions ldb * n.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
- /// <param name="beta">scalar used for multiplication.</param>
- /// <param name="C">array of dimensions ldc * n.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
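A CPU reference for the gemm contract above, with a small column-major usage example; the class and method names are illustrative, not the wrapper's API:

```csharp
using System;

static class GemmReference
{
    // C = alpha * op(A) * op(B) + beta * C; op(A) m*k, op(B) k*n, C m*n.
    public static void Gemm(bool transA, bool transB, int m, int n, int k,
                            double alpha, double[] a, int lda,
                            double[] b, int ldb,
                            double beta, double[] c, int ldc)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i < m; i++)
            {
                double sum = 0.0;
                for (int p = 0; p < k; p++)
                {
                    double aip = transA ? a[p + i * lda] : a[i + p * lda];
                    double bpj = transB ? b[j + p * ldb] : b[p + j * ldb];
                    sum += aip * bpj;
                }
                c[i + j * ldc] = alpha * sum
                               + (beta == 0.0 ? 0.0 : beta * c[i + j * ldc]);
            }
    }

    static void Main()
    {
        // C = A * B for 2x2 column-major matrices.
        var a = new double[] { 1, 2, 3, 4 };   // [[1,3],[2,4]]
        var b = new double[] { 5, 6, 7, 8 };   // [[5,7],[6,8]]
        var c = new double[4];
        Gemm(false, false, 2, 2, 2, 1.0, a, 2, b, 2, 0.0, c, 2);
        Console.WriteLine(string.Join(" ", c)); // prints: 23 34 31 46
    }
}
```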
- + - This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where - alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions - op(A) m*k, op(B) k*n and C m*n, respectively. - operation op(A) that is non- or (conj.) transpose. - operation op(B) that is non- or (conj.) transpose. - number of rows of matrix op(A) and C. - number of columns of matrix op(B) and C. - number of columns of op(A) and rows of op(B). - scalar used for multiplication. - array of dimensions lda * k. - leading dimension of two-dimensional array used to store matrix A. - array of dimensions ldb * n. - leading dimension of two-dimensional array used to store matrix B. - scalar used for multiplication. - array of dimensions ldb * n. - leading dimension of two-dimensional array used to store matrix C. - + - This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where - alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions - op(A) m*k, op(B) k*n and C m*n, respectively. - operation op(A) that is non- or (conj.) transpose. - operation op(B) that is non- or (conj.) transpose. - number of rows of matrix op(A) and C. - number of columns of matrix op(B) and C. - number of columns of op(A) and rows of op(B). - scalar used for multiplication. - array of dimensions lda * k. - leading dimension of two-dimensional array used to store matrix A. - array of dimensions ldb * n. - leading dimension of two-dimensional array used to store matrix B. - scalar used for multiplication. - array of dimensions ldb * n. - leading dimension of two-dimensional array used to store matrix C. - + - This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where - alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions - op(A) m*k, op(B) k*n and C m*n, respectively. - operation op(A) that is non- or (conj.) transpose. - operation op(B) that is non- or (conj.) transpose. - number of rows of matrix op(A) and C. - number of columns of matrix op(B) and C. - number of columns of op(A) and rows of op(B). - scalar used for multiplication. - array of dimensions lda * k. - leading dimension of two-dimensional array used to store matrix A. - enumerant specifying the datatype of matrix A. - array of dimensions ldb * n. - leading dimension of two-dimensional array used to store matrix B. - enumerant specifying the datatype of matrix B. - scalar used for multiplication. - array of dimensions ldb * n. - leading dimension of two-dimensional array used to store matrix C. - enumerant specifying the datatype of matrix C. - + - This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where - alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions - op(A) m*k, op(B) k*n and C m*n, respectively. - operation op(A) that is non- or (conj.) transpose. - operation op(B) that is non- or (conj.) transpose. - number of rows of matrix op(A) and C. - number of columns of matrix op(B) and C. - number of columns of op(A) and rows of op(B). - scalar used for multiplication. - array of dimensions lda * k. - leading dimension of two-dimensional array used to store matrix A. - enumerant specifying the datatype of matrix A. - array of dimensions ldb * n. - leading dimension of two-dimensional array used to store matrix B. 
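The column-major layout and leading-dimension convention these Gemm comments describe is easy to misread from managed code. Below is a minimal CPU reference sketch, plain C# rather than the ManagedCuda API, with all names illustrative, that implements the documented C = alpha * op(A) * op(B) + beta * C semantics:

```csharp
static class GemmReference
{
    // Column-major element (i, j) of a matrix stored with leading dimension ld.
    static double ColMaj(double[] mat, int ld, int i, int j) => mat[i + j * ld];

    // CPU reference for C = alpha * op(A) * op(B) + beta * C, column-major,
    // with op(X) = X or X^T controlled by transA / transB.
    public static void Gemm(bool transA, bool transB, int m, int n, int k,
                            double alpha, double[] A, int lda,
                            double[] B, int ldb,
                            double beta, double[] C, int ldc)
    {
        for (var j = 0; j < n; j++)
        for (var i = 0; i < m; i++)
        {
            var acc = 0.0;
            for (var l = 0; l < k; l++)
            {
                var a = transA ? ColMaj(A, lda, l, i) : ColMaj(A, lda, i, l); // op(A)[i, l]
                var b = transB ? ColMaj(B, ldb, j, l) : ColMaj(B, ldb, l, j); // op(B)[l, j]
                acc += a * b;
            }
            C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
        }
    }
}
```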
- /// <summary>
- /// This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where
- /// alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k.
- /// </summary>
- /// <param name="uplo">indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="trans">operation op(A) that is non- or (conj.) transpose.</param>
- /// <param name="n">number of rows of matrix op(A) and C.</param>
- /// <param name="k">number of columns of matrix op(A).</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * k.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="beta">scalar used for multiplication.</param>
- /// <param name="C">array of dimensions ldc * n.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
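To make the FillMode semantics of Syrk concrete, here is a sketch, again plain C# with illustrative names, that applies C = alpha * A * A^T + beta * C (the non-transposed case, A being n*k) to only the stored triangle, as the comment specifies:

```csharp
static class SyrkReference
{
    // CPU reference for the symmetric rank-k update C = alpha * A * A^T + beta * C,
    // touching only the lower triangle when lower == true, else only the upper one.
    public static void Syrk(int n, int k, double alpha, double[] A, int lda,
                            double beta, double[] C, int ldc, bool lower)
    {
        for (var j = 0; j < n; j++)
        {
            var iStart = lower ? j : 0;     // lower mode: rows j..n-1 of column j
            var iEnd = lower ? n : j + 1;   // upper mode: rows 0..j of column j
            for (var i = iStart; i < iEnd; i++)
            {
                var acc = 0.0;
                for (var l = 0; l < k; l++)
                    acc += A[i + l * lda] * A[j + l * lda]; // (A * A^T)[i, j]
                C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
            }
        }
    }
}
```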
- /// <summary>
- /// This function performs the Hermitian rank-k update C = alpha * Op(A)*Op(A)^H + beta * C where
- /// alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k.
- /// </summary>
- /// <param name="uplo">indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="trans">operation op(A) that is non- or (conj.) transpose.</param>
- /// <param name="n">number of rows of matrix op(A) and C.</param>
- /// <param name="k">number of columns of matrix op(A).</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * k.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="beta">scalar used for multiplication.</param>
- /// <param name="C">array of dimensions ldc * n.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
- /// <summary>
- /// This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where
- /// alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k
- /// and Op(B) n*k, respectively.
- /// </summary>
- /// <param name="uplo">indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="trans">operation op(A) that is non- or (conj.) transpose.</param>
- /// <param name="n">number of rows of matrix op(A), op(B) and C.</param>
- /// <param name="k">number of columns of matrix op(A) and op(B).</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * k.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimensions ldb * k.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
- /// <param name="beta">scalar used for multiplication.</param>
- /// <param name="C">array of dimensions ldc * n.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
- /// <summary>
- /// This function performs the Hermitian rank-2k update C = alpha * (Op(A)*Op(B)^H + Op(B)*Op(A)^H) + beta * C where
- /// alpha and beta are scalars, C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k
- /// and Op(B) n*k, respectively.
- /// </summary>
- /// <param name="uplo">indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="trans">operation op(A) that is non- or (conj.) transpose.</param>
- /// <param name="n">number of rows of matrix op(A), op(B) and C.</param>
- /// <param name="k">number of columns of matrix op(A) and op(B).</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * k.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimensions ldb * k.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
- /// <param name="beta">scalar used for multiplication.</param>
- /// <param name="C">array of dimensions ldc * n.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
- /// <summary>
- /// This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha
- /// and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A
- /// and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively.
- /// </summary>
- /// <param name="uplo">indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.</param>
- /// <param name="trans">operation op(A) that is non- or transpose.</param>
- /// <param name="n">number of rows of matrix op(A), op(B) and C.</param>
- /// <param name="k">number of columns of matrix op(A) and op(B).</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
- /// <param name="beta">scalar used for multiplication, if beta==0, then C does not have to be a valid input.</param>
- /// <param name="C">array of dimensions ldc x n with ldc>=max(1,n).</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
- /// <summary>
- /// This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where
- /// alpha and beta are scalars, C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively.
- /// </summary>
- /// <param name="uplo">indicates if matrix C lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements.</param>
- /// <param name="trans">operation op(A) that is non- or (conj.) transpose.</param>
- /// <param name="n">number of rows of matrix op(A), op(B) and C.</param>
- /// <param name="k">number of columns of matrix op(A) and op(B).</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
- /// <param name="beta">real scalar used for multiplication, if beta==0 then C does not have to be a valid input.</param>
- /// <param name="C">array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
- /// <summary>
- /// This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right,
- /// where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars.
- /// </summary>
- /// <param name="side">indicates if matrix A is on the left or right of B.</param>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="m">number of rows of matrix C and B, with matrix A sized accordingly.</param>
- /// <param name="n">number of columns of matrix C and B, with matrix A sized accordingly.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * m.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimensions ldb * n.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
- /// <param name="beta">scalar used for multiplication.</param>
- /// <param name="C">array of dimensions ldc * n.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
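A sketch of how the "other part is not referenced and is inferred" rule plays out for Symm: only the stored triangle of A is ever read, and the symmetric counterpart is reconstructed on the fly. Plain C# for the left/lower case, with illustrative names; side==Right would compute alpha*B*A + beta*C instead:

```csharp
static class SymmReference
{
    // Element (i, j) of a symmetric matrix of which only the lower
    // triangle is stored; the upper part is mirrored from it.
    static double SymLower(double[] A, int lda, int i, int j)
        => i >= j ? A[i + j * lda] : A[j + i * lda];

    // CPU reference for C = alpha * A * B + beta * C with side == Left,
    // FillMode == Lower; A is m*m, B and C are m*n, column-major.
    public static void SymmLeftLower(int m, int n, double alpha, double[] A, int lda,
                                     double[] B, int ldb, double beta, double[] C, int ldc)
    {
        for (var j = 0; j < n; j++)
        for (var i = 0; i < m; i++)
        {
            var acc = 0.0;
            for (var l = 0; l < m; l++)
                acc += SymLower(A, lda, i, l) * B[l + j * ldb];
            C[i + j * ldc] = alpha * acc + beta * C[i + j * ldc];
        }
    }
}
```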
- /// <summary>
- /// This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right,
- /// where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars.
- /// </summary>
- /// <param name="side">indicates if matrix A is on the left or right of B.</param>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="m">number of rows of matrix C and B, with matrix A sized accordingly.</param>
- /// <param name="n">number of columns of matrix C and B, with matrix A sized accordingly.</param>
- /// <param name="alpha">scalar used for multiplication.</param>
- /// <param name="A">array of dimensions lda * m.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimensions ldb * n.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
- /// <param name="beta">scalar used for multiplication.</param>
- /// <param name="C">array of dimensions ldc * n.</param>
- /// <param name="ldc">leading dimension of two-dimensional array used to store matrix C.</param>
- /// <summary>
- /// This function solves the triangular linear system with multiple right-hand sides Op(A) X = alpha*B if side==SideMode.Left or X Op(A) = alpha*B if side==SideMode.Right,
- /// where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar.
- /// The solution X overwrites the right-hand sides B on exit.
- /// </summary>
- /// <param name="side">indicates if matrix A is on the left or right of X.</param>
- /// <param name="uplo">indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.</param>
- /// <param name="trans">operation op(A) that is non- or (conj.) transpose.</param>
- /// <param name="diag">indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.</param>
- /// <param name="m">number of rows of matrix B, with matrix A sized accordingly.</param>
- /// <param name="n">number of columns of matrix B, with matrix A sized accordingly.</param>
- /// <param name="alpha">scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input.</param>
- /// <param name="A">array of dimensions lda * m.</param>
- /// <param name="lda">leading dimension of two-dimensional array used to store matrix A.</param>
- /// <param name="B">array of dimensions ldb * n.</param>
- /// <param name="ldb">leading dimension of two-dimensional array used to store matrix B.</param>
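The Trsm summary says the solution X overwrites B; for the left/lower/non-transposed case this is ordinary forward substitution, sketched below in plain C# with illustrative names, including the implicit unit diagonal controlled by diag:

```csharp
static class TrsmReference
{
    // CPU reference for op(A) * X = alpha * B with side == Left,
    // FillMode == Lower, op == non-transpose. X overwrites B column by
    // column via forward substitution; A is m*m, B is m*n, column-major.
    public static void TrsmLeftLower(int m, int n, double alpha, double[] A, int lda,
                                     double[] B, int ldb, bool unitDiag)
    {
        for (var j = 0; j < n; j++)       // each right-hand side (column of B)
        for (var i = 0; i < m; i++)       // forward substitution down column j
        {
            var acc = alpha * B[i + j * ldb];
            for (var l = 0; l < i; l++)   // B[l, j] already holds the solution x_l
                acc -= A[i + l * lda] * B[l + j * ldb];
            // With a unit diagonal the divide is skipped and A[i, i] is never read.
            B[i + j * ldb] = unitDiag ? acc : acc / A[i + i * lda];
        }
    }
}
```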
Trmm:
This function performs the triangular matrix-matrix multiplication
    C = alpha op(A) B  if side == SideMode.Left, or
    C = alpha B op(A)  if side == SideMode.Right,
where A is a triangular matrix stored in lower or upper mode, with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar.
Notice that in order to achieve better parallelism, CUBLAS differs from the BLAS API only for this routine: the BLAS API assumes an in-place implementation (with results written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in the CUBLAS API by passing the address of matrix B in place of matrix C. No other overlapping of the input parameters is supported.
- side, uplo, trans, diag, m, n, alpha: same meaning as for Trsm above.
- A: array of dimensions lda * m.
- lda: leading dimension of the two-dimensional array used to store matrix A.
- B: array of dimensions ldb * n.
- ldb: leading dimension of the two-dimensional array used to store matrix B.
- C: array of dimensions ldc * n.
- ldc: leading dimension of the two-dimensional array used to store matrix C.
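A hedged sketch of the out-of-place multiply for the same configuration as the Trsm example (left side, lower storage, no transpose, non-unit diagonal); names and plain-array layout are illustrative. Note the naive loop below is only valid out-of-place: the in-place behaviour described above (passing B as C) is something the CUBLAS kernel handles internally.

    // C = alpha * A * B with A lower triangular; B is left untouched.
    static void TrmmLowerLeft(int m, int n, double alpha,
                              double[] A, int lda, double[] B, int ldb,
                              double[] C, int ldc)
    {
        for (int col = 0; col < n; col++)
        {
            for (int i = 0; i < m; i++)
            {
                double sum = 0.0;
                for (int k = 0; k <= i; k++) // only the stored lower triangle
                {
                    sum += A[i + k * lda] * B[k + col * ldb];
                }
                C[i + col * ldc] = alpha * sum; // out-of-place write
            }
        }
    }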
Geam:
This function performs the matrix-matrix addition/transposition
    C = alpha op(A) + beta op(B),
where alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) m*n, op(B) m*n and C m*n, respectively.
- transa: operation op(A) that is non- or (conj.) transpose.
- transb: operation op(B) that is non- or (conj.) transpose.
- m: number of rows of matrix op(A) and C.
- n: number of columns of matrix op(B) and C.
- alpha: scalar used for multiplication.
- A: array of dimensions lda * n.
- lda: leading dimension of the two-dimensional array used to store matrix A.
- B: array of dimensions ldb * n.
- ldb: leading dimension of the two-dimensional array used to store matrix B.
- beta: scalar used for multiplication.
- C: array of dimensions ldc * n.
- ldc: leading dimension of the two-dimensional array used to store matrix C.
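A reference loop for the real-valued case follows (the conjugate-transpose variant is omitted; names are illustrative). Setting beta = 0 with transA = true turns this into a plain out-of-place transpose, which is a common use of this routine.

    // C = alpha * op(A) + beta * op(B), column-major, real data only.
    static void Geam(bool transA, bool transB, int m, int n,
                     double alpha, double[] A, int lda,
                     double beta, double[] B, int ldb,
                     double[] C, int ldc)
    {
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < m; i++)
            {
                double a = transA ? A[j + i * lda] : A[i + j * lda];
                double b = transB ? B[j + i * ldb] : B[i + j * ldb];
                C[i + j * ldc] = alpha * a + beta * b;
            }
        }
    }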
Dgmm:
This function performs the matrix-matrix multiplication
    C = A diag(X)  if mode == CUBLAS_SIDE_RIGHT, or
    C = diag(X) A  if mode == CUBLAS_SIDE_LEFT,
where A and C are matrices stored in column-major format with dimensions m*n. X is a vector of size n if mode == CUBLAS_SIDE_RIGHT and of size m if mode == CUBLAS_SIDE_LEFT. X is gathered from the one-dimensional array x with stride incx: the absolute value of incx is the stride, and the sign of incx is the direction of the stride. If incx is positive, x is traversed forward from its first element; otherwise it is traversed backward from its last element.
- mode: left multiply if mode == CUBLAS_SIDE_LEFT, or right multiply if mode == CUBLAS_SIDE_RIGHT.
- m: number of rows of matrix A and C.
- n: number of columns of matrix A and C.
- A: array of dimensions lda x n with lda >= max(1,m).
- lda: leading dimension of the two-dimensional array used to store the matrix A.
- x: one-dimensional array of size |incx|*m if mode == CUBLAS_SIDE_LEFT, and |incx|*n if mode == CUBLAS_SIDE_RIGHT.
- incx: stride of one-dimensional array x.
- C: array of dimensions ldc x n with ldc >= max(1,m).
- ldc: leading dimension of the two-dimensional array used to store the matrix C.
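A small sketch of the left-sided case, including the backward gather for negative incx; names and plain-array layout are assumptions for illustration.

    // C = diag(X) * A (mode == CUBLAS_SIDE_LEFT): row i of A is scaled
    // by the i-th gathered element of x. Negative incx walks x backward
    // from its last element, as described above.
    static void DgmmLeft(int m, int n, double[] A, int lda,
                         double[] x, int incx, double[] C, int ldc)
    {
        for (int j = 0; j < n; j++)
        {
            for (int i = 0; i < m; i++)
            {
                int xi = incx >= 0 ? i * incx : (m - 1 - i) * -incx;
                C[i + j * ldc] = x[xi] * A[i + j * lda];
            }
        }
    }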
GemmBatched:
This function performs the matrix-matrix multiplications of an array of matrices:
    C[i] = alpha op(A[i]) op(B[i]) + beta C[i],
where alpha and beta are scalars, and A, B and C are arrays of pointers to matrices stored in column-major format with dimensions op(A[i]) m x k, op(B[i]) k x n and C[i] m x n, respectively.
This function is intended to be used for matrices of small sizes, where the launch overhead is a significant factor. For small sizes, typically smaller than 100x100, it significantly improves performance compared to making calls to the corresponding gemm routine. However, on GPU architectures that support concurrent kernels, it might be advantageous to make multiple calls to gemm in different streams as the matrix sizes increase.
- transa: operation op(A[i]) that is non- or (conj.) transpose.
- transb: operation op(B[i]) that is non- or (conj.) transpose.
- m: number of rows of matrix op(A[i]) and C[i].
- n: number of columns of op(B[i]) and C[i].
- k: number of columns of op(A[i]) and rows of op(B[i]).
- alpha: scalar used for multiplication.
- A: array of device pointers, each of dimensions lda x k with lda >= max(1,m) if transa == CUBLAS_OP_N, and lda x m with lda >= max(1,k) otherwise.
- lda: leading dimension of the two-dimensional array used to store each matrix A[i].
- B: array of device pointers, each of dimensions ldb x n with ldb >= max(1,k) if transb == CUBLAS_OP_N, and ldb x k with ldb >= max(1,n) otherwise.
- ldb: leading dimension of the two-dimensional array used to store each matrix B[i].
- beta: scalar used for multiplication. If beta == 0, C does not have to be a valid input.
- C: array of device pointers, each of dimensions ldc x n with ldc >= max(1,m).
- ldc: leading dimension of the two-dimensional array used to store each matrix C[i].
- batchCount: number of pointers contained in A, B and C.
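A host-side sketch of the batched semantics: the same small GEMM applied to every entry of the batch (no transposes shown). double[][] stands in for the array-of-device-pointers arguments; names are illustrative, not the wrapper's API.

    // C[b] = alpha * A[b] * B[b] + beta * C[b] for each batch entry b.
    static void GemmBatched(int m, int n, int k, double alpha,
                            double[][] A, int lda, double[][] B, int ldb,
                            double beta, double[][] C, int ldc, int batchCount)
    {
        for (int b = 0; b < batchCount; b++)
        {
            for (int j = 0; j < n; j++)
            {
                for (int i = 0; i < m; i++)
                {
                    double sum = 0.0;
                    for (int p = 0; p < k; p++)
                    {
                        sum += A[b][i + p * lda] * B[b][p + j * ldb];
                    }
                    C[b][i + j * ldc] = alpha * sum + beta * C[b][i + j * ldc];
                }
            }
        }
    }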
Gemm3m:
This function performs the complex matrix-matrix multiplication using the Gauss complexity-reduction algorithm, which can lead to an increase in performance of up to 25%:
    C = alpha op(A) op(B) + beta C,
where alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) m x k, op(B) k x n and C m x n, respectively. For matrix A, op(A) = A if transa == CUBLAS_OP_N, A^T if transa == CUBLAS_OP_T, and A^H if transa == CUBLAS_OP_C; op(B) is defined similarly for matrix B.
Note: these two routines are only supported on GPUs with architecture capabilities equal to or greater than 5.0.
- transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc: same meaning as for the gemm routines above.
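The "3M" (Gauss) trick behind this routine, shown on a single complex product: three real multiplications instead of four, at the cost of extra additions. Applied tile-wise to the matrix product, this is roughly where the up-to-25% figure comes from; the helper name is illustrative.

    // (a + bi) * (c + di) with only three real multiplications.
    static (double Re, double Im) Gauss3m(double a, double b, double c, double d)
    {
        double t1 = a * c;
        double t2 = b * d;
        double t3 = (a + b) * (c + d);  // = ac + ad + bc + bd
        return (t1 - t2, t3 - t1 - t2); // (ac - bd, ad + bc)
    }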
GemmStridedBatched:
This function performs the matrix-matrix multiplication of a batch of matrices. The batch is considered to be "uniform", i.e. all instances have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (transa, transb) for their respective A, B and C matrices. The input matrices A, B and output matrix C for each instance of the batch are located at fixed address offsets from their locations in the previous instance. Pointers to the A, B and C matrices of the first instance are passed to the function by the user, along with the address offsets strideA, strideB and strideC that determine the locations of the input and output matrices in future instances.
- transa, transb, m, n, k, alpha, beta, lda, ldb, ldc: same meaning as for GemmBatched above.
- A: pointer to the A matrix corresponding to the first instance of the batch, with dimensions lda x k with lda >= max(1,m) if transa == CUBLAS_OP_N, and lda x m with lda >= max(1,k) otherwise.
- strideA: value of type long long int that gives the address offset between A[i] and A[i+1].
- B: pointer to the B matrix corresponding to the first instance of the batch, with dimensions ldb x n with ldb >= max(1,k) if transb == CUBLAS_OP_N, and ldb x k with ldb >= max(1,n) otherwise.
- strideB: value of type long long int that gives the address offset between B[i] and B[i+1].
- C: pointer to the C matrix corresponding to the first instance of the batch, with dimensions ldc x n with ldc >= max(1,m).
- strideC: value of type long long int that gives the address offset between C[i] and C[i+1].
- batchCount: number of GEMMs to perform in the batch.
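A sketch of the strided indexing: each operand is one flat buffer, and instance i starts i * stride elements from the base pointer (no transposes shown; names and layout are illustrative). For a contiguous non-transposed batch, strideA = lda * k, strideB = ldb * n and strideC = ldc * n.

    static void GemmStridedBatched(int m, int n, int k, double alpha,
                                   double[] A, int lda, long strideA,
                                   double[] B, int ldb, long strideB,
                                   double beta, double[] C, int ldc, long strideC,
                                   int batchCount)
    {
        for (int batch = 0; batch < batchCount; batch++)
        {
            // Base offsets of this instance inside each flat buffer.
            long a0 = batch * strideA, b0 = batch * strideB, c0 = batch * strideC;
            for (int j = 0; j < n; j++)
            {
                for (int i = 0; i < m; i++)
                {
                    double sum = 0.0;
                    for (int p = 0; p < k; p++)
                    {
                        sum += A[a0 + i + p * lda] * B[b0 + p + j * ldb];
                    }
                    C[c0 + i + j * ldc] = alpha * sum + beta * C[c0 + i + j * ldc];
                }
            }
        }
    }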
- where alpha and beta are scalars, and A, B and C are arrays of pointers to matrices stored
- in column-major format with dimensions op(A[i]) m x k, op(B[i]) k x n and C[i] m x n,
- respectively.
- This function is intended to be used for matrices of small sizes where the launch
- overhead is a significant factor. For small sizes, typically smaller than 100x100,
- this function significantly improves performance compared to making calls to its
- corresponding cublas<t>gemm routine. However, on GPU architectures that support
- concurrent kernels, it might be advantageous to make multiple calls to cublas<t>gemm
- in different streams as the matrix sizes increase.
- operation op(A[i]) that is non- or (conj.) transpose.
- operation op(B[i]) that is non- or (conj.) transpose.
- number of rows of matrix op(A[i]) and C[i].
- number of columns of op(B[i]) and C[i].
- number of columns of op(A[i]) and rows of op(B[i]).
- scalar used for multiplication.
- array of device pointers, with each array/device pointer of dim. lda x k with lda>=max(1,m) if
- transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.
- leading dimension of two-dimensional array used to store each matrix A[i].
- array of device pointers, with each array of dim. ldb x n with ldb>=max(1,k) if
- transa==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.
- leading dimension of two-dimensional array used to store each matrix B[i].
- scalar used for multiplication. If beta == 0, C does not have to be a valid input.
- array of device pointers. It has dimensions ldc x n with ldc>=max(1,m).
- leading dimension of two-dimensional array used to store each matrix C[i].
- number of pointers contained in A, B and C.
- This function performs the matrix-matrix multiplications of an array of matrices.
- pointer to <t> matrix, A, corresponds to the first instance of the batch, of dim. lda x k with lda>=max(1,m) if
- transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.
- value of type long long int that gives the address offset between A[i] and A[i+1].
- value of type long long int that gives the address offset between B[i] and B[i+1].
- value of type long long int that gives the address offset between C[i] and C[i+1].
- pointer to <t> matrix, B, corresponds to the first instance of the batch, of dim. ldb x n with ldb>=max(1,k) if
- transa==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.
- pointer to <t> matrix, C, corresponds to the first instance of the batch. It has dimensions ldc x n with ldc>=max(1,m).
+ A CudaBlasException is thrown if any wrapped call to the CUBLAS library does not return successfully.
- This function performs the complex matrix-matrix multiplication, using the Gauss complexity reduction algorithm. This can lead to an increase in performance of up to 25%.
- C = alpha * op(A) * op(B) + beta * C
- where alpha and beta are scalars, and A, B and C are matrices stored in column-major format
- with dimensions op(A) m x k, op(B) k x n and C m x n, respectively. Also, for matrix A:
- op(A) = A if transa == CUBLAS_OP_N; A^T if transa == CUBLAS_OP_T; A^H if transa == CUBLAS_OP_C
- and op(B) is defined similarly for matrix B.
- Note: these two routines are only supported on GPUs with architecture capabilities equal to or greater than 5.0.
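As context for the removed comment above, a minimal CPU reference for the batched-GEMM semantics it describes, C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i] over column-major storage. The names and jagged-array layout are illustrative, not the wrapper's API:

```csharp
// CPU sketch of batched GEMM: one independent GEMM per batch instance.
static void GemmBatchedReference(
    bool transA, bool transB, int m, int n, int k,
    double alpha, double[][] A, int lda, double[][] B, int ldb,
    double beta, double[][] C, int ldc)
{
    for (var i = 0; i < C.Length; i++)            // one GEMM per batch instance
        for (var col = 0; col < n; col++)
            for (var row = 0; row < m; row++)
            {
                var acc = 0.0;
                for (var p = 0; p < k; p++)
                {
                    // Column-major: element (r, c) lives at r + c * ld.
                    var a = transA ? A[i][p + row * lda] : A[i][row + p * lda];
                    var b = transB ? B[i][col + p * ldb] : B[i][p + col * ldb];
                    acc += a * b;
                }
                // The real routine does not read C when beta == 0; this sketch does,
                // so pass an initialized C.
                C[i][row + col * ldc] = alpha * acc + beta * C[i][row + col * ldc];
            }
}
```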
- This function performs the matrix-matrix multiplication of a batch of matrices. The batch is considered to be "uniform",
- i.e. all instances have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (transa, transb)
- for their respective A, B and C matrices. Input matrices A, B and output matrix C for each instance of the batch are located
- at fixed address offsets from their locations in the previous instance. Pointers to A, B and C matrices for the first
- instance are passed to the function by the user along with the address offsets strideA, strideB and strideC that determine
- the locations of input and output matrices in future instances.
- operation op(A[i]) that is non- or (conj.) transpose.
- operation op(B[i]) that is non- or (conj.) transpose.
- number of rows of matrix op(A[i]) and C[i].
- number of columns of op(B[i]) and C[i].
- number of columns of op(A[i]) and rows of op(B[i]).
- scalar used for multiplication.
- pointer to the A matrix corresponding to the first instance of the batch, with dimensions lda x k with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.
- leading dimension of two-dimensional array used to store each matrix A[i].
- Value of type long long int that gives the address offset between A[i] and A[i+1].
- pointer to the B matrix corresponding to the first instance of the batch, with dimensions ldb x n with ldb>=max(1,k) if transa==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.
- leading dimension of two-dimensional array used to store each matrix B[i].
- Value of type long long int that gives the address offset between B[i] and B[i+1].
- scalar used for multiplication. If beta == 0, C does not have to be a valid input.
- pointer to the C matrix corresponding to the first instance of the batch, with dimensions ldc x n with ldc>=max(1,m).
- leading dimension of two-dimensional array used to store each matrix C[i].
- Value of type long long int that gives the address offset between C[i] and C[i+1].
- number of GEMMs to perform in the batch.
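A small illustration of the strided addressing this doc text describes; the helper name is hypothetical and strides are counted in elements:

```csharp
// Where element (row, col) of the i-th instance lives in a "uniform" strided
// batch, column-major. A single launch can reach every instance this way,
// without building an array of device pointers.
static long StridedElementIndex(int i, long stride, int row, int col, int ld)
    => i * stride + row + (long)col * ld;
// A[i] begins at A + i * strideA, B[i] at B + i * strideB, C[i] at C + i * strideC.
```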
- This function performs the LU factorization of an array of n x n matrices.
- This function is intended to be used for matrices of small sizes where the launch
- overhead is a significant factor. The current implementation limits the dimension n to 32.
- number of rows and columns of A[i].
- array of device pointers with each array/device pointer of dim. n x n with lda>=max(1,n).
- leading dimension of two-dimensional array used to store each matrix A[i].
- array of size n x batchSize that contains the permutation vector
- of each factorization of A[i] stored in a linear fashion.
- If info=0, the execution is successful.
- If info = -i, the i-th parameter had an illegal value.
- If info = i, a(i,i) is 0. The factorization has been completed, but U is exactly singular.
- number of pointers contained in A.
+ Wrapper for CUBLAS
+ Creates a new cudaBlas handler
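For reference, a sketch of what one instance of the batched LU routine described above computes, using the same 1-based pivot vector and info conventions; this is illustrative CPU code, not the GPU implementation:

```csharp
// In-place LU with partial pivoting for one n x n column-major matrix.
static int LuFactor(double[] a, int n, int lda, int[] pivot)
{
    var info = 0;
    for (var j = 0; j < n; j++)
    {
        var p = j;                                // find the pivot row for column j
        for (var r = j + 1; r < n; r++)
            if (Math.Abs(a[r + j * lda]) > Math.Abs(a[p + j * lda])) p = r;
        pivot[j] = p + 1;                         // 1-based, LAPACK style
        if (a[p + j * lda] == 0.0)                // U(j,j) is exactly zero
        {
            if (info == 0) info = j + 1;          // report first singular diagonal
            continue;
        }
        if (p != j)                               // swap rows p and j of the whole matrix
            for (var c = 0; c < n; c++)
                (a[j + c * lda], a[p + c * lda]) = (a[p + c * lda], a[j + c * lda]);
        for (var r = j + 1; r < n; r++)           // eliminate below the diagonal
        {
            var l = (a[r + j * lda] /= a[j + j * lda]);
            for (var c = j + 1; c < n; c++)
                a[r + c * lda] -= l * a[j + c * lda];
        }
    }
    return info;                                  // 0 on success, j+1 if U(j,j) == 0
}
```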
- Aarray and Carray are arrays of pointers to matrices stored in column-major format
- with dimensions n*n and leading dimensions lda and ldc respectively.
- This function performs the inversion of matrices A[i] for i = 0, ..., batchSize-1.
- Prior to calling GetriBatched, the matrix A[i] must be factorized first using
- the routine GetrfBatched. After the call of GetrfBatched, the matrix
- pointed to by Aarray[i] will contain the LU factors of the matrix A[i] and the vector
- pointed to by (PivotArray+i) will contain the pivoting sequence.
- Following the LU factorization, GetriBatched uses forward and backward
- triangular solvers to complete inversion of matrices A[i] for i = 0, ..., batchSize-1. The
- inversion is out-of-place, so the memory space of Carray[i] cannot overlap the memory space of
- Aarray[i].
- number of rows and columns of Aarray[i].
- array of pointers to array, with each array of dimension n*n with lda>=max(1,n).
- leading dimension of two-dimensional array used to store each matrix Aarray[i].
- array of size n*batchSize that contains the pivoting sequence of each factorization of Aarray[i] stored in a linear fashion.
- array of pointers to array, with each array of dimension n*n with ldc>=max(1,n).
- leading dimension of two-dimensional array used to store each matrix Carray[i].
- array of size batchSize where info(=infoArray[i]) contains the information of the inversion of A[i].
- If info=0, the execution is successful.
- If info = k, U(k,k) is 0. U is exactly singular and the inversion failed.
- number of pointers contained in A.
+ Creates a new cudaBlas handler
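And a sketch of the inversion step the GetriBatched text describes, assuming the LU factors and pivots from the sketch above; names are illustrative:

```csharp
// Out-of-place inversion from LU factors: permute each unit vector with the
// recorded row swaps, then one forward (unit-lower) and one backward (upper)
// triangular solve per column of the inverse.
static void InvertFromLu(double[] lu, int n, int lda, int[] pivot, double[] inv, int ldc)
{
    for (var col = 0; col < n; col++)
    {
        var x = new double[n];
        x[col] = 1.0;
        for (var j = 0; j < n; j++)               // apply P (1-based pivots, in order)
        {
            var p = pivot[j] - 1;
            if (p != j) (x[j], x[p]) = (x[p], x[j]);
        }
        for (var r = 1; r < n; r++)               // forward solve; L has a unit diagonal
            for (var c = 0; c < r; c++)
                x[r] -= lu[r + c * lda] * x[c];
        for (var r = n - 1; r >= 0; r--)          // backward solve with U
        {
            for (var c = r + 1; c < n; c++)
                x[r] -= lu[r + c * lda] * x[c];
            x[r] /= lu[r + r * lda];
        }
        for (var r = 0; r < n; r++) inv[r + col * ldc] = x[r];  // out-of-place, as required
    }
}
```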
- This function solves an array of triangular linear systems with multiple right-hand sides.
- The solution overwrites the right-hand sides on exit.
- No test for singularity or near-singularity is included in this function.
- This function is intended to be used for matrices of small sizes where the launch
- overhead is a significant factor. The current implementation limits the dimensions m and n to 32.
- indicates if matrix A[i] is on the left or right of X[i].
- indicates if the lower or upper part of matrix A[i] is stored; the
- other part is not referenced and is inferred from the stored elements.
- operation op(A[i]) that is non- or (conj.) transpose.
- indicates if the elements on the main diagonal of matrix
- A[i] are unity and should not be accessed.
- number of rows of matrix B[i], with matrix A[i] sized accordingly.
- number of columns of matrix B[i], with matrix A[i] sized accordingly.
- scalar used for multiplication; if alpha==0 then A[i] is not
- referenced and B[i] does not have to be a valid input.
- array of device pointers, with each array/device pointer
- of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with
- lda>=max(1,n) otherwise.
- leading dimension of two-dimensional array used to store matrix A[i].
- array of device pointers, with each array/device pointer of dim.
- ldb x n with ldb>=max(1,m).
- leading dimension of two-dimensional array used to store matrix B[i].
+ Creates a new cudaBlas handler
+ For dispose
+ Dispose
+ For IDisposable
+ Returns the wrapped cublas handle
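A reference loop for one instance of the batched triangular solve described above, specialised to the left / lower / non-transpose / non-unit-diagonal case; a sketch only, not the GPU kernel:

```csharp
// Solves op(A) * X = alpha * B and overwrites B with X, column-major.
static void TrsmLowerLeft(int m, int n, double alpha, double[] a, int lda, double[] b, int ldb)
{
    for (var col = 0; col < n; col++)
    {
        for (var r = 0; r < m; r++) b[r + col * ldb] *= alpha;
        for (var r = 0; r < m; r++)               // forward substitution per column of B
        {
            b[r + col * ldb] /= a[r + r * lda];
            for (var r2 = r + 1; r2 < m; r2++)
                b[r2 + col * ldb] -= a[r2 + r * lda] * b[r + col * ldb];
        }
    }
}
```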
- This function performs the conversion from the triangular packed format to the
- triangular format.
- If uplo == CUBLAS_FILL_MODE_LOWER then the elements of AP are copied into the
- lower triangular part of the triangular matrix A and the upper part of A is left untouched.
- If uplo == CUBLAS_FILL_MODE_UPPER then the elements of AP are copied into the
- upper triangular part of the triangular matrix A and the lower part of A is left untouched.
- indicates if matrix AP contains the lower or upper part of matrix A.
- number of rows and columns of matrix A.
- array with A stored in packed format.
- array of dimensions lda x n, with lda>=max(1,n). The
- opposite side of A is left untouched.
- leading dimension of two-dimensional array used to store matrix A.
- This function performs the conversion from the triangular format to the triangular
- packed format.
- If uplo == CUBLAS_FILL_MODE_LOWER then the lower triangular part of the triangular
- matrix A is copied into the array AP. If uplo == CUBLAS_FILL_MODE_UPPER then
- the upper triangular part of the triangular matrix A is copied into the array AP.
- indicates which part of matrix A, lower or upper, is referenced.
- number of rows and columns of matrix A.
- array of dimensions lda x n, with lda>=max(1,n).
- leading dimension of two-dimensional array used to store matrix A.
- array with A stored in packed format.
+ This function copies the vector x into the vector y.
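The packed format referred to here stores one triangle column by column with no gaps; a small sketch of the index arithmetic for the lower-fill case, with hypothetical helper names:

```csharp
// Index of element (row, col) inside a lower-packed array, 0-based column-major.
static int PackedLowerIndex(int n, int row, int col)   // requires row >= col
    => row + ((2 * n - col - 1) * col) / 2;

// Tpttr direction: copy AP into the lower triangle of A; the strictly
// upper part of A is left untouched, as the doc text says.
static void UnpackLower(int n, double[] ap, double[] a, int lda)
{
    for (var col = 0; col < n; col++)
        for (var row = col; row < n; row++)
            a[row + col * lda] = ap[PackedLowerIndex(n, row, col)];
}
```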
- This function performs the QR factorization of each Aarray[i] for i =
- 0, ..., batchSize-1 using Householder reflections. Each matrix Q[i] is represented
- as a product of elementary reflectors and is stored in the lower part of each Aarray[i].
- This function is intended to be used for matrices of small sizes where the launch
- overhead is a significant factor.
- cublas<t>geqrfBatched supports arbitrary dimension.
- cublas<t>geqrfBatched only supports compute capability 2.0 or above.
- number of rows of Aarray[i].
- number of columns of Aarray[i].
- array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches.
- leading dimension of two-dimensional array used to store each matrix Aarray[i].
- array of pointers to device vector, with each vector of dim. max(1,min(m,n)).
- 0, if the parameters passed to the function are valid; <0, if the parameter in position -value is invalid.
- This function finds the least squares solution of a batch of overdetermined systems.
- On exit, each Aarray[i] is overwritten with its QR factorization and each Carray[i] is overwritten with the least squares solution.
- GelsBatched supports only the non-transpose operation and only solves overdetermined
- systems (m >= n).
- GelsBatched only supports compute capability 2.0 or above.
- This function is intended to be used for matrices of small sizes where the launch
- overhead is a significant factor.
- operation op(Aarray[i]) that is non- or (conj.) transpose. Only the non-transpose operation is currently supported.
- number of rows of Aarray[i].
- number of columns of each Aarray[i] and rows of each Carray[i].
- number of columns of each Carray[i].
+ This function copies the vector x into the vector y.
+ This function interchanges the elements of vector x and y.
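For orientation on the QR factorization and least-squares comments removed above, a compact Householder QR over one column-major instance, matching the layout the geqrf-style text describes (R in the upper triangle, reflector vectors below the diagonal, scalar factors in tau); a reference sketch under those assumptions:

```csharp
static void HouseholderQr(double[] a, int m, int n, int lda, double[] tau)
{
    for (var j = 0; j < n; j++)
    {
        var norm = 0.0;
        for (var r = j; r < m; r++) norm += a[r + j * lda] * a[r + j * lda];
        norm = Math.Sqrt(norm);
        if (norm == 0.0) { tau[j] = 0.0; continue; }

        var alpha = a[j + j * lda];
        var beta = alpha >= 0 ? -norm : norm;     // sign chosen to avoid cancellation
        tau[j] = (beta - alpha) / beta;
        var scale = 1.0 / (alpha - beta);
        for (var r = j + 1; r < m; r++) a[r + j * lda] *= scale;  // v(j) = 1 is implicit
        a[j + j * lda] = beta;                    // diagonal entry of R

        for (var c = j + 1; c < n; c++)           // apply (I - tau * v * v^T) to the rest
        {
            var dot = a[j + c * lda];
            for (var r = j + 1; r < m; r++) dot += a[r + j * lda] * a[r + c * lda];
            dot *= tau[j];
            a[j + c * lda] -= dot;
            for (var r = j + 1; r < m; r++) a[r + c * lda] -= dot * a[r + j * lda];
        }
    }
}
```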
+ This function computes the Euclidean norm of the vector x.
+ This function computes the dot product of vectors x and y.
+ This function computes the dot product of vectors x and y.
+ Notice that the conjugate of the element of vector x should be used.
+ This function scales the vector x by the scalar and overwrites it with the result.
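Reference forms of the norm and dot-product summaries above, for stride-1 vectors; the class and method names are illustrative:

```csharp
using System;
using System.Linq;
using System.Numerics;

static class Level1Sketch
{
    public static double Nrm2(double[] x) => Math.Sqrt(x.Sum(v => v * v));

    public static double Dot(double[] x, double[] y) => x.Zip(y, (a, b) => a * b).Sum();

    // The conjugated variant noted above: conjugate each element of x first.
    public static Complex Dotc(Complex[] x, Complex[] y) =>
        x.Zip(y, (a, b) => Complex.Conjugate(a) * b)
         .Aggregate(Complex.Zero, (sum, t) => sum + t);
}
```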
+ This function multiplies the vector x by the scalar and adds it to the vector y, overwriting
+ the latter vector with the result.
+ This function finds the (smallest) index of the element of the minimum magnitude.
+ First index starts at 1 (Fortran notation)
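The scal and axpy summaries above amount to the following stride-1 loops (illustrative):

```csharp
static void Scal(double alpha, double[] x)               // x = alpha * x
{
    for (var i = 0; i < x.Length; i++) x[i] *= alpha;
}

static void Axpy(double alpha, double[] x, double[] y)   // y = alpha * x + y
{
    for (var i = 0; i < y.Length; i++) y[i] += alpha * x[i];
}
```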
+ This function finds the (smallest) index of the element of the maximum magnitude.
+ First index starts at 1 (Fortran notation)
+ This function computes the sum of the absolute values of the elements of vector x.
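A sketch of the index reductions summarised above; note the 1-based (Fortran) result index called out in the doc text:

```csharp
using System;
using System.Linq;

static class ReductionSketch
{
    public static int Iamin(double[] x)
    {
        var best = 0;
        for (var i = 1; i < x.Length; i++)        // strict '<' keeps the smallest index on ties
            if (Math.Abs(x[i]) < Math.Abs(x[best])) best = i;
        return best + 1;                          // 1-based, per the Fortran notation
    }

    public static double Asum(double[] x) => x.Sum(Math.Abs);
}
```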
+ This function applies the Givens rotation matrix G = |c s; -s c| to vectors x and y.
+
+ Cosine component
+ Sine component
+
+ This function constructs the Givens rotation matrix G = |c s; -s c| that zeros out the second entry of a 2x1 vector (a; b)^T.
+
+ Cosine component
+ Sine component
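To make the rot/rotg pair concrete, the sketch below first constructs (c, s) so that G annihilates b, then applies G pairwise to x and y (plain C#; returning the identity rotation for the degenerate a = b = 0 case is an assumption following the common BLAS convention):

```csharp
using System;

// rotg: choose c, s with c^2 + s^2 = 1 so that |c s; -s c| * (a; b)^T = (r; 0)^T.
static (double c, double s, double r) RotG(double a, double b)
{
    double r = Math.Sqrt(a * a + b * b);
    if (r == 0.0) return (1.0, 0.0, 0.0); // nothing to rotate: identity
    return (a / r, b / r, r);
}

// rot: apply G = |c s; -s c| to each pair (x[i], y[i]) in place.
static void Rot(double[] x, double[] y, double c, double s)
{
    for (int i = 0; i < x.Length; i++)
    {
        double xi = c * x[i] + s * y[i];
        double yi = -s * x[i] + c * y[i];
        x[i] = xi;
        y[i] = yi;
    }
}
```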
+ This function applies the modified Givens transformation H = |h11 h12; h21 h22| to vectors x and y.
+ The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively.
+ The flag = param[0] defines the following predefined values for the matrix H entries:
+ flag=-1.0: H = |h11 h12; h21 h22|
+ flag= 0.0: H = |1.0 h12; h21 1.0|
+ flag= 1.0: H = |h11 1.0; -1.0 h22|
+ flag=-2.0: H = |1.0 0.0; 0.0 1.0|
+ Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param.
+
+ This function constructs the modified Givens transformation H = |h11 h12; h21 h22| that zeros out the second entry of a 2x1 vector
+ [sqrt(d1)*x1; sqrt(d2)*y1].
+ The elements h11, h21, h12 and h22 of 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively.
+ The flag = param[0] defines the following predefined values for the matrix H entries:
+ flag=-1.0: H = |h11 h12; h21 h22|
+ flag= 0.0: H = |1.0 h12; h21 1.0|
+ flag= 1.0: H = |h11 1.0; -1.0 h22|
+ flag=-2.0: H = |1.0 0.0; 0.0 1.0|
+ Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param.
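The flag encoding in param[0] is easy to misread; the following plain C# sketch decodes it exactly as the table above describes, filling in the -1.0/0.0/1.0 entries that are implied rather than stored:

```csharp
// rotm: apply the modified Givens transformation H to (x[i]; y[i]) pairs.
static void RotM(double[] x, double[] y, double[] param)
{
    double flag = param[0];
    double h11 = 1.0, h12 = 0.0, h21 = 0.0, h22 = 1.0; // flag=-2.0: identity
    if (flag == -1.0) { h11 = param[1]; h21 = param[2]; h12 = param[3]; h22 = param[4]; }
    else if (flag == 0.0) { h21 = param[2]; h12 = param[3]; }  // unit diagonal
    else if (flag == 1.0) { h11 = param[1]; h22 = param[4]; h12 = 1.0; h21 = -1.0; }
    for (int i = 0; i < x.Length; i++)
    {
        double xi = h11 * x[i] + h12 * y[i];
        double yi = h21 * x[i] + h22 * y[i];
        x[i] = xi;
        y[i] = yi;
    }
}
```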
+ This function performs the triangular matrix-vector multiplication x = Op(A) x where A is a triangular matrix stored in
+ lower or upper mode with or without the main diagonal, and x is a vector. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+
+ This function performs the triangular banded matrix-vector multiplication x = Op(A) x where A is a triangular banded matrix, and x is a vector. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ number of sub- and super-diagonals of matrix A.
+ array of dimensions lda * n, with lda >= k+1.
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+
+ This function performs the triangular packed matrix-vector multiplication x = Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with the matrix A stored in packed format, holding n*(n+1)/2 elements.
+ vector with n elements.
+ stride between consecutive elements of x.
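As a concrete reading of these trmv/tbmv/tpmv conventions, here is a plain C# sketch of one of the variants: full (non-banded, non-packed) storage, upper mode, no transpose. The column-major rule A(i,j) -> A[i + j * lda] is the part worth internalizing:

```csharp
// trmv (upper, non-transposed): x = A * x, reading only the stored upper triangle.
static void TrmvUpper(int n, double[] A, int lda, double[] x, bool unitDiag)
{
    for (int i = 0; i < n; i++)
    {
        // Row i of an upper-triangular A only touches x[j] for j >= i,
        // so overwriting x[i] here cannot corrupt later rows.
        double sum = unitDiag ? x[i] : A[i + i * lda] * x[i];
        for (int j = i + 1; j < n; j++)
            sum += A[i + j * lda] * x[j];
        x[i] = sum;
    }
}
```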
+ This function solves the triangular linear system with a single right-hand side Op(A) x = b where A is a triangular matrix stored in lower or
+ upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand side b on exit. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+
+ This function solves the packed triangular linear system with a single right-hand side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors.
+ The solution x overwrites the right-hand side b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with the matrix A stored in packed format, holding n*(n+1)/2 elements.
+ vector with n elements.
+ stride between consecutive elements of x.
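The trsv/tpsv solvers are substitution loops; a plain C# sketch of backward substitution for the upper, non-transposed case shows why no singularity test is performed: a zero diagonal simply divides through (producing infinity rather than an error):

```csharp
// trsv (upper, non-transposed): solve A * x = b in place; b arrives in x.
static void TrsvUpper(int n, double[] A, int lda, double[] x, bool unitDiag)
{
    for (int i = n - 1; i >= 0; i--)       // last unknown first
    {
        double sum = x[i];
        for (int j = i + 1; j < n; j++)
            sum -= A[i + j * lda] * x[j];  // subtract already-solved unknowns
        x[i] = unitDiag ? sum : sum / A[i + i * lda]; // no singularity check
    }
}
```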
+ This function solves the triangular banded linear system with a single right-hand side Op(A) x = b where A is a triangular banded matrix, and x and b are vectors.
+ The solution x overwrites the right-hand side b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ number of sub- and super-diagonals of matrix A.
+ array of dimensions lda * n, with lda >= k+1.
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+
+ This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is an m*n matrix stored in column-major format,
+ x and y are vectors, and alpha and beta are scalars.
+
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix A.
+ number of columns of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,m).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with m elements.
+ stride between consecutive elements of y.
+
+ This function performs the banded matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is an m*n banded matrix stored in column-major format,
+ x and y are vectors, and alpha and beta are scalars.
+
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix A.
+ number of columns of matrix A.
+ number of subdiagonals of matrix A.
+ number of superdiagonals of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= kl+ku+1.
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with m elements.
+ stride between consecutive elements of y.
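A plain C# reference for the gemv contract (non-transposed case), including the beta == 0 rule spelled out in the parameter descriptions, under which y is written without ever being read:

```csharp
// gemv (non-transposed): y = alpha * A * x + beta * y, A column-major (m rows, n cols).
static void Gemv(int m, int n, double alpha, double[] A, int lda,
                 double[] x, double beta, double[] y)
{
    for (int i = 0; i < m; i++)
    {
        double sum = 0.0;
        for (int j = 0; j < n; j++)
            sum += A[i + j * lda] * x[j];  // A(i,j) at A[i + j * lda]
        // beta == 0: y need not hold valid input, it is purely an output here.
        y[i] = beta == 0.0 ? alpha * sum : alpha * sum + beta * y[i];
    }
}
```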
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is an n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
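For symv only one triangle is stored and the other is implied; the sketch below (plain C#, upper mode) shows the mirrored read A(i,j) = A(j,i) that the "not referenced, inferred from the stored elements" wording describes:

```csharp
// symv (upper): y = alpha * A * x + beta * y with only the upper triangle stored.
static void SymvUpper(int n, double alpha, double[] A, int lda,
                      double[] x, double beta, double[] y)
{
    for (int i = 0; i < n; i++)
    {
        double sum = 0.0;
        for (int j = 0; j < n; j++)
        {
            // Always read the stored (upper) part; mirror for the lower half.
            double aij = i <= j ? A[i + j * lda] : A[j + i * lda];
            sum += aij * x[j];
        }
        y[i] = beta == 0.0 ? alpha * sum : alpha * sum + beta * y[i];
    }
}
```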
+ This function performs the Hermitian matrix-vector multiplication y = alpha * A * x + beta * y where A is an n*n Hermitian matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+
+ This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is an n*n symmetric matrix with k subdiagonals and superdiagonals,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ number of sub- and super-diagonals of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= k+1.
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+
+ This function performs the Hermitian banded matrix-vector multiplication y = alpha * A * x + beta * y where A is an n*n Hermitian matrix with k subdiagonals and superdiagonals,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ number of sub- and super-diagonals of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= k+1.
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
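The banded layouts (sbmv/hbmv here, gbmv above) keep only the diagonals. Assuming the standard BLAS band convention for upper storage, where A(i,j) sits in row k + i - j of column j, an element accessor looks like this (plain C# sketch):

```csharp
// Read A(i,j) from upper-mode symmetric band storage with k super-diagonals.
static double SymBandGet(double[] A, int lda, int k, int i, int j)
{
    if (i > j) (i, j) = (j, i);            // symmetry: map into the stored triangle
    if (j - i > k) return 0.0;             // outside the band: implicitly zero
    return A[(k + i - j) + j * lda];       // lda must be at least k + 1
}
```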
+ This function performs the symmetric packed matrix-vector multiplication y = alpha * A * x + beta * y where A is an n*n symmetric matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with the matrix A stored in packed format, holding n*(n+1)/2 elements.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+
+ This function performs the Hermitian packed matrix-vector multiplication y = alpha * A * x + beta * y where A is an n*n Hermitian matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with the matrix A stored in packed format, holding n*(n+1)/2 elements.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
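Packed storage (spmv/hpmv, and tpmv/tpsv earlier) drops lda altogether and lays the stored triangle out column after column in n*(n+1)/2 slots. Assuming the standard upper-packed convention, the index map is:

```csharp
// Read A(i,j) from upper-packed storage: column j holds its j+1 stored
// elements consecutively, so A(i,j) with i <= j is at AP[i + j*(j+1)/2].
static double UpperPackedGet(double[] AP, int i, int j)
{
    if (i > j) (i, j) = (j, i);            // symmetric read of the unstored half
    return AP[i + j * (j + 1) / 2];
}
```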
+
+
+
+ This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
+
+ scalar used for multiplication.
+ vector with m elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,m).
+ leading dimension of two-dimensional array used to store matrix A.
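As a concrete reading of the column-major and stride parameters above, here is a minimal CPU sketch of the rank-1 update A = alpha * x * y^T + A; GerSketch is an illustrative name, not the library's API, and incx/incy are assumed positive:

internal static class GerSketch
{
    // A is m*n, column-major: A(i,j) = a[i + j * lda], with lda >= max(1, m).
    public static void Ger(int m, int n, double alpha,
                           double[] x, int incx,
                           double[] y, int incy,
                           double[] a, int lda)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i < m; i++)
                a[i + j * lda] += alpha * x[i * incx] * y[j * incy];
    }
}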
+
+
+
+ This function performs the rank-1 update A = alpha * x * y^H + A where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
+
+ scalar used for multiplication.
+ vector with m elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,m).
+ leading dimension of two-dimensional array used to store matrix A.
+
+
+
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is an n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+
+ indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+
+
+
+ This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is an n*n Hermitian matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+
+ indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+
+
+
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is an n*n symmetric matrix stored in packed format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+
+ indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array with A stored in packed format.
+
+
+
+ This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is an n*n Hermitian matrix stored in packed format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+
+ indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array with A stored in packed format.
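The lower/upper-mode wording above is clearest in code: only one triangle of A is ever read or written. A minimal CPU sketch of the symmetric rank-1 update in lower mode (SyrSketch is an illustrative name, not the library's API):

internal static class SyrSketch
{
    // A = alpha * x * x^T + A for an n*n symmetric matrix, lower mode:
    // only the lower triangle of A is referenced or updated, matching
    // the "other part is not referenced" wording above.
    public static void SyrLower(int n, double alpha,
                                double[] x, int incx,
                                double[] a, int lda)
    {
        for (int j = 0; j < n; j++)
            for (int i = j; i < n; i++)
                a[i + j * lda] += alpha * x[i * incx] * x[j * incx];
    }
}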
+
+
+
+ This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. n is given by x.Size = y.Size.
+
+ indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+
+
+
+ This function performs the packed symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in packed format,
+ x and y are vectors, and alpha is a scalar. n is given by x.Size = y.Size.
+
+ indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array with A stored in packed format.
+
+
+
+ This function performs the packed Hermitian rank-2 update A = alpha * (x * y^H + y * x^H) + A where A is an n*n Hermitian matrix stored in packed format,
+ x and y are vectors, and alpha is a scalar. n is given by x.Size.
+
+ indicates whether the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array with A stored in packed format.
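The corrected rank-2 formula A = alpha * (x * y^T + y * x^T) + A is symmetric by construction, which is why storing one triangle suffices. A minimal lower-mode CPU sketch (Syr2Sketch is an illustrative name, not the library's API):

internal static class Syr2Sketch
{
    // A = alpha * (x * y^T + y * x^T) + A, lower triangle of A only.
    public static void Syr2Lower(int n, double alpha,
                                 double[] x, int incx,
                                 double[] y, int incy,
                                 double[] a, int lda)
    {
        for (int j = 0; j < n; j++)
            for (int i = j; i < n; i++)
                a[i + j * lda] += alpha * (x[i * incx] * y[j * incy]
                                         + y[i * incy] * x[j * incx]);
    }
}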
+
+
+
+ This function performs the matrix-matrix multiplication C = alpha * op(A) * op(B) + beta * C where
+ alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions
+ op(A) m*k, op(B) k*n and C m*n, respectively.
+
+ operation op(A) that is non- or (conj.) transpose.
+ operation op(B) that is non- or (conj.) transpose.
+ number of rows of matrix op(A) and C.
+ number of columns of matrix op(B) and C.
+ number of columns of op(A) and rows of op(B).
+ scalar used for multiplication.
+ array of dimensions lda * k.
+ leading dimension of two-dimensional array used to store matrix A.
+ array of dimensions ldb * n.
+ leading dimension of two-dimensional array used to store matrix B.
+ scalar used for multiplication.
+ array of dimensions ldc * n.
+ leading dimension of two-dimensional array used to store matrix C.
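The m/n/k and leading-dimension conventions above map directly onto a triple loop. A minimal CPU sketch with op(A) = A and op(B) = B (the transpose variants are omitted for brevity; GemmSketch is an illustrative name, not the library's API):

internal static class GemmSketch
{
    // C = alpha * A * B + beta * C, all matrices column-major:
    // A(i,p) = a[i + p * lda], B(p,j) = b[p + j * ldb], C(i,j) = c[i + j * ldc].
    // A real implementation special-cases beta == 0 so that C need not
    // be a valid input, as the docs above note.
    public static void Gemm(int m, int n, int k, double alpha,
                            double[] a, int lda,
                            double[] b, int ldb,
                            double beta,
                            double[] c, int ldc)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i < m; i++)
            {
                double sum = 0.0;
                for (int p = 0; p < k; p++)
                    sum += a[i + p * lda] * b[p + j * ldb];
                c[i + j * ldc] = alpha * sum + beta * c[i + j * ldc];
            }
    }
}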
+
+
+
+ This function performs the matrix-matrix multiplication C = alpha * op(A) * op(B) + beta * C where
+ alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions
+ op(A) m*k, op(B) k*n and C m*n, respectively.
+
+ operation op(A) that is non- or (conj.) transpose.
+ operation op(B) that is non- or (conj.) transpose.
+ number of rows of matrix op(A) and C.
+ number of columns of matrix op(B) and C.
+ number of columns of op(A) and rows of op(B).
+ scalar used for multiplication.
+ array of dimensions lda * k.
+ leading dimension of two-dimensional array used to store matrix A.
+ enumerant specifying the datatype of matrix A.
+ array of dimensions ldb * n.
+ leading dimension of two-dimensional array used to store matrix B.
+ enumerant specifying the datatype of matrix B.
+ scalar used for multiplication.
+ array of dimensions ldc * n.
+ leading dimension of two-dimensional array used to store matrix C.
+ enumerant specifying the datatype of matrix C.
+
+
+
+ This function performs the symmetric rank-k update C = alpha * op(A) * op(A)^T + beta * C where
+ alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k.
+
+ indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix op(A) and C.
+ number of columns of matrix op(A).
+ scalar used for multiplication.
+ array of dimensions lda * k.
+ leading dimension of two-dimensional array used to store matrix A.
+ scalar used for multiplication.
+ array of dimensions ldc * n.
+ leading dimension of two-dimensional array used to store matrix C.
+
+
+
+ This function performs the Hermitian rank-k update C = alpha * op(A) * op(A)^H + beta * C where
+ alpha and beta are scalars, C is a Hermitian matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k.
+
+ indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix op(A) and C.
+ number of columns of matrix op(A).
+ scalar used for multiplication.
+ array of dimensions lda * k.
+ leading dimension of two-dimensional array used to store matrix A.
+ scalar used for multiplication.
+ array of dimensions ldc * n.
+ leading dimension of two-dimensional array used to store matrix C.
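The rank-k update combines the gemm loop structure with the triangular storage of the rank-1 routines: C = alpha * A * A^T is symmetric, so only one triangle of C is touched. A minimal lower-mode CPU sketch with op(A) = A (SyrkSketch is an illustrative name, not the library's API):

internal static class SyrkSketch
{
    // C = alpha * A * A^T + beta * C; A is n*k, column-major, and only
    // the lower triangle of the n*n matrix C is referenced and updated.
    public static void SyrkLower(int n, int k, double alpha,
                                 double[] a, int lda,
                                 double beta,
                                 double[] c, int ldc)
    {
        for (int j = 0; j < n; j++)
            for (int i = j; i < n; i++)
            {
                double sum = 0.0;
                for (int p = 0; p < k; p++)
                    sum += a[i + p * lda] * a[j + p * lda];
                c[i + j * ldc] = alpha * sum + beta * c[i + j * ldc];
            }
    }
}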
+
+
+
+ This function performs the symmetric rank-2k update C = alpha * (op(A) * op(B)^T + op(B) * op(A)^T) + beta * C where
+ alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k
+ and op(B) n*k, respectively.
+
+ indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix op(A), op(B) and C.
+ number of columns of matrix op(A) and op(B).
+ scalar used for multiplication.
+ array of dimensions lda * k.
+ leading dimension of two-dimensional array used to store matrix A.
+ array of dimensions ldb * k.
+ leading dimension of two-dimensional array used to store matrix B.
+ scalar used for multiplication.
+ array of dimensions ldc * n.
+ leading dimension of two-dimensional array used to store matrix C.
+
+
+
+ This function performs the Hermitian rank-2k update C = alpha * (op(A) * op(B)^H + op(B) * op(A)^H) + beta * C where
+ alpha and beta are scalars, C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k
+ and op(B) n*k, respectively.
+
+ indicates whether the lower or upper part of matrix C is stored; the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix op(A), op(B) and C.
+ number of columns of matrix op(A) and op(B).
+ scalar used for multiplication.
+ array of dimensions lda * k.
+ leading dimension of two-dimensional array used to store matrix A.
+ array of dimensions ldb * k.
+ leading dimension of two-dimensional array used to store matrix B.
+ scalar used for multiplication.
+ array of dimensions ldc * n.
+ leading dimension of two-dimensional array used to store matrix C.
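The rank-2k form is the two-matrix generalization of the rank-2 vector update earlier in this file. A minimal lower-mode CPU sketch with op = identity (Syr2kSketch is an illustrative name, not the library's API):

internal static class Syr2kSketch
{
    // C = alpha * (A * B^T + B * A^T) + beta * C; A and B are n*k,
    // column-major, and only the lower triangle of C is updated.
    public static void Syr2kLower(int n, int k, double alpha,
                                  double[] a, int lda,
                                  double[] b, int ldb,
                                  double beta,
                                  double[] c, int ldc)
    {
        for (int j = 0; j < n; j++)
            for (int i = j; i < n; i++)
            {
                double sum = 0.0;
                for (int p = 0; p < k; p++)
                    sum += a[i + p * lda] * b[j + p * ldb]
                         + b[i + p * ldb] * a[j + p * lda];
                c[i + j * ldc] = alpha * sum + beta * c[i + j * ldc];
            }
    }
}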
+ + + + This function performs the Hermitian rank-2k update C = alpha * (Op(A)*Op(B)^H + Op(B)*Op(A)^H) + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-2k update C = alpha * (Op(A)*Op(B)^H + Op(B)*Op(A)^H) + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C.
+ number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C.
+ number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C.
+ number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A)*Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + real scalar used for multiplication, if beta==0 then C does not have to be a valid input. + array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively.
+ + indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + real scalar used for multiplication, if beta==0 then C does not have to be a valid input. + array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + real scalar used for multiplication, if beta==0 then C does not have to be a valid input. + array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. 
+ + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. 
+ array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. 
+ number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. 
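The Symm/Hemm overloads above wrap the symmetric and Hermitian matrix-matrix products, where SideMode.Left/Right corresponds to CUBLAS_SIDE_LEFT/RIGHT in the C API. A hedged, illustrative sketch of the underlying cublasDsymm call (sizes and data are assumptions, not part of the patch); note that only the indicated triangle of A is ever read:

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    /* C (2x1) = alpha*A*B + beta*C with A 2x2 symmetric, upper part stored. */
    const int m = 2, n = 1;
    const double alpha = 1.0, beta = 0.0;
    /* Column-major A: only hA[0], hA[2], hA[3] (the upper triangle) are read;
       the strictly lower entry hA[1] is never referenced.                    */
    double hA[4] = { 2.0, -99.0, 1.0, 3.0 };   /* logically A = [[2,1],[1,3]] */
    double hB[2] = { 1.0, 1.0 }, hC[2] = { 0.0, 0.0 };

    double *dA, *dB, *dC;
    cudaMalloc((void**)&dA, sizeof(hA));
    cudaMalloc((void**)&dB, sizeof(hB));
    cudaMalloc((void**)&dC, sizeof(hC));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);
    cudaMemcpy(dC, hC, sizeof(hC), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    cublasDsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                m, n, &alpha, dA, m, dB, m, &beta, dC, m);

    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    printf("C = (%g, %g)\n", hC[0], hC[1]);   /* A*B = (3, 4) */

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB); cudaFree(dC);
    return 0;
}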
+ + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
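Because the solution overwrites B, Trsm needs no separate output buffer. A minimal, illustrative C sketch of the underlying cublasDtrsm call (the 2x2 system is an assumption for demonstration, not part of the patch):

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    /* Solve A*X = alpha*B for X, A 2x2 lower triangular; X overwrites B. */
    const int m = 2, n = 1;
    const double alpha = 1.0;
    /* Column-major A = [[2,0],[1,3]]; the upper entry hA[2] is not read. */
    double hA[4] = { 2.0, 1.0, 0.0, 3.0 };
    double hB[2] = { 4.0, 11.0 };

    double *dA, *dB;
    cudaMalloc((void**)&dA, sizeof(hA));
    cudaMalloc((void**)&dB, sizeof(hB));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                m, n, &alpha, dA, m, dB, m);

    cudaMemcpy(hB, dB, sizeof(hB), cudaMemcpyDeviceToHost);
    printf("X = (%g, %g)\n", hB[0], hB[1]);  /* 2*x0=4, x0+3*x1=11 -> (2, 3) */

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB);
    return 0;
}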
+ + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + + + + This function solves the triangular linear system with multiple right-hand sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly.
+ number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input.
+ array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n.
+ leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
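The out-of-place remark in the Trmm summaries is the one behavioral difference from BLAS worth a concrete illustration: the cuBLAS entry point takes a separate output C, and passing B's pointer in the C slot recovers the in-place BLAS semantics. An illustrative C sketch (data and sizes are assumptions, not part of the patch):

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    /* B (2x1) <- alpha*A*B, A 2x2 upper triangular, computed "in place"
       by passing dB as both the B and C arguments.                      */
    const int m = 2, n = 1;
    const double alpha = 1.0;
    /* Column-major A = [[2,1],[0,3]]; the strictly lower hA[1] is not read. */
    double hA[4] = { 2.0, 0.0, 1.0, 3.0 };
    double hB[2] = { 1.0, 2.0 };

    double *dA, *dB;
    cudaMalloc((void**)&dA, sizeof(hA));
    cudaMalloc((void**)&dB, sizeof(hB));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, sizeof(hB), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    cublasDtrmm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                m, n, &alpha, dA, m, dB, m, dB, m);  /* C == B: in-place */

    cudaMemcpy(hB, dB, sizeof(hB), cudaMemcpyDeviceToHost);
    printf("B = (%g, %g)\n", hB[0], hB[1]);  /* (2*1+1*2, 3*2) = (4, 6) */

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dB);
    return 0;
}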
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
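Beyond plain addition, Geam's most common use is an out-of-place transpose: with alpha = 1, beta = 0 and transa = T, C receives A^T. A hedged sketch against the raw cublasDgeam call (sizes and data are illustrative assumptions):

#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void)
{
    /* C (3x2) = 1.0 * A^T + 0.0 * B, an out-of-place transpose of the
       2x3 matrix A. With beta == 0 the B operand is not read, so the
       output buffer itself is passed in that slot (the documented
       B == C, transb == N in-place form).                              */
    const int m = 2, n = 3;              /* A is m x n; C is n x m */
    const double alpha = 1.0, beta = 0.0;
    double hA[6] = { 1, 2, 3, 4, 5, 6 }; /* column-major: A = [[1,3,5],[2,4,6]] */
    double hC[6] = { 0 };

    double *dA, *dC;
    cudaMalloc((void**)&dA, sizeof(hA));
    cudaMalloc((void**)&dC, sizeof(hC));
    cudaMemcpy(dA, hA, sizeof(hA), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                n, m,                 /* dimensions of the result C */
                &alpha, dA, m,
                &beta,  dC, n,        /* B slot: unused since beta == 0 */
                dC, n);

    cudaMemcpy(hC, dC, sizeof(hC), cudaMemcpyDeviceToHost);
    printf("first column of C: %g %g %g\n", hC[0], hC[1], hC[2]);  /* 1 3 5 */

    cublasDestroy(handle);
    cudaFree(dA); cudaFree(dC);
    return 0;
}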
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively.
+ + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda * n. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = A x diag(X) if mode == CUBLAS_SIDE_RIGHT, or + C = diag(X) x A if mode == CUBLAS_SIDE_LEFT. + where A and C are matrices stored in column-major format with dimensions m*n. X is a + vector of size n if mode == CUBLAS_SIDE_RIGHT and of size m if mode == + CUBLAS_SIDE_LEFT. X is gathered from one-dimensional array x with stride incx. The + absolute value of incx is the stride and the sign of incx is the direction of the stride. If incx + is positive, then x is read forward from the first element. Otherwise, x is read backward from the + last element. + + left multiply if mode == CUBLAS_SIDE_LEFT + or right multiply if mode == CUBLAS_SIDE_RIGHT + number of rows of matrix A and C. + number of columns of matrix A and C. + array of dimensions lda x n with lda >= max(1,m) + leading dimension of two-dimensional array used to store the matrix A. + one-dimensional array of size |incx|*m + if mode == CUBLAS_SIDE_LEFT and |incx|*n + if mode == CUBLAS_SIDE_RIGHT + stride of one-dimensional array x. + array of dimensions ldc*n with ldc >= max(1,m). + leading dimension of a two-dimensional array used to store the matrix C. + + + + This function performs the matrix-matrix multiplication C = A x diag(X) if mode == CUBLAS_SIDE_RIGHT, or + C = diag(X) x A if mode == CUBLAS_SIDE_LEFT. + where A and C are matrices stored in column-major format with dimensions m*n. X is a + vector of size n if mode == CUBLAS_SIDE_RIGHT and of size m if mode == + CUBLAS_SIDE_LEFT. X is gathered from one-dimensional array x with stride incx. The + absolute value of incx is the stride and the sign of incx is the direction of the stride. If incx + is positive, then x is read forward from the first element. Otherwise, x is read backward from the + last element. + + left multiply if mode == CUBLAS_SIDE_LEFT + or right multiply if mode == CUBLAS_SIDE_RIGHT + number of rows of matrix A and C. + number of columns of matrix A and C.
+        /// <summary>
+        /// This function performs the matrix-matrix multiplications of an array of matrices:
+        /// C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i], where alpha and beta are scalars, and A, B and C
+        /// are arrays of pointers to matrices stored in column-major format with dimensions op(A[i]) m x k,
+        /// op(B[i]) k x n and C[i] m x n, respectively.
+        /// This function is intended to be used for matrices of small sizes where the launch overhead is a
+        /// significant factor. For small sizes, typically smaller than 100x100, this function significantly
+        /// improves performance compared to making calls to its corresponding cublas&lt;t&gt;gemm routine.
+        /// However, on GPU architectures that support concurrent kernels, it might be advantageous to make
+        /// multiple calls to cublas&lt;t&gt;gemm into different streams as the matrix sizes increase.
+        /// </summary>
+        /// <param name="transa">operation op(A[i]) that is non- or (conj.) transpose.</param>
+        /// <param name="transb">operation op(B[i]) that is non- or (conj.) transpose.</param>
+        /// <param name="m">number of rows of matrix op(A[i]) and C[i].</param>
+        /// <param name="n">number of columns of op(B[i]) and C[i].</param>
+        /// <param name="k">number of columns of op(A[i]) and rows of op(B[i]).</param>
+        /// <param name="alpha">scalar used for multiplication.</param>
+        /// <param name="A">array of device pointers, with each device pointer of dim. lda x k with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.</param>
+        /// <param name="lda">leading dimension of two-dimensional array used to store each matrix A[i].</param>
+        /// <param name="B">array of device pointers, with each device pointer of dim. ldb x n with ldb>=max(1,k) if transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.</param>
+        /// <param name="ldb">leading dimension of two-dimensional array used to store each matrix B[i].</param>
+        /// <param name="beta">scalar used for multiplication. If beta == 0, C does not have to be a valid input.</param>
+        /// <param name="C">array of device pointers. It has dimensions ldc x n with ldc>=max(1,m).</param>
+        /// <param name="ldc">leading dimension of two-dimensional array used to store each matrix C[i].</param>
+        /// <param name="batchCount">number of pointers contained in A, B and C.</param>
+
+        /// <summary>
+        /// This function performs the complex matrix-matrix multiplication using the Gauss complexity
+        /// reduction algorithm, which can lead to an increase in performance of up to 25%:
+        /// C = alpha * op(A) * op(B) + beta * C, where alpha and beta are scalars, and A, B and C are
+        /// matrices stored in column-major format with dimensions op(A) m x k, op(B) k x n and C m x n,
+        /// respectively. For matrix A, op(A) = A if transa == CUBLAS_OP_N, A^T if transa == CUBLAS_OP_T and
+        /// A^H if transa == CUBLAS_OP_C; op(B) is defined similarly for matrix B.
+        /// Note: these two routines are only supported on GPUs with architecture capabilities equal to or
+        /// greater than 5.0.
+        /// </summary>
+        /// <param name="transa">operation op(A[i]) that is non- or (conj.) transpose.</param>
+        /// <param name="transb">operation op(B[i]) that is non- or (conj.) transpose.</param>
+        /// <param name="m">number of rows of matrix op(A[i]) and C[i].</param>
+        /// <param name="n">number of columns of op(B[i]) and C[i].</param>
+        /// <param name="k">number of columns of op(A[i]) and rows of op(B[i]).</param>
+        /// <param name="alpha">scalar used for multiplication.</param>
+        /// <param name="A">array of device pointers, with each device pointer of dim. lda x k with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.</param>
+        /// <param name="lda">leading dimension of two-dimensional array used to store each matrix A[i].</param>
+        /// <param name="B">array of device pointers, with each device pointer of dim. ldb x n with ldb>=max(1,k) if transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.</param>
+        /// <param name="ldb">leading dimension of two-dimensional array used to store each matrix B[i].</param>
+        /// <param name="beta">scalar used for multiplication. If beta == 0, C does not have to be a valid input.</param>
+        /// <param name="C">array of device pointers. It has dimensions ldc x n with ldc>=max(1,m).</param>
+        /// <param name="ldc">leading dimension of two-dimensional array used to store each matrix C[i].</param>
+        /// <param name="batchCount">number of pointers contained in A, B and C.</param>
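A minimal CPU sketch of the batched contract described above: batchCount independent GEMMs, one per pointer in the A/B/C arrays. Only the non-transposed case is shown, and the jagged arrays stand in for the arrays of device pointers; the same per-instance arithmetic applies to the complex gemm3m variant.

```csharp
// Illustration only: each batch entry is an independent C[i] = alpha*A[i]*B[i] + beta*C[i]
// in column-major storage, which is exactly what the GPU routine launches in one call.
internal static class GemmBatchedSketch
{
    public static void GemmBatched(
        int m, int n, int k, double alpha,
        double[][] a, int lda, double[][] b, int ldb,
        double beta, double[][] c, int ldc, int batchCount)
    {
        for (var batch = 0; batch < batchCount; batch++)
        {
            for (var j = 0; j < n; j++)
            for (var i = 0; i < m; i++)
            {
                var acc = 0.0;
                for (var p = 0; p < k; p++)
                    acc += a[batch][i + p * lda] * b[batch][p + j * ldb];
                c[batch][i + j * ldc] = alpha * acc + beta * c[batch][i + j * ldc];
            }
        }
    }
}
```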
+        /// <summary>
+        /// This function performs the matrix-matrix multiplication of a batch of matrices. The batch is
+        /// considered to be "uniform", i.e. all instances have the same dimensions (m, n, k), leading
+        /// dimensions (lda, ldb, ldc) and transpositions (transa, transb) for their respective A, B and C
+        /// matrices. Input matrices A, B and output matrix C for each instance of the batch are located at
+        /// fixed address offsets from their locations in the previous instance. Pointers to the A, B and C
+        /// matrices for the first instance are passed to the function by the user, along with the address
+        /// offsets - strideA, strideB and strideC - that determine the locations of input and output
+        /// matrices in future instances.
+        /// </summary>
+        /// <param name="transa">operation op(A[i]) that is non- or (conj.) transpose.</param>
+        /// <param name="transb">operation op(B[i]) that is non- or (conj.) transpose.</param>
+        /// <param name="m">number of rows of matrix op(A[i]) and C[i].</param>
+        /// <param name="n">number of columns of op(B[i]) and C[i].</param>
+        /// <param name="k">number of columns of op(A[i]) and rows of op(B[i]).</param>
+        /// <param name="alpha">scalar used for multiplication.</param>
+        /// <param name="A">pointer to the A matrix corresponding to the first instance of the batch, with dimensions lda x k with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.</param>
+        /// <param name="lda">leading dimension of two-dimensional array used to store each matrix A[i].</param>
+        /// <param name="strideA">value of type long long int that gives the address offset between A[i] and A[i+1].</param>
+        /// <param name="B">pointer to the B matrix corresponding to the first instance of the batch, with dimensions ldb x n with ldb>=max(1,k) if transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.</param>
+        /// <param name="ldb">leading dimension of two-dimensional array used to store each matrix B[i].</param>
+        /// <param name="strideB">value of type long long int that gives the address offset between B[i] and B[i+1].</param>
+        /// <param name="beta">scalar used for multiplication. If beta == 0, C does not have to be a valid input.</param>
+        /// <param name="C">pointer to the C matrix corresponding to the first instance of the batch, with dimensions ldc x n with ldc>=max(1,m).</param>
+        /// <param name="ldc">leading dimension of two-dimensional array used to store each matrix C[i].</param>
+        /// <param name="strideC">value of type long long int that gives the address offset between C[i] and C[i+1].</param>
+        /// <param name="batchCount">number of GEMMs to perform in the batch.</param>
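The strided-batched addressing rule reduces to plain offset arithmetic; the sketch below (CPU-side, non-transposed case, hypothetical names) shows how instance i of a uniform batch is located inside one flat allocation. A common packing is strideA = lda * k, strideB = ldb * n and strideC = ldc * n, which lays the instances out back to back.

```csharp
// Illustration of the documented addressing: instance i reads A at element offset
// i*strideA (likewise B and C), so a single flat buffer holds the whole uniform batch.
internal static class GemmStridedBatchedSketch
{
    public static void GemmStridedBatched(
        int m, int n, int k, double alpha,
        double[] a, int lda, long strideA,
        double[] b, int ldb, long strideB,
        double beta, double[] c, int ldc, long strideC, int batchCount)
    {
        for (var i = 0; i < batchCount; i++)
        {
            // int casts are fine for a sketch; real device offsets can exceed int range
            var offA = (int)(i * strideA);
            var offB = (int)(i * strideB);
            var offC = (int)(i * strideC);
            for (var col = 0; col < n; col++)
            for (var row = 0; row < m; row++)
            {
                var acc = 0.0;
                for (var p = 0; p < k; p++)
                    acc += a[offA + row + p * lda] * b[offB + p + col * ldb];
                c[offC + row + col * ldc] = alpha * acc + beta * c[offC + row + col * ldc];
            }
        }
    }
}
```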
+        /// <summary>
+        /// This function performs the LU factorization of an array of n x n matrices.
+        /// This function is intended to be used for matrices of small sizes where the launch overhead is a
+        /// significant factor. The current implementation limits the dimension n to 32.
+        /// </summary>
+        /// <param name="n">number of rows and columns of A[i].</param>
+        /// <param name="Aarray">array of device pointers, with each device pointer of dim. n x n with lda>=max(1,n).</param>
+        /// <param name="lda">leading dimension of two-dimensional array used to store each matrix A[i].</param>
+        /// <param name="PivotArray">array of size n x batchSize that contains the permutation vector of each factorization of A[i] stored in a linear fashion.</param>
+        /// <param name="infoArray">If info = 0, the execution is successful. If info = -i, the i-th parameter had an illegal value. If info = i, aii is 0: the factorization has been completed, but U is exactly singular.</param>
+        /// <param name="batchSize">number of pointers contained in Aarray.</param>
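For one batch entry, the factorization above computes an in-place LU decomposition with partial pivoting, P * A = L * U. The CPU sketch below illustrates the shape of that result and the 1-based pivot convention; unlike the real routine, it returns early on an exactly singular pivot instead of completing the factorization.

```csharp
using System;

// Illustration of what each batch entry holds after factorization: the strict lower
// triangle stores the L multipliers (unit diagonal implied), the upper triangle stores U,
// and pivot[k] records the 1-based row swapped into position k.
internal static class GetrfSketch
{
    public static int LuFactor(int n, double[] a, int lda, int[] pivot)
    {
        for (var k = 0; k < n; k++)
        {
            var p = k;                                        // find the largest pivot in column k
            for (var i = k + 1; i < n; i++)
                if (Math.Abs(a[i + k * lda]) > Math.Abs(a[p + k * lda])) p = i;
            pivot[k] = p + 1;                                 // 1-based, LAPACK/cuBLAS style
            if (a[p + k * lda] == 0.0) return k + 1;          // singular U (info = k in the docs)
            if (p != k)                                       // swap full rows k and p
                for (var j = 0; j < n; j++)
                    (a[k + j * lda], a[p + j * lda]) = (a[p + j * lda], a[k + j * lda]);
            for (var i = k + 1; i < n; i++)
            {
                a[i + k * lda] /= a[k + k * lda];             // L multiplier
                for (var j = k + 1; j < n; j++)
                    a[i + j * lda] -= a[i + k * lda] * a[k + j * lda];
            }
        }
        return 0;
    }
}
```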
+        /// <summary>
+        /// Aarray and Carray are arrays of pointers to matrices stored in column-major format with
+        /// dimensions n x n and leading dimensions lda and ldc, respectively.
+        /// This function performs the inversion of matrices A[i] for i = 0, ..., batchSize-1.
+        /// Prior to calling GetriBatched, the matrix A[i] must first be factorized using the routine
+        /// GetrfBatched. After the call to GetrfBatched, the matrix pointed to by Aarray[i] will contain
+        /// the LU factors of the matrix A[i] and the vector pointed to by (PivotArray+i) will contain the
+        /// pivoting sequence.
+        /// Following the LU factorization, GetriBatched uses forward and backward triangular solvers to
+        /// complete the inversion of matrices A[i] for i = 0, ..., batchSize-1. The inversion is
+        /// out-of-place, so the memory space of Carray[i] cannot overlap the memory space of Aarray[i].
+        /// </summary>
+        /// <param name="n">number of rows and columns of Aarray[i].</param>
+        /// <param name="Aarray">array of pointers, with each array of dimension n x n and lda>=max(1,n).</param>
+        /// <param name="lda">leading dimension of two-dimensional array used to store each matrix Aarray[i].</param>
+        /// <param name="PivotArray">array of size n x batchSize that contains the pivoting sequence of each factorization of Aarray[i] stored in a linear fashion.</param>
+        /// <param name="Carray">array of pointers, with each array of dimension n x n and ldc>=max(1,n).</param>
+        /// <param name="ldc">leading dimension of two-dimensional array used to store each matrix Carray[i].</param>
+        /// <param name="infoArray">array of size batchSize where info (=infoArray[i]) contains the result of the inversion of A[i]. If info = 0, the execution is successful. If info = k, U(k,k) is 0: U is exactly singular and the inversion failed.</param>
+        /// <param name="batchSize">number of pointers contained in Aarray.</param>
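Continuing the sketch above: given the LU factors and pivots, the inversion step completes with forward and backward triangular solves, writing out-of-place exactly as the documentation requires. The illustrative version below solves A * x = e_j column by column; all names are hypothetical.

```csharp
// Illustration of the getri step for one batch entry: build A^-1 in c (which must not
// alias lu) by solving A*x = e_j for each column j, using the 1-based pivots from the
// factorization sketch above.
internal static class GetriSketch
{
    public static void LuInvert(int n, double[] lu, int lda, int[] pivot, double[] c, int ldc)
    {
        for (var j = 0; j < n; j++)
        {
            var x = new double[n];
            x[j] = 1.0;
            for (var k = 0; k < n; k++)          // apply the recorded row swaps: x = P * e_j
            {
                var p = pivot[k] - 1;
                if (p != k) (x[k], x[p]) = (x[p], x[k]);
            }
            for (var i = 0; i < n; i++)          // forward solve L*y = P*e_j (unit diagonal)
                for (var k = 0; k < i; k++)
                    x[i] -= lu[i + k * lda] * x[k];
            for (var i = n - 1; i >= 0; i--)     // backward solve U*x = y
            {
                for (var k = i + 1; k < n; k++)
                    x[i] -= lu[i + k * lda] * x[k];
                x[i] /= lu[i + i * lda];
            }
            for (var i = 0; i < n; i++) c[i + j * ldc] = x[i];
        }
    }
}
```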
+ Prior to calling GetriBatched, the matrix A[i] must be factorized first using + the routine GetrfBatched. After the call of GetrfBatched, the matrix + pointing by Aarray[i] will contain the LU factors of the matrix A[i] and the vector + pointing by (PivotArray+i) will contain the pivoting sequence. + Following the LU factorization, GetriBatched uses forward and backward + triangular solvers to complete inversion of matrices A[i] for i = 0, ..., batchSize-1. The + inversion is out-of-place, so memory space of Carray[i] cannot overlap memory space of + Array[i]. + + number of rows and columns of Aarray[i]. + array of pointers to array, with each array of dimension n*n with lda>=max(1,n). + leading dimension of two-dimensional array used to store each matrix Aarray[i]. + array of size n*batchSize that contains the pivoting sequence of each factorization of Aarray[i] stored in a linear fashion. + array of pointers to array, with each array of dimension n*n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store each matrix Carray[i]. + array of size batchSize that info(=infoArray[i]) contains the information of inversion of A[i]. + If info=0, the execution is successful. + If info = k, U(k,k) is 0. The U is exactly singular and the inversion failed. + number of pointers contained in A + + + + Aarray and Carray are arrays of pointers to matrices stored in column-major format + with dimensions n*n and leading dimension lda and ldc respectively. + This function performs the inversion of matrices A[i] for i = 0, ..., batchSize-1. + Prior to calling GetriBatched, the matrix A[i] must be factorized first using + the routine GetrfBatched. After the call of GetrfBatched, the matrix + pointing by Aarray[i] will contain the LU factors of the matrix A[i] and the vector + pointing by (PivotArray+i) will contain the pivoting sequence. + Following the LU factorization, GetriBatched uses forward and backward + triangular solvers to complete inversion of matrices A[i] for i = 0, ..., batchSize-1. The + inversion is out-of-place, so memory space of Carray[i] cannot overlap memory space of + Array[i]. + + number of rows and columns of Aarray[i]. + array of pointers to array, with each array of dimension n*n with lda>=max(1,n). + leading dimension of two-dimensional array used to store each matrix Aarray[i]. + array of size n*batchSize that contains the pivoting sequence of each factorization of Aarray[i] stored in a linear fashion. + array of pointers to array, with each array of dimension n*n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store each matrix Carray[i]. + array of size batchSize that info(=infoArray[i]) contains the information of inversion of A[i]. + If info=0, the execution is successful. + If info = k, U(k,k) is 0. The U is exactly singular and the inversion failed. + number of pointers contained in A + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. 
+ operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. + scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. + operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. + scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. + operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. + scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. 
+ leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. + operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. + scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. + operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. + scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. 
The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. + operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. + scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. + operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. + scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function solves an array of triangular linear systems with multiple right-hand-sides. + The solution overwrites the right-hand-sides on exit. + No test for singularity or near-singularity is included in this function. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. The current implementation limits the dimensions m and n to 32. + + indicates if matrix A[i] is on the left or right of X[i]. + indicates if matrix A[i] lower or upper part is stored, the + other part is not referenced and is inferred from the stored elements. + operation op(A[i]) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix + A[i] are unity and should not be accessed. + number of rows of matrix B[i], with matrix A[i] sized accordingly. + number of columns of matrix B[i], with matrix A[i] is sized accordingly. 
+ scalar used for multiplication, if alpha==0 then A[i] is not + referenced and B[i] does not have to be a valid input. + array of device pointers with each array/device pointerarray + of dim. lda x m with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x n with + lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A[i]. + array of device pointers with each array/device pointerarrayof dim. + ldb x n with ldb>=max(1,m) + leading dimension of two-dimensional array used to store matrix B[i]. + + + + + This function performs the conversion from the triangular packed format to the + triangular format. + If uplo == CUBLAS_FILL_MODE_LOWER then the elements of AP are copied into the + lower triangular part of the triangular matrix A and the upper part of A is left untouched. + If uplo == CUBLAS_FILL_MODE_UPPER then the elements of AP are copied into the + upper triangular part of the triangular matrix A and the lower part of A is left untouched. + + indicates if matrix AP contains lower or upper part of matrix A. + number of rows and columns of matrix A. + array with A stored in packed format. + array of dimensions lda x n , with lda>=max(1,n). The + opposite side of A is left untouched. + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the conversion from the triangular packed format to the + triangular format. + If uplo == CUBLAS_FILL_MODE_LOWER then the elements of AP are copied into the + lower triangular part of the triangular matrix A and the upper part of A is left untouched. + If uplo == CUBLAS_FILL_MODE_UPPER then the elements of AP are copied into the + upper triangular part of the triangular matrix A and the lower part of A is left untouched. + + indicates if matrix AP contains lower or upper part of matrix A. + number of rows and columns of matrix A. + array with A stored in packed format. + array of dimensions lda x n , with lda>=max(1,n). The + opposite side of A is left untouched. + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the conversion from the triangular packed format to the + triangular format. + If uplo == CUBLAS_FILL_MODE_LOWER then the elements of AP are copied into the + lower triangular part of the triangular matrix A and the upper part of A is left untouched. + If uplo == CUBLAS_FILL_MODE_UPPER then the elements of AP are copied into the + upper triangular part of the triangular matrix A and the lower part of A is left untouched. + + indicates if matrix AP contains lower or upper part of matrix A. + number of rows and columns of matrix A. + array with A stored in packed format. + array of dimensions lda x n , with lda>=max(1,n). The + opposite side of A is left untouched. + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the conversion from the triangular packed format to the + triangular format. + If uplo == CUBLAS_FILL_MODE_LOWER then the elements of AP are copied into the + lower triangular part of the triangular matrix A and the upper part of A is left untouched. + If uplo == CUBLAS_FILL_MODE_UPPER then the elements of AP are copied into the + upper triangular part of the triangular matrix A and the lower part of A is left untouched. + + indicates if matrix AP contains lower or upper part of matrix A. + number of rows and columns of matrix A. + array with A stored in packed format. + array of dimensions lda x n , with lda>=max(1,n). The + opposite side of A is left untouched. 
+ leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the conversion from the triangular format to the triangular + packed format. + If uplo == CUBLAS_FILL_MODE_LOWER then the lower triangular part of the triangular + matrix A is copied into the array AP. If uplo == CUBLAS_FILL_MODE_UPPER then then + the upper triangular part of the triangular matrix A is copied into the array AP + + indicates which matrix A lower or upper part is referenced + number of rows and columns of matrix A. + array of dimensions lda x n , with lda>=max(1,n). + leading dimension of two-dimensional array used to store matrix A. + array with A stored in packed format. + + + + This function performs the conversion from the triangular format to the triangular + packed format. + If uplo == CUBLAS_FILL_MODE_LOWER then the lower triangular part of the triangular + matrix A is copied into the array AP. If uplo == CUBLAS_FILL_MODE_UPPER then then + the upper triangular part of the triangular matrix A is copied into the array AP + + indicates which matrix A lower or upper part is referenced + number of rows and columns of matrix A. + array of dimensions lda x n , with lda>=max(1,n). + leading dimension of two-dimensional array used to store matrix A. + array with A stored in packed format. + + + + This function performs the conversion from the triangular format to the triangular + packed format. + If uplo == CUBLAS_FILL_MODE_LOWER then the lower triangular part of the triangular + matrix A is copied into the array AP. If uplo == CUBLAS_FILL_MODE_UPPER then then + the upper triangular part of the triangular matrix A is copied into the array AP + + indicates which matrix A lower or upper part is referenced + number of rows and columns of matrix A. + array of dimensions lda x n , with lda>=max(1,n). + leading dimension of two-dimensional array used to store matrix A. + array with A stored in packed format. + + + + This function performs the conversion from the triangular format to the triangular + packed format. + If uplo == CUBLAS_FILL_MODE_LOWER then the lower triangular part of the triangular + matrix A is copied into the array AP. If uplo == CUBLAS_FILL_MODE_UPPER then then + the upper triangular part of the triangular matrix A is copied into the array AP + + indicates which matrix A lower or upper part is referenced + number of rows and columns of matrix A. + array of dimensions lda x n , with lda>=max(1,n). + leading dimension of two-dimensional array used to store matrix A. + array with A stored in packed format. + + + + This function performs the QR factorization of each Aarray[i] for i = + 0, ...,batchSize-1 using Householder reflections. Each matrix Q[i] is represented + as a product of elementary reflectors and is stored in the lower part of each Aarray[i]. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + cublas]]>geqrfBatched supports arbitrary dimension. + cublas]]>geqrfBatched only supports compute capability 2.0 or above. + + number of rows Aarray[i]. + number of columns of Aarray[i]. + array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. + leading dimension of two-dimensional array used to store each matrix Aarray[i]. + array of pointers to device vector, with each vector of dim. max(1,min(m,n)). 
+ 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid + + + + This function performs the QR factorization of each Aarray[i] for i = + 0, ...,batchSize-1 using Householder reflections. Each matrix Q[i] is represented + as a product of elementary reflectors and is stored in the lower part of each Aarray[i]. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + cublas]]>geqrfBatched supports arbitrary dimension. + cublas]]>geqrfBatched only supports compute capability 2.0 or above. + + number of rows Aarray[i]. + number of columns of Aarray[i]. + array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. + leading dimension of two-dimensional array used to store each matrix Aarray[i]. + array of pointers to device vector, with each vector of dim. max(1,min(m,n)). + 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid + + + + This function performs the QR factorization of each Aarray[i] for i = + 0, ...,batchSize-1 using Householder reflections. Each matrix Q[i] is represented + as a product of elementary reflectors and is stored in the lower part of each Aarray[i]. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + cublas]]>geqrfBatched supports arbitrary dimension. + cublas]]>geqrfBatched only supports compute capability 2.0 or above. + + number of rows Aarray[i]. + number of columns of Aarray[i]. + array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. + leading dimension of two-dimensional array used to store each matrix Aarray[i]. + array of pointers to device vector, with each vector of dim. max(1,min(m,n)). + 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid + + + + This function performs the QR factorization of each Aarray[i] for i = + 0, ...,batchSize-1 using Householder reflections. Each matrix Q[i] is represented + as a product of elementary reflectors and is stored in the lower part of each Aarray[i]. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + cublas]]>geqrfBatched supports arbitrary dimension. + cublas]]>geqrfBatched only supports compute capability 2.0 or above. + + number of rows Aarray[i]. + number of columns of Aarray[i]. + array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. + leading dimension of two-dimensional array used to store each matrix Aarray[i]. + array of pointers to device vector, with each vector of dim. max(1,min(m,n)). + 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid + + + + This function find the least squares solution of a batch of overdetermined systems. + On exit, each Aarray[i] is overwritten with their QR factorization and each Carray[i] is overwritten with the least square solution + GelsBatched supports only the non-transpose operation and only solves overdetermined + systems (m >= n). + GelsBatched only supports compute capability 2.0 or above. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. 
+ + operation op(Aarray[i]) that is non- or (conj.) transpose. Only non-transpose operation is currently supported. + number of rows Aarray[i]. + number of columns of each Aarray[i] and rows of each Carray[i]. + number of columns of each Carray[i]. + array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. + leading dimension of two-dimensional array used to store each matrix Aarray[i] + array of pointers to device array, with each array of dim. m x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix Carray[i]. + null or optional array of integers of dimension batchsize. + 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid + + + + This function find the least squares solution of a batch of overdetermined systems. + On exit, each Aarray[i] is overwritten with their QR factorization and each Carray[i] is overwritten with the least square solution + GelsBatched supports only the non-transpose operation and only solves overdetermined + systems (m >= n). + GelsBatched only supports compute capability 2.0 or above. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + + operation op(Aarray[i]) that is non- or (conj.) transpose. Only non-transpose operation is currently supported. + number of rows Aarray[i]. + number of columns of each Aarray[i] and rows of each Carray[i]. + number of columns of each Carray[i]. + array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. + leading dimension of two-dimensional array used to store each matrix Aarray[i] + array of pointers to device array, with each array of dim. m x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix Carray[i]. + null or optional array of integers of dimension batchsize. + 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid + + + + This function find the least squares solution of a batch of overdetermined systems. + On exit, each Aarray[i] is overwritten with their QR factorization and each Carray[i] is overwritten with the least square solution + GelsBatched supports only the non-transpose operation and only solves overdetermined + systems (m >= n). + GelsBatched only supports compute capability 2.0 or above. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + + operation op(Aarray[i]) that is non- or (conj.) transpose. Only non-transpose operation is currently supported. + number of rows Aarray[i]. + number of columns of each Aarray[i] and rows of each Carray[i]. + number of columns of each Carray[i]. + array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. + leading dimension of two-dimensional array used to store each matrix Aarray[i] + array of pointers to device array, with each array of dim. m x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix Carray[i]. + null or optional array of integers of dimension batchsize. + 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid + + + + This function find the least squares solution of a batch of overdetermined systems. 
+ On exit, each Aarray[i] is overwritten with their QR factorization and each Carray[i] is overwritten with the least square solution + GelsBatched supports only the non-transpose operation and only solves overdetermined + systems (m >= n). + GelsBatched only supports compute capability 2.0 or above. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + + operation op(Aarray[i]) that is non- or (conj.) transpose. Only non-transpose operation is currently supported. + number of rows Aarray[i]. + number of columns of each Aarray[i] and rows of each Carray[i]. + number of columns of each Carray[i]. array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches. leading dimension of two-dimensional array used to store each matrix Aarray[i] array of pointers to device array, with each array of dim. m x n with ldc>=max(1,m). @@ -7935,178 +9858,6129 @@ null or optional array of integers of dimension batchsize. 0, if the parameters passed to the function are valid, <0, if the parameter in postion -value is invalid - + + + This function solves an array of systems of linear equations of the form: + op(A[i]) X[i] = a B[i] + where A[i] is a matrix which has been LU factorized with pivoting, X[i] and B[i] are + n x nrhs matrices. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + + operation op(A) that is non- or (conj.) transpose. + number of rows and columns of Aarray[i]. + number of columns of Barray[i]. + array of pointers to array, with each array of dim. n + x n with lda>=max(1,n). + leading dimension of two-dimensional array used to store + each matrix Aarray[i]. + array of size n x batchSize that contains the pivoting + sequence of each factorization of Aarray[i] stored in a + linear fashion. If devIpiv is nil, pivoting for all Aarray[i] + is ignored. + array of pointers to array, with each array of dim. n + x nrhs with ldb>=max(1,n). + leading dimension of two-dimensional array used to store + each solution matrix Barray[i]. + number of pointers contained in A + If info=0, the execution is successful. If info = -j, the j-th parameter had an illegal value. + + + + This function solves an array of systems of linear equations of the form: + op(A[i]) X[i] = a B[i] + where A[i] is a matrix which has been LU factorized with pivoting, X[i] and B[i] are + n x nrhs matrices. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. + + operation op(A) that is non- or (conj.) transpose. + number of rows and columns of Aarray[i]. + number of columns of Barray[i]. + array of pointers to array, with each array of dim. n + x n with lda>=max(1,n). + leading dimension of two-dimensional array used to store + each matrix Aarray[i]. + array of size n x batchSize that contains the pivoting + sequence of each factorization of Aarray[i] stored in a + linear fashion. If devIpiv is nil, pivoting for all Aarray[i] + is ignored. + array of pointers to array, with each array of dim. n + x nrhs with ldb>=max(1,n). + leading dimension of two-dimensional array used to store + each solution matrix Barray[i]. + number of pointers contained in A + If info=0, the execution is successful. If info = -j, the j-th parameter had an illegal value. 
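Taken together, GetrfBatched and GetrsBatched cover the factorize-then-solve workflow these comments describe. Below is a minimal sketch of that workflow, assuming ManagedCuda-style wrapper methods whose parameters mirror the documented lists; the exact overload names and shapes in the wrapped library may differ, and the host-data upload is elided.

```csharp
using System;
using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.CudaBlas;

class BatchedLuSolveSketch
{
    static void Main()
    {
        const int n = 4, nrhs = 1, batchSize = 8;
        using var blas = new CudaBlas();

        // One n x n matrix A[i] and one n x nrhs right-hand side B[i] per batch
        // entry, each in its own device allocation (column-major, as the docs assume).
        var aPtrs = new CUdeviceptr[batchSize];
        var bPtrs = new CUdeviceptr[batchSize];
        for (int i = 0; i < batchSize; i++)
        {
            var a = new CudaDeviceVariable<double>(n * n);    // upload host data here
            var b = new CudaDeviceVariable<double>(n * nrhs); // upload host data here
            aPtrs[i] = a.DevicePointer;
            bPtrs[i] = b.DevicePointer;
        }

        // The batched entry points take device arrays of pointers, a pivot array
        // of size n * batchSize and per-batch status information.
        using var dA = new CudaDeviceVariable<CUdeviceptr>(batchSize);
        using var dB = new CudaDeviceVariable<CUdeviceptr>(batchSize);
        using var pivots = new CudaDeviceVariable<int>(n * batchSize);
        using var factorInfo = new CudaDeviceVariable<int>(batchSize);
        dA.CopyToDevice(aPtrs);
        dB.CopyToDevice(bPtrs);

        // LU factorize every A[i] in place (n is limited to 32 per the docs above).
        // Method names/signatures are assumed to mirror the documented parameters.
        blas.GetrfBatched(n, dA, n, pivots, factorInfo, batchSize);

        // Solve op(A[i]) X[i] = B[i]; each solution overwrites B[i].
        int info = 0;
        blas.GetrsBatched(Operation.NonTranspose, n, nrhs, dA, n, pivots, dB, n, ref info, batchSize);
        Console.WriteLine($"GetrsBatched info: {info}");
    }
}
```

The pivot array is shared between the two calls; that is what ties the solve to the factorization produced by GetrfBatched.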
+
+ Copies elements from a vector hostSourceVector in CPU memory space to a vector devDestVector
+ in GPU memory space. Storage spacing between consecutive elements
+ is incrHostSource for the source vector hostSourceVector and incrDevDest for the destination vector
+ devDestVector. Column-major format for two-dimensional matrices
+ is assumed throughout CUBLAS. Therefore, if the increment for a vector
+ is equal to 1, this accesses a column vector, while using an increment
+ equal to the leading dimension of the respective matrix accesses a row vector.
+ Vector datatype
+ Source vector in host memory
+ Destination vector in device memory
+
+ Copies elements from a vector devSourceVector in GPU memory space to a vector hostDestVector
+ in CPU memory space. Storage spacing between consecutive elements
+ is incrDevSource for the source vector devSourceVector and incrHostDest for the destination vector
+ hostDestVector. Column-major format for two-dimensional matrices
+ is assumed throughout CUBLAS. Therefore, if the increment for a vector
+ is equal to 1, this accesses a column vector, while using an increment
+ equal to the leading dimension of the respective matrix accesses a row vector.
+ Vector datatype
+ Source vector in device memory
+ Destination vector in host memory
+
+ Copies a tile of rows x cols elements from a matrix hostSource in CPU memory
+ space to a matrix devDest in GPU memory space. Both matrices are assumed to be stored in
+ column-major format, with the leading dimension (i.e. number of rows) of
+ source matrix hostSource provided in ldHostSource, and the leading dimension of matrix devDest
+ provided in ldDevDest.
+
+ Copies a tile of rows x cols elements from a matrix devSource in GPU memory
+ space to a matrix hostDest in CPU memory space. Both matrices are assumed to be stored in
+ column-major format, with the leading dimension (i.e. number of rows) of
+ source matrix devSource provided in ldDevSource, and the leading dimension of matrix hostDest
+ provided in ldHostDest.
+
+ This function copies the vector x into the vector y.
+
+ This function interchanges the elements of vector x and y.
+
+ This function computes the Euclidean norm of the vector x.
+
+ This function computes the dot product of vectors x and y.
+
+ This function computes the dot product of vectors x and y.
+ Notice that the conjugates of the elements of vector x are used.
+
+ This function scales the vector x by the scalar alpha and overwrites it with the result.
+
+ This function multiplies the vector x by the scalar alpha and adds it to the vector y,
+ overwriting the latter with the result.
+
+ This function finds the (smallest) index of the element of minimum magnitude.
+ First index starts at 1 (Fortran notation).
+
+ This function finds the (smallest) index of the element of maximum magnitude.
+ First index starts at 1 (Fortran notation).
+
+ This function computes the sum of the absolute values of the elements of vector x.
+
+ This function applies the Givens rotation matrix G = |c s; -s c| to vectors x and y.
+ Cosine component
+ Sine component
+
+ This function applies the modified Givens transformation H = |h11 h12; h21 h22| to vectors x and y.
+ The elements h11, h21, h12 and h22 of the 2x2 matrix H are stored in param[1], param[2], param[3] and param[4], respectively.
+ The flag = param[0] defines the following predefined values for the matrix H entries:
+ flag=-1.0: H = |h11 h12; h21 h22|
+ flag= 0.0: H = |1.0 h12; h21 1.0|
+ flag= 1.0: H = |h11 1.0; -1.0 h22|
+ flag=-2.0: H = |1.0 0.0; 0.0 1.0|
+ Notice that the values -1.0, 0.0 and 1.0 implied by the flag are not stored in param.
+
+ This function performs the triangular matrix-vector multiplication x = op(A) x, where A is a triangular matrix stored in
+ lower or upper mode with or without the main diagonal, and x is a vector. n is given by x.Size.
+ indicates if the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+
+ This function performs the triangular banded matrix-vector multiplication x = op(A) x, where A is a triangular banded matrix, and x is a vector. n is given by x.Size.
+ indicates if the lower or upper part of matrix A is stored; the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ number of sub- and super-diagonals of matrix A.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
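Since the Trmv/Tbmv comments above note that n is inferred from x.Size, a call needs only the fill mode, operation, diagonal type, the matrix with its leading dimension, and the vector stride. A minimal sketch under that assumption follows; the enum and method names follow ManagedCuda conventions, but the exact overload shape is assumed rather than confirmed.

```csharp
using System;
using ManagedCuda;
using ManagedCuda.CudaBlas;

class TrmvSketch
{
    static void Main()
    {
        const int n = 3;
        using var blas = new CudaBlas();

        // Column-major 3x3 matrix; only the upper triangle is referenced when
        // FillMode.Upper is passed, the rest is ignored per the docs above.
        // As a dense matrix: A = [[1,2,3],[0,4,5],[0,0,6]].
        double[] hostA = { 1, 0, 0, 2, 4, 0, 3, 5, 6 };
        double[] hostX = { 1, 1, 1 };
        using var dA = new CudaDeviceVariable<double>(n * n);
        using var dX = new CudaDeviceVariable<double>(n);
        dA.CopyToDevice(hostA);
        dX.CopyToDevice(hostX);

        // x = A x with A upper triangular, non-transposed, non-unit diagonal;
        // n is taken from dX.Size, the stride between elements of x is 1.
        blas.Trmv(FillMode.Upper, Operation.NonTranspose, DiagType.NonUnit, dA, n, dX, 1);

        dX.CopyToHost(hostX);
        Console.WriteLine(string.Join(", ", hostX)); // expected: 6, 9, 6
    }
}
```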
+ 
+ 
+ 
+ This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function performs the triangular packed matrix-vector multiplication x= Op(A) x where A is a triangular matrix stored in packed format, and x is a vector. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or
+ upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or
+ upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or
+ upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular linear system with a single right-hand-side Op(A)x = b where A is a triangular matrix stored in lower or
+ upper mode with or without the main diagonal, and x and b are vectors. The solution x overwrites the right-hand-sides b on exit. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
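As a point of reference for what these triangular solves compute, a CPU sketch of the dense, upper-triangular, non-transposed, non-unit-diagonal case follows (column-major A, unit stride, illustrative names; as noted above, there is no singularity test):

    static void SolveUpperTriangular(double[] a, int lda, double[] x, int n)
    {
        // x holds b on entry and the solution on exit (b is overwritten).
        for (int i = n - 1; i >= 0; i--)
        {
            double sum = x[i];
            for (int j = i + 1; j < n; j++)
                sum -= a[i + j * lda] * x[j]; // A[i,j] in column-major storage
            x[i] = sum / a[i + i * lda];      // divide by the diagonal entry
        }
    }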
+ 
+ 
+ 
+ This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the packed triangular linear system with a single right-hand-side Op(A) x = b where A is a triangular matrix stored in packed format, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ number of sub- and super-diagonals of matrix A.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ number of sub- and super-diagonals of matrix A.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ number of sub- and super-diagonals of matrix A.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function solves the triangular banded linear system with a single right-hand-side Op(A) x = b where A is a triangular banded matrix, and x and b are vectors.
+ The solution x overwrites the right-hand-sides b on exit. n is given by x.Size. No test for singularity or near-singularity is included in this function.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ operation op(A) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A are unity and should not be accessed.
+ number of sub- and super-diagonals of matrix A.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ 
+ 
+ 
+ This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha and beta are scalars.
+ 
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix A.
+ number of columns of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha and beta are scalars.
+ 
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix A.
+ number of columns of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha and beta are scalars.
+ 
+ operation op(A) that is non- or (conj.) transpose.
+ number of rows of matrix A.
+ number of columns of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
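The banded triangular routines above rely on the conventional BLAS banded layout rather than a full two-dimensional matrix. Assuming that layout (an assumption, since the storage scheme is not spelled out here), element A(i,j) of an upper-triangular band with k super-diagonals can be located as follows:

    static double UpperBandedElement(double[] ab, int lda, int k, int i, int j)
    {
        // Column j of the band array holds A(j-k..j, j); A(i,j) sits at
        // row (k + i - j). Valid only for max(0, j - k) <= i <= j.
        return ab[(k + i - j) + j * lda];
    }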
+ + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. 
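For the non-transposed case, the gemv contract documented above reduces to the following CPU sketch (column-major A, unit strides, illustrative names; the beta == 0 branch mirrors the note that y need not be valid input then):

    static void Gemv(int m, int n, double alpha, double[] a, int lda,
                     double[] x, double beta, double[] y)
    {
        for (int i = 0; i < m; i++)
        {
            double sum = 0.0;
            for (int j = 0; j < n; j++)
                sum += a[i + j * lda] * x[j]; // A[i,j], column-major
            y[i] = alpha * sum + (beta == 0.0 ? 0.0 : beta * y[i]);
        }
    }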
+ + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + scalar used for multiplication. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. 
+ scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the matrix-vector multiplication y = alpha * Op(A) * x + beta * y where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha and beta are scalars. + + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix A. + number of columns of matrix A. + number of subdiagonals of matrix A. + number of superdiagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. 
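The banded gemv documented above restricts each row's dot product to the kl sub-diagonals and ku super-diagonals. A CPU sketch of the non-transposed case, again assuming the standard BLAS banded layout:

    static void Gbmv(int m, int n, int kl, int ku, double alpha,
                     double[] ab, int lda, double[] x, double beta, double[] y)
    {
        for (int i = 0; i < m; i++)
        {
            double sum = 0.0;
            int jLo = Math.Max(0, i - kl);      // leftmost column in the band
            int jHi = Math.Min(n - 1, i + ku);  // rightmost column in the band
            for (int j = jLo; j <= jHi; j++)
                sum += ab[(ku + i - j) + j * lda] * x[j]; // banded A(i,j)
            y[i] = alpha * sum + (beta == 0.0 ? 0.0 : beta * y[i]);
        }
    }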
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the Hermitian matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the Hermitian matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in lower or upper mode,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
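The symmetric and Hermitian routines above read only one triangle of A and reconstruct the other half by symmetry, which is what "the other part is not referenced" means. A CPU sketch for the symmetric case with the upper triangle stored (illustrative names, unit strides):

    static void SymvUpper(int n, double alpha, double[] a, int lda,
                          double[] x, double beta, double[] y)
    {
        for (int i = 0; i < n; i++)
        {
            double sum = 0.0;
            for (int j = 0; j < n; j++)
            {
                // Only the upper triangle is stored: A(i,j) = A(j,i) for i > j.
                double aij = i <= j ? a[i + j * lda] : a[j + i * lda];
                sum += aij * x[j];
            }
            y[i] = alpha * sum + (beta == 0.0 ? 0.0 : beta * y[i]);
        }
    }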
+ + + + This function performs the Hermitian matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in lower or upper mode, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the Hermitian matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in lower or upper mode, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix with k subdiagonals and superdiagonals, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of sub- and super-diagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix with k subdiagonals and superdiagonals, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of sub- and super-diagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix with k subdiagonals and superdiagonals, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. 
+ + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of sub- and super-diagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix with k subdiagonals and superdiagonals, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of sub- and super-diagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the Hermitian banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix with k subdiagonals and superdiagonals, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of sub- and super-diagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix with k subdiagonals and superdiagonals, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of sub- and super-diagonals of matrix A. + scalar used for multiplication. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + vector with n elements. + stride between consecutive elements of x. + scalar used for multiplication, if beta==0 then y does not have to be a valid input. + vector with n elements. + stride between consecutive elements of y. + + + + This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix with k subdiagonals and superdiagonals, + x and y are vectors, and alpha and beta are scalars. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of sub- and super-diagonals of matrix A. + scalar used for multiplication. 
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric banded matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix with k subdiagonals and superdiagonals,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ number of sub- and super-diagonals of matrix A.
+ scalar used for multiplication.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the symmetric packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n symmetric matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the Hermitian packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the Hermitian packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the Hermitian packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the Hermitian packed matrix-vector multiplication y = alpha * A * x + beta * y where A is a n*n Hermitian matrix stored in packed format,
+ x and y are vectors, and alpha and beta are scalars. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ array with A stored in packed format.
+ vector with n elements.
+ stride between consecutive elements of x.
+ scalar used for multiplication, if beta==0 then y does not have to be a valid input.
+ vector with n elements.
+ stride between consecutive elements of y.
+ 
+ 
+ 
+ This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
+ 
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size. + + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size. + + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size. + + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size. + + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size. + + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the rank-1 update A = alpha * x * y^H + A where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size. + + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the rank-1 update A = alpha * x * y^H + A where A is a m*n matrix stored in column-major format, + x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size. + + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. 
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
+ 
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the rank-1 update A = alpha * x * y^T + A where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
+ 
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the rank-1 update A = alpha * x * y^H + A where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
+ 
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the rank-1 update A = alpha * x * y^H + A where A is a m*n matrix stored in column-major format,
+ x and y are vectors, and alpha is a scalar. m = x.Size, n = y.Size.
+ 
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ vector with n elements.
+ stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is a n*n Hermitian matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is a n*n Hermitian matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is a n*n Hermitian matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is a n*n Hermitian matrix stored in column-major format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array of dimensions lda * n, with lda >= max(1,n).
+ leading dimension of two-dimensional array used to store matrix A.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in packed format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array with A stored in packed format.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in packed format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array with A stored in packed format.
+ 
+ 
+ 
+ This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is a n*n symmetric matrix stored in packed format,
+ x is a vector, and alpha is a scalar. n is given by x.Size.
+ 
+ indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication.
+ vector with n elements.
+ stride between consecutive elements of x.
+ array with A stored in packed format. + + + + This function performs the symmetric rank-1 update A = alpha * x * x^T + A where A is an n*n symmetric matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + array with A stored in packed format. + + + + This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + array with A stored in packed format. + + + + This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + array with A stored in packed format. + + + + This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + array with A stored in packed format. + + + + This function performs the Hermitian rank-1 update A = alpha * x * x^H + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + array with A stored in packed format. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements.
+ scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y.
+ array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A. + + + + This function performs the symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in column-major format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array of dimensions lda * n, with lda >= max(1,n). + leading dimension of two-dimensional array used to store matrix A.
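
[Reviewer note, not part of the patch: a minimal CPU sketch of the rank-2 update documented above, to make the column-major and triangle conventions concrete. The helper name and the assumption of positive strides are ours; this is illustrative, not the wrapper's API.]

```csharp
// Illustrative reference only: symmetric rank-2 update
// A = alpha * (x * y^T + y * x^T) + A, column-major storage.
// Element A(i,j) lives at a[i + j * lda]; only the chosen triangle is touched.
static void Syr2Reference(bool upper, int n, double alpha,
                          double[] x, int incx, double[] y, int incy,
                          double[] a, int lda)
{
    for (int j = 0; j < n; j++)
    {
        int first = upper ? 0 : j;      // upper: rows 0..j; lower: rows j..n-1
        int last  = upper ? j : n - 1;
        for (int i = first; i <= last; i++)
        {
            a[i + j * lda] += alpha * (x[i * incx] * y[j * incy]
                                     + y[i * incy] * x[j * incx]);
        }
    }
}
```
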
+ + + + This function performs the packed symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format. + + + + This function performs the packed symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format. + + + + This function performs the packed symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format. + + + + This function performs the packed symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A where A is an n*n symmetric matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format. + + + + This function performs the packed Hermitian rank-2 update A = alpha * (x * y^H + y * x^H) + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format. + + + + This function performs the packed Hermitian rank-2 update A = alpha * (x * y^H + y * x^H) + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format.
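
[Reviewer note, not part of the patch: the "packed format" above stores only the selected triangle of an n*n matrix, column by column, in a length n*(n+1)/2 array. A sketch of the index mapping and the packed rank-2 update, with helper names of our own choosing and unit strides assumed.]

```csharp
// Illustrative reference only: packed triangular storage.
static int PackedIndex(bool upper, int n, int i, int j)
{
    // caller must pass an (i, j) inside the stored triangle
    return upper
        ? i + j * (j + 1) / 2            // upper: i <= j
        : i + ((2 * n - j - 1) * j) / 2; // lower: i >= j
}

// Packed symmetric rank-2 update A = alpha * (x * y^T + y * x^T) + A.
static void Spr2Reference(bool upper, int n, double alpha,
                          double[] x, double[] y, double[] ap)
{
    for (int j = 0; j < n; j++)
        for (int i = upper ? 0 : j; i <= (upper ? j : n - 1); i++)
            ap[PackedIndex(upper, n, i, j)] += alpha * (x[i] * y[j] + y[i] * x[j]);
}
```
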
+ + + + This function performs the packed Hermitian rank-2 update A = alpha * (x * y^H + y * x^H) + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format. + + + + This function performs the packed Hermitian rank-2 update A = alpha * (x * y^H + y * x^H) + A where A is an n*n Hermitian matrix stored in packed format, + x is a vector, and alpha is a scalar. n is given by x.Size = y.Size. + + indicates if matrix A lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + scalar used for multiplication. + vector with n elements. + stride between consecutive elements of x. + vector with n elements. + stride between consecutive elements of y. + array with A stored in packed format. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n.
+ leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + enumerant specifying the datatype of matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ enumerant specifying the datatype of matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + enumerant specifying the datatype of matrix C. + + + + This function performs the matrix-matrix multiplication C = alpha * Op(A) * Op(B) + beta * C where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*k, op(B) k*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + number of columns of op(A) and rows of op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + enumerant specifying the datatype of matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + enumerant specifying the datatype of matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + enumerant specifying the datatype of matrix C. + + + + This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where + alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where + alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n.
+ leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where + alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where + alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where + alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where + alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-k update C = alpha * Op(A)*Op(A)^T + beta * C where + alpha and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k.
+ + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-k update C = alpha * Op(A)*Op(A)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-k update C = alpha * Op(A)*Op(A)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-k update C = alpha * Op(A)*Op(A)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-k update C = alpha * Op(A)*Op(A)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A is a matrix with dimensions op(A) n*k. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(A). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A.
+ scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication.
+ array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric rank-2k update C = alpha * (Op(A)*Op(B)^T + Op(B)*Op(A)^T) + beta * C where + alpha and beta are scalars, and C is a symmetric matrix stored in lower or upper mode, and A and B are matrices with dimensions Op(A) n*k + and Op(B) n*k, respectively.
+ + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-2k update C = alpha * (Op(A)*Op(B)^H + Op(B)*Op(A)^H) + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-2k update C = alpha * (Op(A)*Op(B)^H + Op(B)*Op(A)^H) + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian rank-2k update C = alpha * (Op(A)*Op(B)^H + Op(B)*Op(A)^H) + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
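
[Reviewer note, not part of the patch: a minimal CPU sketch of the rank-2k update described above, for the non-transposed case only (op(A)=A, op(B)=B, both n*k, column-major). Helper name and the transpose-free simplification are ours.]

```csharp
// Illustrative reference only: symmetric rank-2k update
// C = alpha * (A * B^T + B * A^T) + beta * C, writing only the stored triangle.
static void Syr2kReference(bool upper, int n, int k, double alpha,
                           double[] a, int lda, double[] b, int ldb,
                           double beta, double[] c, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = upper ? 0 : j; i <= (upper ? j : n - 1); i++)
        {
            double s = 0.0;
            for (int l = 0; l < k; l++)          // (A*B^T + B*A^T)(i,j)
                s += a[i + l * lda] * b[j + l * ldb]
                   + b[i + l * ldb] * a[j + l * lda];
            c[i + j * ldc] = alpha * s + beta * c[i + j * ldc];
        }
}
```
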
+ + + + This function performs the Hermitian rank-2k update C = alpha * (Op(A)*Op(B)^H + Op(B)*Op(A)^H) + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimensions lda * k. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * k. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the symmetric rank-k update C = alpha * Op(A) * Op(B)^T + beta * C where alpha + and beta are scalars, C is a symmetric matrix stored in lower or upper mode, and A + and B are matrices with dimensions op(A) n*k and op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other symmetric part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication, if beta==0, then C does not have to be a valid input. + array of dimensions ldc x n with ldc>=max(1,n). + leading dimension of two-dimensional array used to store matrix C.
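
[Reviewer note, not part of the patch: a minimal CPU sketch of the syrkx-style "variation" just documented, non-transposed case only: a general product C = alpha * A * B^T + beta * C where only the stored triangle of C is written. Helper name is ours.]

```csharp
// Illustrative reference only: C = alpha * A * B^T + beta * C on one triangle.
// A and B are n*k, column-major; the unreferenced triangle of C is untouched.
static void SyrkxReference(bool upper, int n, int k, double alpha,
                           double[] a, int lda, double[] b, int ldb,
                           double beta, double[] c, int ldc)
{
    for (int j = 0; j < n; j++)
        for (int i = upper ? 0 : j; i <= (upper ? j : n - 1); i++)
        {
            double s = 0.0;
            for (int l = 0; l < k; l++)
                s += a[i + l * lda] * b[j + l * ldb]; // (A * B^T)(i,j)
            c[i + j * ldc] = alpha * s + beta * c[i + j * ldc];
        }
}
```
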
+ + + + This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + real scalar used for multiplication, if beta==0 then C does not have to be a valid input. + array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + real scalar used for multiplication, if beta==0 then C does not have to be a valid input. + array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix C lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B.
+ real scalar used for multiplication, if beta==0 then C does not have to be a valid input. + array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs a variation of the Hermitian rank-k update C = alpha * Op(A) * Op(B)^H + beta * C where + alpha and beta are scalars, and C is a Hermitian matrix stored in lower or upper mode, and A and B are matrices with dimensions op(A) n*k and Op(B) n*k, respectively. + + indicates if matrix A lower or upper part is stored, the other Hermitian part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + number of rows of matrix op(A), op(B) and C. + number of columns of matrix op(A) and op(B). + scalar used for multiplication. + array of dimension lda x k with lda>=max(1,n) if transa == CUBLAS_OP_N and lda x n with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimension ldb x k with ldb>=max(1,n) if transa == CUBLAS_OP_N and ldb x n with ldb>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store matrix B. + real scalar used for multiplication, if beta==0 then C does not have to be a valid input. + array of dimension ldc x n, with ldc>=max(1,n). The imaginary parts of the diagonal elements are assumed and set to zero. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. 
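Picking up the herkx overloads documented above (the symm overloads continue below): the Hermitian variant takes a complex alpha but a real beta. A minimal fragment, reusing the setup pattern of the syrkx sketch with cuDoubleComplex device buffers (assumed; cuComplex.h comes in via cublas_v2.h):

    /* C = alpha * A * B^H + beta * C; only the upper triangle of C is
       referenced, and the imaginary parts of its diagonal are set to zero. */
    const cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0);
    const double beta = 0.0; /* beta is a real scalar for herkx */
    cublasZherkx(handle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                 n, k, &alpha, d_A, n, d_B, n, &beta, d_C, n);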
+ + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. 
+ number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the symmetric matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a symmetric matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. + + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C. 
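A minimal fragment for the symm overloads (the hemm overloads around it have the same shape with complex scalars); same assumed setup as the syrkx sketch, with A an m x m symmetric matrix and B, C m x n:

    /* C = alpha * A * B + beta * C, symmetric A applied from the left,
       lower triangle of A stored. */
    const double alpha = 1.0, beta = 1.0;
    cublasDsymm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                m, n, &alpha, d_A, m, d_B, m, &beta, d_C, m);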
+ + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the Hermitian matrix-matrix multiplication C = alpha*A*B + beta*C if side==SideMode.Left or C = alpha*B*A + beta*C if side==SideMode.Right + where A is a Hermitian matrix stored in lower or upper mode, B and C are m*n matrices, and alpha and beta are scalars. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + number of rows of matrix C and B, with matrix A sized accordingly. + number of columns of matrix C and B, with matrix A sized accordingly. + scalar used for multiplication. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function solves the triangular linear system with multiple right-hand-sides Op(A)X = alpha*B if side==SideMode.Left or XOp(A) = alpha*B if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, X and B are m*n matrices, and alpha is a scalar. + The solution X overwrites the right-hand-sides B on exit. + + indicates if matrix A is on the left or right of X. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B.
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
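Before the remaining trmm overloads, a minimal fragment for the trsm solve documented above (same assumed setup as the earlier sketches). Note the solve is in place: X overwrites B.

    /* Solve A * X = alpha * B for X, with A m x m lower triangular
       (non-unit diagonal) and B m x n; on return d_B holds the solution X. */
    const double alpha = 1.0;
    cublasDtrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                m, n, &alpha, d_A, m, d_B, m);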
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
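A minimal fragment for the out-of-place trmm documented above (same assumed setup); as the remarks note, passing d_B again in place of d_C recovers the in-place BLAS behaviour:

    /* C = alpha * A * B with A m x m upper triangular on the left;
       B and C are m x n, column-major. */
    const double alpha = 1.0;
    cublasDtrmm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                m, n, &alpha, d_A, m, d_B, m, d_C, m);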
+ + + + This function performs the triangular matrix-matrix multiplication C = alpha*Op(A) * B if side==SideMode.Left or C = alpha*B * Op(A) if side==SideMode.Right + where A is a triangular matrix stored in lower or upper mode with or without the main diagonal, B and C are m*n matrices, and alpha is a scalar. + Notice that in order to achieve better parallelism CUBLAS differs from the BLAS API only for this routine. The BLAS API assumes an in-place implementation (with results + written back to B), while the CUBLAS API assumes an out-of-place implementation (with results written into C). The application can obtain the in-place functionality of BLAS in + the CUBLAS API by passing the address of the matrix B in place of the matrix C. No other overlapping in the input parameters is supported. + + indicates if matrix A is on the left or right of B. + indicates if matrix A lower or upper part is stored, the other part is not referenced and is inferred from the stored elements. + operation op(A) that is non- or (conj.) transpose. + indicates if the elements on the main diagonal of matrix A are unity and should not be accessed. + number of rows of matrix B, with matrix A sized accordingly. + number of columns of matrix B, with matrix A sized accordingly. + scalar used for multiplication. If alpha==0 then A is not referenced and B does not have to be a valid input. + array of dimensions lda * m. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb * n. + leading dimension of two-dimensional array used to store matrix B. + array of dimensions ldc * n. + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
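A common use of the geam overloads documented here is an out-of-place transpose on the GPU; a minimal fragment (same assumed setup, A m x n, C n x m):

    /* C = 1.0 * A^T + 0.0 * B, i.e. transpose A into C; B is not read
       because beta == 0, but a valid pointer must still be supplied. */
    const double alpha = 1.0, beta = 0.0;
    cublasDgeam(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                n, m, &alpha, d_A, m, &beta, d_B, n, d_C, n);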
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix addition/transposition C = alpha * Op(A) + beta * Op(B) where + alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions + op(A) m*n, op(B) m*n and C m*n, respectively. + + operation op(A) that is non- or (conj.) transpose. + operation op(B) that is non- or (conj.) transpose. + number of rows of matrix op(A) and C. + number of columns of matrix op(B) and C. + scalar used for multiplication. + array of dimensions lda x n with lda>=max(1,m) if transa == CUBLAS_OP_N and lda x m with lda>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix A. + array of dimensions ldb x n with ldb>=max(1,m) if transb == CUBLAS_OP_N and ldb x m with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store matrix B. + scalar used for multiplication. + array of dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store matrix C.
+ + + + This function performs the matrix-matrix multiplication C = A x diag(X) if mode == CUBLAS_SIDE_RIGHT, or + C = diag(X) x A if mode == CUBLAS_SIDE_LEFT, + where A and C are matrices stored in column-major format with dimensions m*n. X is a + vector of size n if mode == CUBLAS_SIDE_RIGHT and of size m if mode == + CUBLAS_SIDE_LEFT. X is gathered from one-dimensional array x with stride incx. The + absolute value of incx is the stride and the sign of incx is the direction of the stride. If incx + is positive, x is traversed forward from the first element; otherwise, it is traversed backward from the + last element. + + left multiply if mode == CUBLAS_SIDE_LEFT + or right multiply if mode == CUBLAS_SIDE_RIGHT + number of rows of matrix A and C. + number of columns of matrix A and C. + array of dimensions lda x n with lda >= max(1,m) + leading dimension of two-dimensional array used to store the matrix A. + one-dimensional array of size |incx|*m + if mode == CUBLAS_SIDE_LEFT and |incx|*n + if mode == CUBLAS_SIDE_RIGHT + stride of one-dimensional array x. + array of dimensions ldc*n with ldc >= max(1,m). + leading dimension of a two-dimensional array used to store the matrix C.
+ + + + This function performs the matrix-matrix multiplication C = A x diag(X) if mode == CUBLAS_SIDE_RIGHT, or + C = diag(X) x A if mode == CUBLAS_SIDE_LEFT, + where A and C are matrices stored in column-major format with dimensions m*n. X is a + vector of size n if mode == CUBLAS_SIDE_RIGHT and of size m if mode == + CUBLAS_SIDE_LEFT. X is gathered from one-dimensional array x with stride incx. The + absolute value of incx is the stride and the sign of incx is the direction of the stride. If incx + is positive, x is traversed forward from the first element; otherwise, it is traversed backward from the + last element. + + left multiply if mode == CUBLAS_SIDE_LEFT + or right multiply if mode == CUBLAS_SIDE_RIGHT + number of rows of matrix A and C. + number of columns of matrix A and C. + array of dimensions lda x n with lda >= max(1,m) + leading dimension of two-dimensional array used to store the matrix A.
+ one-dimensional array of size |incx|*m + if mode == CUBLAS_SIDE_LEFT and |incx|*n + if mode == CUBLAS_SIDE_RIGHT + stride of one-dimensional array x. + array of dimensions ldc*n with ldc >= max(1,m). + leading dimension of a two-dimensional array used to store the matrix C.
+ + + + This function performs the matrix-matrix multiplication C = A x diag(X) if mode == CUBLAS_SIDE_RIGHT, or + C = diag(X) x A if mode == CUBLAS_SIDE_LEFT, + where A and C are matrices stored in column-major format with dimensions m*n. X is a + vector of size n if mode == CUBLAS_SIDE_RIGHT and of size m if mode == + CUBLAS_SIDE_LEFT. X is gathered from one-dimensional array x with stride incx. The + absolute value of incx is the stride and the sign of incx is the direction of the stride. If incx + is positive, x is traversed forward from the first element; otherwise, it is traversed backward from the + last element. + + left multiply if mode == CUBLAS_SIDE_LEFT + or right multiply if mode == CUBLAS_SIDE_RIGHT + number of rows of matrix A and C. + number of columns of matrix A and C. + array of dimensions lda x n with lda >= max(1,m) + leading dimension of two-dimensional array used to store the matrix A. + one-dimensional array of size |incx|*m + if mode == CUBLAS_SIDE_LEFT and |incx|*n + if mode == CUBLAS_SIDE_RIGHT + stride of one-dimensional array x. + array of dimensions ldc*n with ldc >= max(1,m). + leading dimension of a two-dimensional array used to store the matrix C.
+ + + + This function performs the matrix-matrix multiplication C = A x diag(X) if mode == CUBLAS_SIDE_RIGHT, or + C = diag(X) x A if mode == CUBLAS_SIDE_LEFT, + where A and C are matrices stored in column-major format with dimensions m*n. X is a + vector of size n if mode == CUBLAS_SIDE_RIGHT and of size m if mode == + CUBLAS_SIDE_LEFT. X is gathered from one-dimensional array x with stride incx. The + absolute value of incx is the stride and the sign of incx is the direction of the stride. If incx + is positive, x is traversed forward from the first element; otherwise, it is traversed backward from the + last element. + + left multiply if mode == CUBLAS_SIDE_LEFT + or right multiply if mode == CUBLAS_SIDE_RIGHT + number of rows of matrix A and C. + number of columns of matrix A and C. + array of dimensions lda x n with lda >= max(1,m) + leading dimension of two-dimensional array used to store the matrix A. + one-dimensional array of size |incx|*m + if mode == CUBLAS_SIDE_LEFT and |incx|*n + if mode == CUBLAS_SIDE_RIGHT + stride of one-dimensional array x. + array of dimensions ldc*n with ldc >= max(1,m). + leading dimension of a two-dimensional array used to store the matrix C.
+ + + + This function performs the matrix-matrix multiplications of an array of matrices: C[i] = alpha * Op(A[i]) * Op(B[i]) + beta * C[i], + where alpha and beta are scalars, and A, B and C are arrays of pointers to matrices stored + in column-major format with dimensions op(A[i]) m x k, op(B[i]) k x n and C[i] m x n, + respectively. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. For small sizes, typically smaller than 100x100, + this function significantly improves performance compared to making calls to its + corresponding cublas<t>gemm routine. However, on GPU architectures that support + concurrent kernels, it might be advantageous to make multiple calls to cublas<t>gemm + into different streams as the matrix sizes increase. + + operation op(A[i]) that is non- or (conj.) transpose. + operation op(B[i]) that is non- or (conj.) transpose. + number of rows of matrix op(A[i]) and C[i].
+ number of columns of op(B[i]) and C[i]. + number of columns of op(A[i]) and rows of op(B[i]). + scalar used for multiplication. + array of device pointers, with each array/device pointer of dim. lda x k with lda>=max(1,m) if + transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store each matrix A[i]. + array of device pointers, with each array of dim. ldb x n with ldb>=max(1,k) if + transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store each matrix B[i]. + scalar used for multiplication. If beta == 0, C does not have to be a valid input. + array of device pointers. It has dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix C[i]. + number of pointers contained in A, B and C.
+ + + + This function performs the matrix-matrix multiplications of an array of matrices: C[i] = alpha * Op(A[i]) * Op(B[i]) + beta * C[i], + where alpha and beta are scalars, and A, B and C are arrays of pointers to matrices stored + in column-major format with dimensions op(A[i]) m x k, op(B[i]) k x n and C[i] m x n, + respectively. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. For small sizes, typically smaller than 100x100, + this function significantly improves performance compared to making calls to its + corresponding cublas<t>gemm routine. However, on GPU architectures that support + concurrent kernels, it might be advantageous to make multiple calls to cublas<t>gemm + into different streams as the matrix sizes increase. + + operation op(A[i]) that is non- or (conj.) transpose. + operation op(B[i]) that is non- or (conj.) transpose. + number of rows of matrix op(A[i]) and C[i]. + number of columns of op(B[i]) and C[i]. + number of columns of op(A[i]) and rows of op(B[i]). + scalar used for multiplication. + array of device pointers, with each array/device pointer of dim. lda x k with lda>=max(1,m) if + transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store each matrix A[i]. + array of device pointers, with each array of dim. ldb x n with ldb>=max(1,k) if + transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store each matrix B[i]. + scalar used for multiplication. If beta == 0, C does not have to be a valid input. + array of device pointers. It has dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix C[i]. + number of pointers contained in A, B and C.
+ + + + This function performs the matrix-matrix multiplications of an array of matrices: C[i] = alpha * Op(A[i]) * Op(B[i]) + beta * C[i], + where alpha and beta are scalars, and A, B and C are arrays of pointers to matrices stored + in column-major format with dimensions op(A[i]) m x k, op(B[i]) k x n and C[i] m x n, + respectively. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. For small sizes, typically smaller than 100x100, + this function significantly improves performance compared to making calls to its + corresponding cublas<t>gemm routine. However, on GPU architectures that support + concurrent kernels, it might be advantageous to make multiple calls to cublas<t>gemm + into different streams as the matrix sizes increase. + + operation op(A[i]) that is non- or (conj.) transpose. + operation op(B[i]) that is non- or (conj.) transpose. + number of rows of matrix op(A[i]) and C[i]. + number of columns of op(B[i]) and C[i]. + number of columns of op(A[i]) and rows of op(B[i]). + scalar used for multiplication. + array of device pointers, with each array/device pointer of dim. lda x k with lda>=max(1,m) if + transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store each matrix A[i]. + array of device pointers, with each array of dim. ldb x n with ldb>=max(1,k) if + transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store each matrix B[i]. + scalar used for multiplication. If beta == 0, C does not have to be a valid input. + array of device pointers. It has dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix C[i]. + number of pointers contained in A, B and C.
+ + + + + + + + + This function performs the matrix-matrix multiplications of an array of matrices: C[i] = alpha * Op(A[i]) * Op(B[i]) + beta * C[i], + where alpha and beta are scalars, and A, B and C are arrays of pointers to matrices stored + in column-major format with dimensions op(A[i]) m x k, op(B[i]) k x n and C[i] m x n, + respectively. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. For small sizes, typically smaller than 100x100, + this function significantly improves performance compared to making calls to its + corresponding cublas<t>gemm routine. However, on GPU architectures that support + concurrent kernels, it might be advantageous to make multiple calls to cublas<t>gemm + into different streams as the matrix sizes increase. + + operation op(A[i]) that is non- or (conj.) transpose. + operation op(B[i]) that is non- or (conj.) transpose. + number of rows of matrix op(A[i]) and C[i]. + number of columns of op(B[i]) and C[i]. + number of columns of op(A[i]) and rows of op(B[i]). + scalar used for multiplication. + pointer to the A matrix corresponding to the first instance of the batch, of dim. lda x k with lda>=max(1,m) if + transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store each matrix A[i]. + value of type long long int that gives the address offset between A[i] and A[i+1]. + value of type long long int that gives the address offset between B[i] and B[i+1]. + value of type long long int that gives the address offset between C[i] and C[i+1]. + pointer to the B matrix corresponding to the first instance of the batch, of dim. ldb x n with ldb>=max(1,k) if + transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise. + leading dimension of two-dimensional array used to store each matrix B[i]. + scalar used for multiplication. If beta == 0, C does not have to be a valid input. + pointer to the C matrix corresponding to the first instance of the batch. It has dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix C[i]. + number of GEMMs to perform in the batch.
+ + + + + + + + + This function performs the matrix-matrix multiplications of an array of matrices: C[i] = alpha * Op(A[i]) * Op(B[i]) + beta * C[i], + where alpha and beta are scalars, and A, B and C are arrays of pointers to matrices stored + in column-major format with dimensions op(A[i]) m x k, op(B[i]) k x n and C[i] m x n, + respectively. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor.
For small sizes, typically smaller than 100x100, + this function improves significantly performance compared to making calls to its + corresponding cublas]]>gemm routine. However, on GPU architectures that support + concurrent kernels, it might be advantageous to make multiple calls to cublas]]>gemm + into different streams as the matrix sizes increase. + + operation op(A[i]) that is non- or (conj.) transpose. + operation op(B[i]) that is non- or (conj.) transpose. + number of rows of matrix op(A[i]) and C[i]. + number of columns of op(B[i]) and C[i]. + number of columns of op(A[i]) and rows of op(B[i]). + scalar used for multiplication. + array of device pointers, with each array/device pointer of dim. lda x k with lda>=max(1,m) if + transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store each matrix A[i]. + array of device pointers, with each array of dim. ldb x n with ldb>=max(1,k) if + transa==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) max(1,) otherwise. + leading dimension of two-dimensional array used to store each matrix B[i]. + scalar used for multiplication. If beta == 0, C does not have to be a valid input. + array of device pointers. It has dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix C[i]. + number of pointers contained in A, B and C. + + + + This function performs the matrix-matrix multiplications of an array of matrices. + where and are scalars, and , and are arrays of pointers to matrices stored + in column-major format with dimensions op(A[i])m x k, op(B[i])k x n and op(C[i])m x n, + respectively. + This function is intended to be used for matrices of small sizes where the launch + overhead is a significant factor. For small sizes, typically smaller than 100x100, + this function improves significantly performance compared to making calls to its + corresponding cublas]]>gemm routine. However, on GPU architectures that support + concurrent kernels, it might be advantageous to make multiple calls to cublas]]>gemm + into different streams as the matrix sizes increase. + + operation op(A[i]) that is non- or (conj.) transpose. + operation op(B[i]) that is non- or (conj.) transpose. + number of rows of matrix op(A[i]) and C[i]. + number of columns of op(B[i]) and C[i]. + number of columns of op(A[i]) and rows of op(B[i]). + scalar used for multiplication. + array of device pointers, with each array/device pointer of dim. lda x k with lda>=max(1,m) if + transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise. + leading dimension of two-dimensional array used to store each matrix A[i]. + array of device pointers, with each array of dim. ldb x n with ldb>=max(1,k) if + transa==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) max(1,) otherwise. + leading dimension of two-dimensional array used to store each matrix B[i]. + scalar used for multiplication. If beta == 0, C does not have to be a valid input. + array of device pointers. It has dimensions ldc x n with ldc>=max(1,m). + leading dimension of two-dimensional array used to store each matrix C[i]. + number of pointers contained in A, B and C. + + + + This function performs the complex matrix-matrix multiplication, using Gauss complexity reduction algorithm. This can lead to an increase in performance up to 25% + C = a op(A ) op(B ) + C + where a and b are scalars, and A , B and C are matrices stored in column-major format with dimensions op(A ) m k, op ( B ) k n and C m n, respectively. 
+
+
+ This function performs the complex matrix-matrix multiplication using the Gauss complexity reduction algorithm, which can lead to an increase in performance of up to 25%:
+ C = alpha * op(A) * op(B) + beta * C,
+ where alpha and beta are scalars, and A, B and C are matrices stored in column-major format with dimensions op(A) m x k, op(B) k x n and C m x n, respectively. Also, for matrix A:
+ op(A) = A if transa == CUBLAS_OP_N, A^T if transa == CUBLAS_OP_T, A^H if transa == CUBLAS_OP_C,
+ and op(B) is defined similarly for matrix B.
+ Note: These 2 routines are only supported on GPUs with architecture capabilities equal to or greater than 5.0.
+
+ operation op(A[i]) that is non- or (conj.) transpose.
+ operation op(B[i]) that is non- or (conj.) transpose.
+ number of rows of matrix op(A[i]) and C[i].
+ number of columns of op(B[i]) and C[i].
+ number of columns of op(A[i]) and rows of op(B[i]).
+ scalar used for multiplication.
+ array of device pointers, with each device pointer of dim. lda x k with lda>=max(1,m) if
+ transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.
+ leading dimension of two-dimensional array used to store each matrix A[i].
+ array of device pointers, with each device pointer of dim. ldb x n with ldb>=max(1,k) if
+ transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.
+ leading dimension of two-dimensional array used to store each matrix B[i].
+ scalar used for multiplication. If beta == 0, C does not have to be a valid input.
+ array of device pointers. Each has dimensions ldc x n with ldc>=max(1,m).
+ leading dimension of two-dimensional array used to store each matrix C[i].
+ number of pointers contained in A, B and C.
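The "Gauss complexity reduction" referred to above is the classic 3M scheme: an ordinary complex product takes four real multiplications, but three suffice, and applied blockwise to a complex GEMM this replaces four real matrix products with three, consistent with the up-to-25% figure. A scalar-level sketch of the identity (the class name is illustrative):

using System.Numerics;

public static class Gauss3M
{
    // Three real multiplications instead of four:
    //   p1 = ar * br, p2 = ai * bi, p3 = (ar + ai) * (br + bi)
    //   real part = p1 - p2, imaginary part = p3 - p1 - p2
    public static Complex Multiply3M(Complex x, Complex y)
    {
        var p1 = x.Real * y.Real;
        var p2 = x.Imaginary * y.Imaginary;
        var p3 = (x.Real + x.Imaginary) * (y.Real + y.Imaginary);
        return new Complex(p1 - p2, p3 - p1 - p2);
    }
}

The extra additions can make the scheme slightly less accurate componentwise than the plain four-multiplication form, which is presumably why it is exposed as a separate routine rather than the default.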
+
+
+ This function performs the matrix-matrix multiplication of a batch of matrices. The batch is considered to be "uniform",
+ i.e. all instances have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (transa, transb)
+ for their respective A, B and C matrices. Input matrices A, B and output matrix C for each instance of the batch are located
+ at fixed address offsets from their locations in the previous instance. Pointers to the A, B and C matrices for the first
+ instance are passed to the function by the user, along with the address offsets - strideA, strideB and strideC - that determine
+ the locations of the input and output matrices in subsequent instances.
+
+ operation op(A[i]) that is non- or (conj.) transpose.
+ operation op(B[i]) that is non- or (conj.) transpose.
+ number of rows of matrix op(A[i]) and C[i].
+ number of columns of op(B[i]) and C[i].
+ number of columns of op(A[i]) and rows of op(B[i]).
+ scalar used for multiplication.
+ pointer to the A matrix corresponding to the first instance of the batch, with dimensions lda x k with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.
+ leading dimension of two-dimensional array used to store each matrix A[i].
+ value of type long long int that gives the address offset between A[i] and A[i+1].
+ pointer to the B matrix corresponding to the first instance of the batch, with dimensions ldb x n with ldb>=max(1,k) if transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.
+ leading dimension of two-dimensional array used to store each matrix B[i].
+ value of type long long int that gives the address offset between B[i] and B[i+1].
+ scalar used for multiplication. If beta == 0, C does not have to be a valid input.
+ pointer to the C matrix corresponding to the first instance of the batch, with dimensions ldc x n with ldc>=max(1,m).
+ leading dimension of two-dimensional array used to store each matrix C[i].
+ value of type long long int that gives the address offset between C[i] and C[i+1].
+ number of GEMMs to perform in the batch.
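The stride arguments are plain element offsets: instance i of an operand starts at base + i * stride. A small C# sketch under the assumption of fully packed, column-major instances (an illustrative helper, not part of the library; any stride large enough to keep instances disjoint is equally valid):

public static class StridedBatchLayout
{
    // Start of instance i, as an element offset from the base pointer.
    public static long InstanceOffset(int i, long stride) => i * stride;

    // Packed defaults: A occupies lda x k elements (lda x m when transposed),
    // B occupies ldb x n (ldb x k when transposed), C occupies ldc x n.
    public static (long strideA, long strideB, long strideC) PackedStrides(
        int m, int n, int k, int lda, int ldb, int ldc, bool transA, bool transB)
    {
        var strideA = (long)lda * (transA ? m : k);
        var strideB = (long)ldb * (transB ? k : n);
        var strideC = (long)ldc * n;
        return (strideA, strideB, strideC);
    }
}

Compared to the array-of-pointers variant, no pointer array has to be built and copied to the device, which is why the strided form is usually preferred when the batch really is uniform.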
- This function find the least squares solution of a batch of overdetermined systems.
- On exit, each Aarray[i] is overwritten with their QR factorization and each Carray[i] is overwritten with the least square solution
- GelsBatched supports only the non-transpose operation and only solves overdetermined
- systems (m >= n).
- GelsBatched only supports compute capability 2.0 or above.
+ This function performs the matrix-matrix multiplication of a batch of matrices. The batch is considered to be "uniform",
+ i.e. all instances have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (transa, transb)
+ for their respective A, B and C matrices. Input matrices A, B and output matrix C for each instance of the batch are located
+ at fixed address offsets from their locations in the previous instance. Pointers to the A, B and C matrices for the first
+ instance are passed to the function by the user, along with the address offsets - strideA, strideB and strideC - that determine
+ the locations of the input and output matrices in subsequent instances.
+
+ operation op(A[i]) that is non- or (conj.) transpose.
+ operation op(B[i]) that is non- or (conj.) transpose.
+ number of rows of matrix op(A[i]) and C[i].
+ number of columns of op(B[i]) and C[i].
+ number of columns of op(A[i]) and rows of op(B[i]).
+ scalar used for multiplication.
+ pointer to the A matrix corresponding to the first instance of the batch, with dimensions lda x k with lda>=max(1,m) if transa==CUBLAS_OP_N and lda x m with lda>=max(1,k) otherwise.
+ leading dimension of two-dimensional array used to store each matrix A[i].
+ value of type long long int that gives the address offset between A[i] and A[i+1].
+ pointer to the B matrix corresponding to the first instance of the batch, with dimensions ldb x n with ldb>=max(1,k) if transb==CUBLAS_OP_N and ldb x k with ldb>=max(1,n) otherwise.
+ leading dimension of two-dimensional array used to store each matrix B[i].
+ value of type long long int that gives the address offset between B[i] and B[i+1].
+ scalar used for multiplication. If beta == 0, C does not have to be a valid input.
+ pointer to the C matrix corresponding to the first instance of the batch, with dimensions ldc x n with ldc>=max(1,m).
+ leading dimension of two-dimensional array used to store each matrix C[i].
+ value of type long long int that gives the address offset between C[i] and C[i+1].
+ number of GEMMs to perform in the batch.
+
+
+ This function performs the matrix-matrix multiplication on groups of matrices. A given group is considered to be "uniform",
+ i.e. all instances have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (transa, transb)
+ for their respective A, B and C matrices. However, the dimensions, leading dimensions, transpositions and scaling factors
+ (alpha, beta) may vary between groups. The addresses of the input matrices and the output matrix of each instance of the batch
+ are read from arrays of pointers passed to the function by the caller.
+
+ array containing the operation op(A[idx]), non- or (conj.) transpose, for each group.
+ array containing the operation op(B[idx]), non- or (conj.) transpose, for each group.
+ array containing the number of rows of matrix op(A[idx]) and C[idx] for each group.
+ array containing the number of columns of op(B[idx]) and C[idx] for each group.
+ array containing the number of columns of op(A[idx]) and rows of op(B[idx]) for each group.
+ array containing the scalar used for multiplication for each group.
+ array of device pointers, with each device pointer of dim. lda[i] x k[i] with lda[i]>=max(1,m[i]) if transa[i]==CUBLAS_OP_N and lda[i] x m[i] with lda[i]>=max(1,k[i]) otherwise.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ array containing the leading dimensions of the two-dimensional arrays used to store each matrix A[idx] for each group.
+ array of device pointers, with each device pointer of dim. ldb[i] x n[i] with ldb[i]>=max(1,k[i]) if transb[i]==CUBLAS_OP_N and ldb[i] x k[i] with ldb[i]>=max(1,n[i]) otherwise.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ array containing the leading dimensions of the two-dimensional arrays used to store each matrix B[idx] for each group.
+ array containing the scalar used for multiplication for each group.
+ array of device pointers. Each has dimensions ldc[i] x n[i] with ldc[i]>=max(1,m[i]). Matrices C[idx] should not overlap; otherwise, undefined behavior is expected.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ array containing the leading dimensions of the two-dimensional arrays used to store each matrix C[idx] for each group.
+ number of groups.
+ array containing the number of pointers contained in Aarray, Barray and Carray for each group.
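Semantically, a grouped call behaves like one batched GEMM per group, with the per-problem pointer arrays concatenated across all groups. A C# sketch of that expansion, reusing the hypothetical CPU reference above (transposes again omitted):

public static class GemmGroupedReference
{
    // Each group g contributes groupSize[g] problems; problem p within group g
    // uses group g's dimensions, scalars and leading dimensions.
    public static void GemmGrouped(
        int groupCount, int[] groupSize,
        int[] m, int[] n, int[] k,
        double[] alpha, double[] beta,
        double[][] aArray, int[] lda,
        double[][] bArray, int[] ldb,
        double[][] cArray, int[] ldc)
    {
        var offset = 0; // index of the first problem of the current group
        for (var g = 0; g < groupCount; g++)
        {
            for (var i = 0; i < groupSize[g]; i++)
            {
                var p = offset + i;
                GemmBatchedReference.GemmBatched(
                    m[g], n[g], k[g], alpha[g],
                    new[] { aArray[p] }, lda[g],
                    new[] { bArray[p] }, ldb[g],
                    beta[g],
                    new[] { cArray[p] }, ldc[g],
                    batchCount: 1);
            }
            offset += groupSize[g];
        }
    }
}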
+
+
+ This function performs the matrix-matrix multiplication on groups of matrices. A given group is considered to be "uniform",
+ i.e. all instances have the same dimensions (m, n, k), leading dimensions (lda, ldb, ldc) and transpositions (transa, transb)
+ for their respective A, B and C matrices. However, the dimensions, leading dimensions, transpositions and scaling factors
+ (alpha, beta) may vary between groups. The addresses of the input matrices and the output matrix of each instance of the batch
+ are read from arrays of pointers passed to the function by the caller.
+
+ array containing the operation op(A[idx]), non- or (conj.) transpose, for each group.
+ array containing the operation op(B[idx]), non- or (conj.) transpose, for each group.
+ array containing the number of rows of matrix op(A[idx]) and C[idx] for each group.
+ array containing the number of columns of op(B[idx]) and C[idx] for each group.
+ array containing the number of columns of op(A[idx]) and rows of op(B[idx]) for each group.
+ array containing the scalar used for multiplication for each group.
+ array of device pointers, with each device pointer of dim. lda[i] x k[i] with lda[i]>=max(1,m[i]) if transa[i]==CUBLAS_OP_N and lda[i] x m[i] with lda[i]>=max(1,k[i]) otherwise.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ Enumerant specifying the datatype of A.
+ array containing the leading dimensions of the two-dimensional arrays used to store each matrix A[idx] for each group.
+ array of device pointers, with each device pointer of dim. ldb[i] x n[i] with ldb[i]>=max(1,k[i]) if transb[i]==CUBLAS_OP_N and ldb[i] x k[i] with ldb[i]>=max(1,n[i]) otherwise.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ array containing the leading dimensions of the two-dimensional arrays used to store each matrix B[idx] for each group.
+ array containing the scalar used for multiplication for each group.
+ Enumerant specifying the datatype of B.
+ array of device pointers. Each has dimensions ldc[i] x n[i] with ldc[i]>=max(1,m[i]). Matrices C[idx] should not overlap; otherwise, undefined behavior is expected.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ array containing the leading dimensions of the two-dimensional arrays used to store each matrix C[idx] for each group.
+ Enumerant specifying the datatype of C.
+ number of groups.
+ array containing the number of pointers contained in Aarray, Barray and Carray for each group.
+ Enumerant specifying the computation type.
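One easy mistake with the grouped variants is the bookkeeping: every per-group array (transposes, dimensions, scalars, leading dimensions and, for this overload, datatypes) needs exactly one entry per group, while the three pointer arrays need one entry per problem, i.e. the sum of the group sizes. A hypothetical validation helper:

using System;
using System.Linq;

public static class GroupedBatchShapes
{
    // Returns the total number of problems; throws if the per-group arrays or the
    // operand pointer arrays are not sized as described above.
    public static int TotalProblems(int groupCount, int[] groupSize,
        int[] m, int[] n, int[] k, int pointerCount)
    {
        if (groupSize.Length != groupCount || m.Length != groupCount ||
            n.Length != groupCount || k.Length != groupCount)
        {
            throw new ArgumentException("Per-group arrays must have exactly groupCount entries.");
        }

        var total = groupSize.Sum(); // Aarray, Barray and Carray each need this many pointers
        if (pointerCount != total)
        {
            throw new ArgumentException($"Expected {total} device pointers per operand, got {pointerCount}.");
        }

        return total;
    }
}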
+ array containing the number of rows of matrix op(A[idx]) and C[idx] for each group.
+ array containing the number of columns of op(B[idx]) and C[idx] for each group.
+ array containing the number of columns of op(A[idx]) and rows of op(B[idx]) for each group.
+ array containing the scalar used for multiplication for each group.
+ array of device pointers, with each array/device pointer of dim. lda[i] x k[i] with lda[i]>=max(1,m[i]) if transa[i]==CUBLAS_OP_N and lda[i] x m[i] with lda[i]>=max(1,k[i]) otherwise.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ Enumerant specifying the datatype of A.
+ array containing the leading dimensions of two-dimensional arrays used to store each matrix A[idx] for each group.
+ array of device pointers, with each array of dim. ldb[i] x n[i] with ldb[i]>=max(1,k[i]) if transb[i]==CUBLAS_OP_N and ldb[i] x k[i] with ldb[i]>=max(1,n[i]) otherwise.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ array containing the leading dimensions of two-dimensional arrays used to store each matrix B[idx] for each group.
+ array containing the scalar used for multiplication for each group.
+ Enumerant specifying the datatype of B.
+ array of device pointers. It has dimensions ldc[i] x n[i] with ldc[i]>=max(1,m[i]). Matrices C[idx] should not overlap; otherwise, undefined behavior is expected.
+ All pointers must meet certain alignment criteria. Please see below for details.
+ array containing the leading dimensions of two-dimensional arrays used to store each matrix C[idx] for each group.
+ Enumerant specifying the datatype of C.
+ number of groups.
+ array containing the number of pointers contained in Aarray, Barray and Carray for each group.
+ Enumerant specifying the computation type.
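As a rough illustration of how these grouped parameters line up: per-group arrays (transa, m, n, k, alpha, beta, lda/ldb/ldc, groupSize) have groupCount entries, while the pointer arrays have one entry per problem instance, flattened group by group. The GemmGroupedBatched wrapper name and exact signature below are assumptions, not read from this diff; treat this as a sketch of the layout, not a confirmed ManagedCuda API.

```csharp
using System.Collections.Generic;
using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.CudaBlas;

var ctx = new CudaContext(0);
var blas = new CudaBlas();

// Two groups: 3 instances of 8x8 GEMMs, then 2 instances of 16x16 GEMMs.
int groupCount = 2;
int[] m = { 8, 16 }, n = { 8, 16 }, k = { 8, 16 };
int[] groupSize = { 3, 2 };
Operation[] transa = { Operation.NonTranspose, Operation.NonTranspose };
Operation[] transb = { Operation.NonTranspose, Operation.NonTranspose };
float[] alpha = { 1f, 1f }, beta = { 0f, 0f };

// One device buffer per matrix instance; pointer lists are flattened group by group.
var aPtrs = new List<CUdeviceptr>(); var bPtrs = new List<CUdeviceptr>(); var cPtrs = new List<CUdeviceptr>();
for (int g = 0; g < groupCount; g++)
{
    for (int i = 0; i < groupSize[g]; i++)
    {
        aPtrs.Add(new CudaDeviceVariable<float>(m[g] * k[g]).DevicePointer);
        bPtrs.Add(new CudaDeviceVariable<float>(k[g] * n[g]).DevicePointer);
        cPtrs.Add(new CudaDeviceVariable<float>(m[g] * n[g]).DevicePointer);
    }
}

// As with GemmBatched, the pointer arrays themselves are assumed to live in device memory.
var aArray = new CudaDeviceVariable<CUdeviceptr>(aPtrs.Count); aArray.CopyToDevice(aPtrs.ToArray());
var bArray = new CudaDeviceVariable<CUdeviceptr>(bPtrs.Count); bArray.CopyToDevice(bPtrs.ToArray());
var cArray = new CudaDeviceVariable<CUdeviceptr>(cPtrs.Count); cArray.CopyToDevice(cPtrs.ToArray());

// Hypothetical wrapper call mirroring cublasGemmGroupedBatchedEx; column-major,
// so lda = m, ldb = k, ldc = m for the non-transposed case.
blas.GemmGroupedBatched(transa, transb, m, n, k, alpha, aArray, m,
                        bArray, k, beta, cArray, m, groupCount, groupSize);
```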
- This function finds the least squares solution of a batch of overdetermined systems.
- On exit, each Aarray[i] is overwritten with its QR factorization and each Carray[i] is overwritten with the least squares solution.
- GelsBatched supports only the non-transpose operation and only solves overdetermined systems (m >= n).
- GelsBatched only supports compute capability 2.0 or above.
- operation op(Aarray[i]) that is non- or (conj.) transpose. Only non-transpose operation is currently supported.
- number of rows of Aarray[i].
- number of columns of each Aarray[i] and rows of each Carray[i].
- number of columns of each Carray[i].
- array of pointers to device array, with each array of dim. m x n with lda>=max(1,m). The array size determines the number of batches.
- leading dimension of two-dimensional array used to store each matrix Aarray[i].
- array of pointers to device array, with each array of dim. m x n with ldc>=max(1,m).
- leading dimension of two-dimensional array used to store each matrix Carray[i].
- null or optional array of integers of dimension batchsize.
- 0, if the parameters passed to the function are valid, <0, if the parameter in position -value is invalid
- This function solves an array of systems of linear equations of the form:
- op(A[i]) X[i] = alpha B[i]
- where each A[i] is a matrix that has been LU-factorized with pivoting, and X[i] and B[i] are n x nrhs matrices.
- operation op(A) that is non- or (conj.) transpose.
- number of rows and columns of Aarray[i].
- number of columns of Barray[i].
- array of pointers to array, with each array of dim. n x n with lda>=max(1,n).
- leading dimension of two-dimensional array used to store each matrix Aarray[i].
- array of size n x batchSize that contains the pivoting sequence of each factorization of Aarray[i] stored in a linear fashion. If devIpiv is nil, pivoting for all Aarray[i] is ignored.
- array of pointers to array, with each array of dim. n x nrhs with ldb>=max(1,n).
- leading dimension of two-dimensional array used to store each solution matrix Barray[i].
- number of pointers contained in A
- If info=0, the execution is successful. If info = -j, the j-th parameter had an illegal value.
+ This function solves an array of triangular linear systems with multiple right-hand sides.
+ The solution overwrites the right-hand sides on exit.
+ No test for singularity or near-singularity is included in this function.
This function is intended to be used for matrices of small sizes where the launch
- overhead is a significant factor.
+ overhead is a significant factor. The current implementation limits the dimensions m and n to 32.
+ indicates if matrix A[i] is on the left or right of X[i].
+ indicates if matrix A[i] lower or upper part is stored; the other part is not referenced and is inferred from the stored elements.
+ operation op(A[i]) that is non- or (conj.) transpose.
+ indicates if the elements on the main diagonal of matrix A[i] are unity and should not be accessed.
+ number of rows of matrix B[i], with matrix A[i] sized accordingly.
+ number of columns of matrix B[i], with matrix A[i] sized accordingly.
+ scalar used for multiplication; if alpha==0 then A[i] is not referenced and B[i] does not have to be a valid input.
+ array of device pointers, with each array/device pointer of dim. lda x m with lda>=max(1,m) if side==CUBLAS_SIDE_LEFT and lda x n with lda>=max(1,n) otherwise.
+ leading dimension of two-dimensional array used to store matrix A[i].
+ array of device pointers, with each array/device pointer of dim. ldb x n with ldb>=max(1,m).
+ leading dimension of two-dimensional array used to store matrix B[i].
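A rough usage sketch of the batched triangular solve described above. The TrsmBatched method name and enum spellings are assumptions about the ManagedCuda wrapper; the layout rules (column-major storage, an m x m triangular A[i] when it sits on the left) follow the cuBLAS documentation.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.CudaBlas;

var ctx = new CudaContext(0);
var blas = new CudaBlas();

int m = 4, n = 2, batch = 8;
float alpha = 1f;

// One lower-triangular A[i] (m x m) and one right-hand side B[i] (m x n) per instance.
var aPtrs = new CUdeviceptr[batch];
var bPtrs = new CUdeviceptr[batch];
for (int i = 0; i < batch; i++)
{
    aPtrs[i] = new CudaDeviceVariable<float>(m * m).DevicePointer;
    bPtrs[i] = new CudaDeviceVariable<float>(m * n).DevicePointer; // overwritten with X[i]
}
var aArray = new CudaDeviceVariable<CUdeviceptr>(batch); aArray.CopyToDevice(aPtrs);
var bArray = new CudaDeviceVariable<CUdeviceptr>(batch); bArray.CopyToDevice(bPtrs);

// Hypothetical wrapper mirroring cublasStrsmBatched:
// solves op(A[i]) * X[i] = alpha * B[i]; X[i] overwrites B[i] in place.
blas.TrsmBatched(SideMode.Left, FillMode.Lower, Operation.NonTranspose,
                 DiagType.NonUnit, m, n, alpha, aArray, m, bArray, m, batch);
```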
copies elements from a vector hostSourceVector in CPU memory space to a vector devDestVector in GPU memory space. Storage spacing between consecutive elements
@@ -8123,7 +15997,7 @@
Destination vector in device memory
copies elements from a vector devSourceVector in GPU memory space to a vector hostDestVector in CPU memory space. Storage spacing between consecutive elements
@@ -8140,7 +16014,7 @@
Destination vector in host memory
copies a tile of rows x cols elements from a matrix hostSource in CPU memory space to a matrix devDest in GPU memory space. Both matrices are assumed to be stored in column
@@ -8156,7 +16030,7 @@
copies a tile of rows x cols elements from a matrix devSource in GPU memory space to a matrix hostDest in CPU memory space. Both matrices are assumed to be stored in column
@@ -8172,7 +16046,7 @@
copies elements from a vector hostSourceVector in CPU memory space to a vector devDestVector in GPU memory space. Storage spacing between consecutive elements
@@ -8190,7 +16064,7 @@
copies elements from a vector devSourceVector in GPU memory space to a vector hostDestVector in CPU memory space. Storage spacing between consecutive elements
@@ -8208,7 +16082,7 @@
copies a tile of rows x cols elements from a matrix hostSource in CPU memory space to a matrix devDest in GPU memory space. Both matrices are assumed to be stored in column
@@ -8225,7 +16099,7 @@
copies a tile of rows x cols elements from a matrix devSource in GPU memory space to a matrix hostDest in CPU memory space.
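The host-to-device / device-to-host round trip these members describe can also be expressed with ManagedCuda's CudaDeviceVariable, which this codebase already uses; a minimal sketch (the pattern is standard, only its equivalence to the specific cuBLAS Set/GetVector overloads above is an assumption):

```csharp
using ManagedCuda;

var ctx = new CudaContext(0);                 // pick GPU 0
float[] host = { 1f, 2f, 3f, 4f };

var dev = new CudaDeviceVariable<float>(host.Length);
dev.CopyToDevice(host);                       // hostSourceVector -> devDestVector

var roundTrip = new float[host.Length];
dev.CopyToHost(roundTrip);                    // devSourceVector -> hostDestVector

dev.Dispose();
ctx.Dispose();
```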
Both matrices are assumed to be stored in column
@@ -8315,6 +16189,11 @@
the upper part of the matrix is filled
+ Full
The DiagType type indicates whether the main diagonal of the dense matrix is
@@ -8374,6 +16253,16 @@
the conjugate transpose operation is selected
+ synonym of ConjugateTranspose
+ the conjugate operation is selected
The PointerMode type indicates whether the scalar values are passed by
@@ -8602,6 +16491,85 @@
+ same as using matching _PEDANTIC compute type when using cublas routine calls or cublasEx() calls with cudaDataType as compute type
+ allow accelerating single precision routines using TF32 tensor cores
+ flag to force any reductions to use the accumulator type and not output type in case of mixed precision routines with lower size output type
+ Enum for compute type
+ - default types provide best available performance using all available hardware features and guarantee internal storage precision with at least the same precision and range;
+ - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format;
+ - _FAST types allow for some loss of precision to enable higher throughput arithmetic.
+ half - default
+ half - pedantic
+ float - default
+ float - pedantic
+ float - fast, allows down-converting inputs to half or TF32
+ float - fast, allows down-converting inputs to bfloat16 or TF32
+ float - fast, allows down-converting inputs to TF32
+ double - default
+ double - pedantic
+ signed 32-bit int - default
+ signed 32-bit int - pedantic
The cublasDataType_t type is an enumerant to specify the data precision. It is used
diff --git a/src/external/ManagedCuda/CudaBlas.dll b/src/external/ManagedCuda/CudaBlas.dll
index 8787bc7e..636cf5cc 100644
Binary files a/src/external/ManagedCuda/CudaBlas.dll and b/src/external/ManagedCuda/CudaBlas.dll differ
diff --git a/src/external/ManagedCuda/CudaDNN.XML b/src/external/ManagedCuda/CudaDNN.XML
index 63dbd98d..6fcc2b3e 100644
--- a/src/external/ManagedCuda/CudaDNN.XML
+++ b/src/external/ManagedCuda/CudaDNN.XML
@@ -55,6 +55,101 @@
mode is set to CUDNN_ACTIVATION_CLIPPED_RELU or to specify the alpha coefficient when the activation
mode is set to CUDNN_ACTIVATION_ELU.
+ An opaque structure holding the description of an activation operation.
+ For dispose
+ Dispose
+ For IDisposable
+ Returns the inner handle.
+ For dispose
+ Dispose
+ For IDisposable
+ Returns the inner handles.
@@ -200,6 +295,11 @@
This function allows the user to specify the number of groups to be used in the associated convolution.
+ This function allows the user to specify the number of groups to be used in the associated convolution.
@@ -236,6 +336,11 @@
Math precision.
This function returns the ctc costs and gradients, given the probabilities and labels.
@@ -282,6 +387,16 @@
Amount of GPU memory needed as workspace to be able to execute the CTC loss computation with the specified algo.
An opaque structure holding the cuDNN library context.
@@ -798,6 +913,44 @@
Pointer to data of the tensor described by the dxDesc descriptor.
Data pointer to GPU memory used by this function. It is expected that the contents of reserveSpace do not change between cudnnDropoutForward and cudnnDropoutBackward calls.
+ This function attempts all available cuDNN algorithms for cudnnConvolutionForward, using
+ user-allocated GPU memory, and outputs performance metrics to a user-allocated array of
+ cudnnConvolutionFwdAlgoPerf_t. These metrics are written in sorted fashion where the first
+ element has the lowest compute time. The workspace size should be the largest workspace you
+ can spare in device memory; the size of this workspace will determine the availability of
+ the convolution algorithms.
+ Handle to the previously initialized input tensor descriptor.
+ Data pointer to GPU memory associated with the tensor descriptor xDesc.
+ Handle to a previously initialized filter descriptor.
+ Data pointer to GPU memory associated with the filter descriptor wDesc.
+ Previously initialized convolution descriptor.
+ Handle to the previously initialized output tensor descriptor.
+ Data pointer to GPU memory associated with the tensor descriptor yDesc. The content of this tensor will be overwritten with arbitrary values.
+ The maximum number of elements to be stored in perfResults.
+ Data pointer to GPU memory that is a necessary workspace for some algorithms. The size of this workspace will determine the availability of algorithms. A nil pointer is considered a workSpace of 0 bytes.
+ This function attempts all cuDNN algorithms for cudnnConvolutionBackwardFilter,
+ using user-allocated GPU memory, and outputs performance metrics to a
+ user-allocated array of cudnnConvolutionBwdFilterAlgoPerf_t. These metrics are
+ written in sorted fashion where the first element has the lowest compute time. The
+ workspace size should be the largest workspace you can spare in device memory; the
+ size of this workspace will determine the availability of convolution algorithms.
+ Handle to the previously initialized input tensor descriptor.
+ Data pointer to GPU memory associated with the filter descriptor xDesc.
+ Handle to the previously initialized input differential tensor descriptor.
+ Data pointer to GPU memory associated with the tensor descriptor dyDesc.
+ Previously initialized convolution descriptor.
+ Handle to a previously initialized filter descriptor.
+ Data pointer to GPU memory associated with the filter descriptor dwDesc. The content of this tensor will be overwritten with arbitrary values.
+ The maximum number of elements to be stored in perfResults.
+ Data pointer to GPU memory that is a necessary workspace for some algorithms. The size of this workspace will determine the availability of algorithms. A nil pointer is considered a workSpace of 0 bytes.
This function copies the scaled data from one tensor to another tensor with a different
@@ -1261,6 +1414,44 @@
Pointer to data of the tensor described by the dxDesc descriptor.
Data pointer to GPU memory used by this function. It is expected that the contents of reserveSpace do not change between cudnnDropoutForward and cudnnDropoutBackward calls.
@@ -1554,6 +1745,21 @@
cudnnSetDropoutDescriptor.
Pointer to GPU memory that holds random number generator states initialized by a prior call to cudnnSetDropoutDescriptor.
+ Helper function to calculate folding descriptors for dgrad
+ Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances
A CudaDNNException is thrown, if any wrapped call to the cudnn-library does not return .
@@ -2348,7 +2554,7 @@
A user-allocated array to store performance metrics sorted ascending by compute time.
This function attempts all available cuDNN algorithms for cudnnConvolutionForward, using
user-allocated GPU memory, and outputs performance metrics to a user-allocated array of
@@ -2614,7 +2820,7 @@
The number of output elements stored in perfResults.
A user-allocated array to store performance metrics sorted ascending by compute time.
This function attempts all cuDNN algorithms for cudnnConvolutionBackwardFilter,
using user-allocated GPU memory, and outputs performance metrics to a
@@ -3852,7 +4058,7 @@
This function computes the gradient of a sampling operation.
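Since the Find* entry points above return the perf array sorted ascending by compute time, callers typically take the first algorithm whose workspace fits their memory budget. A sketch of that selection; the cudnnConvolutionFwdAlgoPerf field names mirror the cuDNN C struct and are assumptions about this wrapper:

```csharp
using System;
using ManagedCuda.CudaDNN;

static cudnnConvolutionFwdAlgo PickFastestFitting(
    cudnnConvolutionFwdAlgoPerf[] perfResults, ulong workspaceBudget)
{
    // perfResults is already sorted: fastest algorithm first.
    foreach (var p in perfResults)
    {
        if (p.Status == cudnnStatus.Success && (ulong)p.Memory <= workspaceBudget)
            return p.Algo;
    }
    throw new InvalidOperationException("No convolution algorithm fits the workspace budget.");
}
```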
@@ -3871,7 +4077,7 @@ Pointer to scaling factor (in host memory) used to blend the gradient outputs dgrid with prior value in the destination pointer as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. Data pointer to GPU memory contains the output differential data. - + @@ -4550,6 +4756,22 @@ + + Create a destination descriptor for cudnnTransformTensor + + + Create an empty tensor transform descriptor + + + Initialize a previously created tensor transform descriptor. + + + Retrieves the values stored in a previously initialized tensor transform + descriptor. + + + Destroys a previously created tensor transform descriptor. + Constants for LRN, #define in cudnn.h @@ -4585,6 +4807,21 @@ MinEpsilon = 1e-5 + + + Constant values for SEQDATA + + + + + dimension count + + + + + Number of attention weight/bias tensors + + @@ -4918,6 +5155,41 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CUDNN return codes @@ -5990,89 +6262,544 @@ - + - An opaque structure holding the - description of a generic n-D dataset. + CUDNN Reorder - + + Fold/unfold transforms - + - For dispose + do batch normalization only - + - Dispose + do batchNorm, then activation - + - For IDisposable + do batchNorm, then elemWiseAdd, then activation - - + - Returns the inner handle. + rnn cell formulas do not use biases - + - This function initializes a previously created dropout descriptor object. If states argument is equal to - NULL, random number generator states won't be initialized, and only dropout value will be set. No other - function should be writing to the memory + rnn cell formulas use one input bias in input GEMM - The probability with which the value from input would be propagated through the dropout layer. - Pointer to user-allocated GPU memory that will hold random number generator states. - Specifies size in bytes of the provided memory for the states. - Seed used to initialize random number generator states. - + - An opaque structure holding the description - of a filter dataset. + default, rnn cell formulas use two bias vectors - + + rnn cell formulas use one recurrent bias in recurrent GEMM - + - For dispose + disables LSTM cell clipping - + - Dispose + enables LSTM cell clipping - + - For IDisposable + padded, outer stride from one time-step to the next - - + - Returns the inner handle. + sequence length sorted and packed as in basic RNN api - + - This function initializes a previously created filter descriptor object into a 4D filter. - Filters layout must be contiguous in memory. + padded, outer stride from one batch to the next - Data type. - Enumerant holding the layout format. - Number of output feature maps. - Number of input feature maps. - Height of each filter. 
+ + + + Sequence data descriptor + + + + + index in time + + + + + index in batch + + + + + index in beam + + + + + index in vector + + + + + Multi-head attention modes set in attention descriptor + + + + + multiple Q-s map to a single (K,V) set when beam size > 1 + + + + + multiple Q-s map to multiple (K,V) sets when beam size > 1 + + + + + no biases in attention input and output projections + + + + + use biases in attention input and output projections + + + + + input projection weights for 'queries' + + + + + input projection weights for 'keys' + + + + + input projection weights for 'values' + + + + + output projection weights + + + + + input projection bias tensor for 'queries' + + + + + input projection bias for 'keys' + + + + + input projection bias for 'values' + + + + + output projection biases + + + + + add partial gradients to wgrad output buffers + + + + + write partial gradients to wgrad output buffers + + + + + Input normalization mode for loss function + + + + + Input normalization mode for loss function + each op in [ ] can be disabled by passing NULL ptr + [per channel scale], [per channel bias], [activation], convolution, [generate BN stats] + + + + + [per channel scale], [per channel bias], [activation], convolutionBackwardWeights + + + + + utility for BN training in BN-conv fusion + computes the equivalent scale and bias from ySum ySqSum and learned scale, bias + optionally update running stats and generate saved stats + + + + + utility for BN inference in BN-conv fusion + computes the equivalent scale and bias from learned running stats and learned scale, bias + + + + + reserved for future use: convolution, [per channel scale], [per channel bias], [residual add], [activation] + + + + + reserved for future use: [per channel scale], [per channel bias], [residual add], activation, bitmask + + + + + reserved for future use + + + + + set XDESC: pass previously initialized cudnnTensorDescriptor_t + get XDESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get XDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_MODE: pass cudnnBatchNormMode_t* + + + + + set BN_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t + get BN_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get BN_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set ACTIVATION_DESC: pass previously initialized cudnnActivationDescriptor_t + get ACTIVATION_DESC: pass previously created cudnnActivationDescriptor_t + + + + + set CONV_DESC: pass previously initialized cudnnConvolutionDescriptor_t + get CONV_DESC: pass previously created cudnnConvolutionDescriptor_t + + + + + set WDESC: pass previously initialized cudnnFilterDescriptor_t + get WDESC: pass previously created cudnnFilterDescriptor_t + + + + + set/get WDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set DWDESC: pass previously initialized cudnnFilterDescriptor_t + get DWDESC: pass previously created cudnnFilterDescriptor_t + + + + + set/get DWDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set YDESC: pass previously initialized cudnnTensorDescriptor_t + get YDESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get YDATA_Placeholder: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set DYDESC: pass previously initialized cudnnTensorDescriptor_t + get DYDESC: pass previously created cudnnTensorDescriptor_t + + + + 
+ set/get DYDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set YSTATS_DESC: pass previously initialized cudnnTensorDescriptor_t + get YSTATS_DESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get YSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get YSQSUM_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set BN_SCALEBIAS_MEANVAR_DESC: pass previously initialized cudnnTensorDescriptor_t + get BN_SCALEBIAS_MEANVAR_DESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get BN_SCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_BIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_SAVED_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_SAVED_INVSTD_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_RUNNING_MEAN_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_RUNNING_VAR_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set ZDESC: pass previously initialized cudnnTensorDescriptor_t + get ZDESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get ZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set BN_Z_EQSCALEBIAS_DESC: pass previously initialized cudnnTensorDescriptor_t + get BN_Z_EQSCALEBIAS_DESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get BN_Z_EQSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_Z_EQBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set ACTIVATION_BITMASK_DESC: pass previously initialized cudnnTensorDescriptor_t + get ACTIVATION_BITMASK_DESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get ACTIVATION_BITMASK_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set DXDESC: pass previously initialized cudnnTensorDescriptor_t + get DXDESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get DXDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set DZDESC: pass previously initialized cudnnTensorDescriptor_t + get DZDESC: pass previously created cudnnTensorDescriptor_t + + + + + set/get DZDATA_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_DSCALE_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set/get BN_DBIAS_PLACEHOLDER: pass cudnnFusedOpsPointerPlaceHolder_t* + + + + + set: pass void* pointing to dev memory + get: pass void** pointing to host memory + + + + + set/get: pass size_t* pointing to host memory + + + + + set/get: pass int64_t* pointing to host memory + + + + + set/get: pass double* pointing to host memory + + + + + set/get: pass double* pointing to host memory + + + + + An opaque structure holding the + description of a generic n-D dataset. + + + + + + + + + + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Returns the inner handle. + + + + + This function initializes a previously created dropout descriptor object. If states argument is equal to + NULL, random number generator states won't be initialized, and only dropout value will be set. No other + function should be writing to the memory + + The probability with which the value from input would be propagated through the dropout layer. + Pointer to user-allocated GPU memory that will hold random number generator states. + Specifies size in bytes of the provided memory for the states. + Seed used to initialize random number generator states. 
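Putting the dropout-descriptor pieces above together, a sketch of the initialization sequence. The wrapper names (CudaDNNContext, DropoutDescriptor, GetDropoutStateSize, SetDropoutDescriptor) are assumptions mirroring the cuDNN C API; the argument order (probability, states pointer, state size in bytes, seed) comes from the parameter docs above.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.CudaDNN;

var cudnn = new CudaDNNContext();
var dropout = new DropoutDescriptor(cudnn);       // assumed wrapper type

// RNG states live in user-allocated GPU memory; the size is queried from the handle.
SizeT stateSize = cudnn.GetDropoutStateSize();    // assumed helper (cudnnDropoutGetStatesSize)
var states = new CudaDeviceVariable<byte>(stateSize);

// 50% dropout; no other function may write to the states buffer afterwards.
dropout.SetDropoutDescriptor(0.5f, states.DevicePointer, stateSize, 1234UL);
```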
+ An opaque structure holding the description
+ of a filter dataset.
+ For dispose
+ Dispose
+ For IDisposable
+ Returns the inner handle.
+ This function initializes a previously created filter descriptor object into a 4D filter.
+ Filters layout must be contiguous in memory.
+ Data type.
+ Enumerant holding the layout format.
+ Number of output feature maps.
+ Number of input feature maps.
+ Height of each filter.
Width of each filter.
@@ -6108,6 +6835,93 @@
Array of dimension of at least nbDimsRequested that will be filled with the filter parameters from the provided filter descriptor.
+ For dispose
+ Dispose
+ For IDisposable
+ Returns the inner handle.
@@ -6667,50 +7481,110 @@
one instance, and cudnnSetReduceTensorDescriptor() must be used to initialize this instance.
- For dispose
- Dispose
- For IDisposable
- Returns the inner handle.
+ For dispose
+ Dispose
+ For IDisposable
+ Returns the inner handle.
@@ -6775,7 +7649,7 @@
An array of tensor descriptors describing the input to each recurrent iteration.
Minimum amount of GPU memory needed as reserve space to be able to train an RNN with the specified descriptor and input tensors.
This function is used to query the amount of parameter space required to execute the RNN described by rnnDesc with inputs dimensions defined by xDesc.
@@ -7373,11 +8247,161 @@
+ The math type specified in a given RNN descriptor.
+ The cudnnSetRNNProjectionLayers() function should be called after cudnnSetRNNDescriptor() to enable the "recurrent" and/or "output" projection in a recurrent neural network.
+ The size of the LSTM cell output after the “recurrent” projection. This value should not be larger than hiddenSize programmed via cudnnSetRNNDescriptor().
+ This parameter should be zero.
+ This function retrieves the current RNN “projection” parameters. By default the projection feature is disabled, so invoking this function immediately after cudnnSetRNNDescriptor() will yield recProjSize equal to hiddenSize and outProjSize set to zero. The cudnnSetRNNProjectionLayers() method enables the RNN projection.
+ This function attempts all available cuDNN algorithms for cudnnRNNForwardInference, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnAlgorithmPerformance_t. These metrics are written in sorted fashion where the first element has the lowest compute time.
+ This function attempts all available cuDNN algorithms for cudnnRNNForwardTraining, using user-allocated GPU memory, and outputs performance metrics to a user-allocated array of cudnnAlgorithmPerformance_t. These metrics are written in sorted fashion where the first element has the lowest compute time.
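The projection docs above reduce to a two-line setup; SetRNNProjectionLayers is an assumed wrapper name over cudnnSetRNNProjectionLayers, and rnnDesc stands for an already configured RNN descriptor.

```csharp
int hiddenSize = 512;
int recProjSize = 256;  // LSTM cell output size after the "recurrent" projection; must not exceed hiddenSize
int outProjSize = 0;    // per the docs above, this parameter should be zero

// Call only after the RNN descriptor itself has been configured via SetRNNDescriptor.
rnnDesc.SetRNNProjectionLayers(recProjSize, outProjSize);
```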
+ For dispose
+ Dispose
+ For IDisposable
+ Returns the inner handle.
+ An opaque structure holding the
@@ -7413,13 +8437,86 @@
- This function destroys a previously created spatial transformer descriptor object.
+ Enumerant to specify the sampler type.
Data type.
Dimension of the transformed tensor.
Array of dimension nbDims containing the size of the transformed tensor for every dimension.
+ This function generates a grid of coordinates in the input tensor corresponding to each pixel from the output tensor.
+ Affine transformation matrix. It should be of size n*2*3 for a 2d transformation, where n is the number of images specified in stDesc.
+ A grid of coordinates. It is of size n*h*w*2 for a 2d transformation, where n, h and w are specified in stDesc. In the 4th dimension, the first coordinate is x, and the second coordinate is y.
+ This function computes the gradient of a grid generation operation.
+ Data pointer to GPU memory that contains the input differential data.
+ Data pointer to GPU memory that contains the output differential data.
+ This function performs a sampler operation and generates the output tensor using the grid given by the grid generator.
+ Pointer to scaling factor (in host memory) used to blend the source value with prior value in the destination tensor as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue.
+ Handle to the previously initialized input tensor descriptor.
+ Data pointer to GPU memory associated with the tensor descriptor xDesc.
+ A grid of coordinates generated by cudnnSpatialTfGridGeneratorForward.
+ Pointer to scaling factor (in host memory) used to blend the source value with prior value in the destination tensor as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue.
+ Handle to the previously initialized output tensor descriptor.
+ Data pointer to GPU memory associated with the output tensor descriptor yDesc.
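The sampler above chains with the grid generator: first the affine parameters are turned into a sampling grid, then the input is sampled through that grid. A sketch of the forward path; the method and descriptor type names are assumptions mirroring the cudnnSpatialTf* C API:

```csharp
using ManagedCuda;
using ManagedCuda.CudaDNN;

static void SpatialTransformerForward(
    CudaDNNContext cudnn, SpatialTransformerDescriptor stDesc,
    CudaDeviceVariable<float> theta,  // n*2*3 affine matrices
    CudaDeviceVariable<float> grid,   // n*h*w*2 coordinates, written here
    TensorDescriptor xDesc, CudaDeviceVariable<float> x,
    TensorDescriptor yDesc, CudaDeviceVariable<float> y)
{
    // Generate one (x, y) sampling coordinate per output pixel.
    cudnn.SpatialTfGridGeneratorForward(stDesc, theta, grid);

    // dstValue = alpha * sampledValue + beta * priorDstValue
    float alpha = 1f, beta = 0f;
    cudnn.SpatialTfSamplerForward(stDesc, alpha, xDesc, x, grid, beta, yDesc, y);
}
```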
+ + + + This function computes the gradient of a sampling operation. + + Pointer to scaling factor (in host memory) used to blend the source value with prior value in the destination tensor as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Handle to the previously initialized input tensor descriptor. + Data pointer to GPU memory associated with the tensor descriptor xDesc. + Pointer to scaling factor (in host memory) used to blend the source value with prior value in the destination tensor as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Handle to the previously initialized output differential tensor descriptor. + Data pointer to GPU memory associated with the output tensor descriptor dxDesc. + Pointer to scaling factor (in host memory) used to blend the gradient outputs dgrid with prior value in the destination pointer as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Handle to the previously initialized input differential tensor descriptor. + Data pointer to GPU memory associated with the tensor descriptor dyDesc. + A grid of coordinates generated by cudnnSpatialTfGridGeneratorForward. + Pointer to scaling factor (in host memory) used to blend the gradient outputs dgrid with prior value in the destination pointer as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Data pointer to GPU memory contains the output differential data. + + + + This function computes the gradient of a sampling operation. + + Pointer to scaling factor (in host memory) used to blend the source value with prior value in the destination tensor as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Handle to the previously initialized input tensor descriptor. + Data pointer to GPU memory associated with the tensor descriptor xDesc. + Pointer to scaling factor (in host memory) used to blend the source value with prior value in the destination tensor as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Handle to the previously initialized output differential tensor descriptor. + Data pointer to GPU memory associated with the output tensor descriptor dxDesc. + Pointer to scaling factor (in host memory) used to blend the gradient outputs dgrid with prior value in the destination pointer as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Handle to the previously initialized input differential tensor descriptor. + Data pointer to GPU memory associated with the tensor descriptor dyDesc. + A grid of coordinates generated by cudnnSpatialTfGridGeneratorForward. + Pointer to scaling factor (in host memory) used to blend the gradient outputs dgrid with prior value in the destination pointer as follows: dstValue = alpha[0]*srcValue + beta[0]*priorDstValue. + Data pointer to GPU memory contains the output differential data. + An opaque structure holding the @@ -7537,5 +8634,46 @@ Array of dimension of at least nbDimsRequested that will be filled with the strides from the provided tensor descriptor. + + + An opaque structure holding the + description of a generic n-D dataset. + + + + + + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Returns the inner handle. + + + + + Initialize a previously created tensor transform descriptor. + + + + + Retrieves the values stored in a previously initialized tensor transform descriptor. 
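The Find*Algorithm entries earlier in this CudaDNN.xml diff return their cudnnAlgorithmPerformance_t results sorted by compute time, so a caller normally takes the first entry that also fits its workspace budget. A hedged sketch of that selection; CudnnAlgoPerf is a stand-in struct, not the actual wrapper type, while the fastest-first ordering is quoted from the entries above:

using System.Linq;

public struct CudnnAlgoPerf
{
    public int Algo;             // algorithm id
    public float TimeMs;         // measured compute time
    public ulong WorkspaceBytes; // workspace required by this algorithm
}

public static class AlgoSelection
{
    // Entries arrive fastest-first, so the first one within budget wins.
    public static int PickAlgo(CudnnAlgoPerf[] sortedPerf, ulong workspaceBudget)
    {
        return sortedPerf.First(p => p.WorkspaceBytes <= workspaceBudget).Algo;
    }
}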
+ + diff --git a/src/external/ManagedCuda/CudaDNN.dll b/src/external/ManagedCuda/CudaDNN.dll index 8d4a59a4..677337a2 100644 Binary files a/src/external/ManagedCuda/CudaDNN.dll and b/src/external/ManagedCuda/CudaDNN.dll differ diff --git a/src/external/ManagedCuda/ManagedCuda.dll b/src/external/ManagedCuda/ManagedCuda.dll index f3bb24a8..063da763 100644 Binary files a/src/external/ManagedCuda/ManagedCuda.dll and b/src/external/ManagedCuda/ManagedCuda.dll differ diff --git a/src/external/ManagedCuda/ManagedCuda.xml b/src/external/ManagedCuda/ManagedCuda.xml index d215ac17..df8eef79 100644 --- a/src/external/ManagedCuda/ManagedCuda.xml +++ b/src/external/ManagedCuda/ManagedCuda.xml @@ -4,19892 +4,16452 @@ ManagedCuda - + - CUDA array + Flags to register a graphics resource - + - + Specifies no hints about how this resource will be used. + It is therefore assumed that this resource will be read + from and written to by CUDA. This is the default value. - + - CUDA linker + Specifies that CUDA will not write to this resource. - + - + Specifies that CUDA will not read from this resource and + will write over the entire contents of the resource, so + none of the data previously stored in the resource will + be preserved. - + - CUDA mipmapped array + Specifies that CUDA will bind this resource to a surface reference. - + - + - Cuda context + Flags for mapping and unmapping graphics interop resources - + - + Specifies no hints about how this resource will be used. + It is therefore assumed that this resource will be read from and written to by CUDA. This is the default value. - + - Cuda device + Specifies that CUDA will not write to this resource. - + - + Specifies that CUDA will not read from + this resource and will write over the entire contents of the resource, so none of the data previously stored in the + resource will be preserved. - + - Device that represents the CPU + CUTexRefSetFlags - + - Device that represents an invalid device + - + - Pointer to CUDA device memory + Read the texture as integers rather than promoting the values to floats in the range [0,1]. + Flag for - + - + Use normalized texture coordinates in the range [0,1) instead of [0,dim). + Flag for - + - + Perform sRGB -> linear conversion during texture read. - - - + - + Disable any trilinear filtering optimizations. - - - + - + Enable seamless cube map filtering. - - - - + - + CUDA driver API initialization flags - - - - + - + Currently no initialization flags are defined. - - - - + - + CUDA driver API Context Enable Peer Access flags - - - - + - Returns true if both objects are of type CUdeviceptr and if both Pointer member is equal. + Currently no flags are defined. - - - + - Overrides object.GetHashCode() + CUDA stream flags - - + - override ToString() + For compatibilty with pre Cuda 5.0, equal to Default - - + - + Default stream flag - - + - The on which a pointer was allocated or registered + Stream does not synchronize with stream 0 (the NULL stream) - + - The describing the physical location of a pointer + CudaCooperativeLaunchMultiDeviceFlags - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + No flags - + - The address at which a pointer's memory may be accessed on the host + If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only + waits for prior work in the stream corresponding to that GPU to complete before the + kernel begins execution. 
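Of the flag enums documented above, the stream flags are the ones application code touches most often: the non-blocking flag opts a stream out of implicit synchronization with stream 0 (the NULL stream). A hedged ManagedCuda sketch; the CudaStream(CUStreamFlags) constructor and the enum member name are assumptions to verify against the actual wrapper:

using ManagedCuda;
using ManagedCuda.BasicTypes;

public class NonBlockingStreamDemo
{
    public static void Main()
    {
        using (var ctx = new CudaContext(0))
        // Work queued on this stream does not synchronize with stream 0.
        using (var stream = new CudaStream(CUStreamFlags.NonBlocking))
        {
            // Enqueue async copies and kernel launches on 'stream' here.
        }
    }
}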
- + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + If set, any subsequent work pushed in a stream that participated in a call to + ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + the GPU corresponding to that stream to complete before it begins execution. - + - Synchronize every synchronous memory operation initiated on this region + CUDAArray3DFlags - + - A process-wide unique ID for an allocated memory region + No flags - + - Indicates if the pointer points to managed memory + if set, the CUDA array contains an array of 2D slices and + the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the + number of slices, not the depth of a 3D array. - + - Cuda event + if set, the CUDA array contains an array of layers where each layer is either a 1D + or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + of layers, not the depth of a 3D array. - + - + this flag must be set in order to bind a surface reference + to the CUDA array - + - Cuda function / kernel + If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The + width of such a CUDA array must be equal to its height, and Depth must be six. + If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps + and Depth must be a multiple of six. - + - + This flag must be set in order to perform texture gather operations on a CUDA array. - + - Cuda module + This flag if set indicates that the CUDA array is a DEPTH_TEXTURE. - + - + This flag indicates that the CUDA array may be bound as a color target in an external graphics API - + - Cuda stream + This flag if set indicates that the CUDA array or CUDA mipmapped array + is a sparse CUDA array or CUDA mipmapped array respectively - + - + This flag if set indicates that the CUDA array or CUDA mipmapped array will allow deferred memory mapping - + - Returns the CUDA NULL stream (0) + This flag indicates that the CUDA array will be used for hardware accelerated video encode/decode operations. - + - Stream handle that can be passed as a CUstream to use an implicit stream - with legacy synchronization behavior. + CUMemHostAllocFlags. All of these flags are orthogonal to one another: a developer may allocate memory that is portable, mapped and/or + write-combined with no restrictions. - + - Stream handle that can be passed as a CUstream to use an implicit stream - with per-thread synchronization behavior. + No flags - + - CUDA texture reference + The memory returned by this call will be considered as pinned memory + by all CUDA contexts, not just the one that performed the allocation. - + - + Maps the allocation into the CUDA address space. The device pointer + to the memory may be obtained by calling . This feature is available only on + GPUs with compute capability greater than or equal to 1.1. - + - CUDA surface reference + Allocates the memory as write-combined (WC). WC memory + can be transferred across the PCI Express bus more quickly on some system configurations, but cannot be read + efficiently by most CPUs. WC memory is a good option for buffers that will be written by the CPU and read by + the GPU via mapped pinned memory or host->device transfers. + If set, host memory is allocated as write-combined - fast to write, + faster to DMA, slow to read except via SSE4 streaming load instruction + (MOVNTDQA). - + - + Context creation flags. 
+ The two LSBs of the flags parameter can be used to control how the OS thread, which owns the CUDA context at + the time of an API call, interacts with the OS scheduler when waiting for results from the GPU. - + - CUDA graphics interop resource (DirectX / OpenGL) + The default value if the flags parameter is zero, uses a heuristic based on the + number of active CUDA contexts in the process C and the number of logical processors in the system P. If C > + P, then CUDA will yield to other OS threads when waiting for the GPU, otherwise CUDA will not yield while + waiting for results and actively spin on the processor. - + - + Instruct CUDA to actively spin when waiting for results from the GPU. This can decrease + latency when waiting for the GPU, but may lower the performance of CPU threads if they are performing + work in parallel with the CUDA thread. - + - CUDA texture object + Instruct CUDA to yield its thread when waiting for results from the GPU. This can + increase latency when waiting for the GPU, but can increase the performance of CPU threads performing work + in parallel with the GPU. - + - + Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the GPU to finish work. - + - CUDA surface object + No description found... - + - + Instruct CUDA to support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. - + - CUDA definition of UUID + Instruct CUDA to not reduce local memory after resizing local memory + for a kernel. This can prevent thrashing by local memory allocations when launching many kernels with high + local memory usage at the cost of potentially increased memory usage. - + - + Trigger coredumps from exceptions in this context - + - 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms + Enable user pipe to trigger coredumps in this context - + - + Force synchronous blocking on cudaMemcpy/cudaMemset - + - Interprocess Handle for Events + No description found... - + - + CUMemHostRegisterFlags. All of these flags are orthogonal to one another: a developer may allocate memory that is portable or mapped + with no restrictions. - + - Interprocess Handle for Memory + No flags - + - + The memory returned by this call will be considered as pinned memory + by all CUDA contexts, not just the one that performed the allocation. - + - half precission floating point + Maps the allocation into the CUDA address space. The device pointer + to the memory may be obtained by calling . This feature is available only on + GPUs with compute capability greater than or equal to 1.1. - + - two half precission floating point (x,y) + If set, the passed memory pointer is treated as pointing to some + memory-mapped I/O space, e.g. belonging to a third-party PCIe device. + On Windows the flag is a no-op. + On Linux that memory is marked as non cache-coherent for the GPU and + is expected to be physically contiguous. + On all other platforms, it is not supported and CUDA_ERROR_INVALID_VALUE + is returned. - + - CUDA external memory + If set, the passed memory pointer is treated as pointing to memory that is + considered read-only by the device. On platforms without + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is + required in order to register memory mapped to the CPU as read-only. Support + for the use of this flag can be queried from the device attribute + CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. 
Using this flag with + a current context associated with a device that does not have this attribute + set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. - + - + Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers - + - CUDA external semaphore + No flags - + - + Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers - + - CUDA graph + Flag for cuStreamAddCallback() - + - + No flags - + - CUDA graph node + Event creation flags - + - + Default event creation flag. - + - Returns the type of the Node + Specifies that event should use blocking synchronization. A CPU thread + that uses to wait on an event created with this flag will block until the event has actually + been recorded. - + - Sets the parameters of host node nodeParams. + Event will not record timing data - - + - Sets the parameters of kernel node nodeParams. + Event is suitable for interprocess use. CUEventFlags.DisableTiming must be set - - + - Sets the parameters of memcpy node nodeParams. + Event record flags - - + - Sets the parameters of memset node nodeParams. + Default event record flag - - + - Gets the parameters of host node. + When using stream capture, create an event record node + instead of the default behavior. This flag is invalid + when used outside of capture. - - + - Gets the parameters of kernel node. + Event wait flags - - + - Gets the parameters of memcpy node. + Default event wait flag - - + - Gets the parameters of memset node. + When using stream capture, create an event wait node + instead of the default behavior. This flag is invalid + when used outside of capture. - - + - Only for ChildGraphNodes + Flags for ::cuStreamWaitValue32 - - + - Returns a node's dependencies. + Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit values). Note this is a cyclic comparison which ignores wraparound. (Default behavior.) - - + - Returns a node's dependent nodes + Wait until *addr == value. - + - CUDA executable graph + Wait until (*addr & value) != 0. - + - + Wait until ~(*addr | value) != 0. Support for this operation can be + queried with ::cuDeviceGetAttribute() and ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. + Generally, this requires compute capability 7.0 or greater. - + - Legacy device properties + Follow the wait operation with a flush of outstanding remote writes. This + means that, if a remote write operation is guaranteed to have reached the + device before the wait can be satisfied, that write is guaranteed to be + visible to downstream device work. The device is permitted to reorder + remote writes internally. For example, this flag would be required if + two remote writes arrive in a defined order, the wait is satisfied by the + second write, and downstream work needs to observe the first write. - + - Maximum number of threads per block + Flags for ::cuStreamWriteValue32 - + - Maximum size of each dimension of a block + Default behavior - + - Maximum size of each dimension of a grid + Permits the write to be reordered with writes which were issued + before it, as a performance optimization. Normally, ::cuStreamWriteValue32 will provide a memory fence before the + write, which has similar semantics to __threadfence_system() but is scoped to the stream rather than a CUDA thread. 
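The wait predicates above are easy to misread, in particular the cyclic GEQ comparison, so here they are spelled out as host-side C#; each method mirrors one documented formula exactly:

public static class StreamWaitPredicates
{
    // Wait until (int32_t)(*addr - value) >= 0: cyclic, wraparound-safe.
    public static bool Geq(uint current, uint value) => (int)(current - value) >= 0;

    // Wait until *addr == value.
    public static bool Eq(uint current, uint value) => current == value;

    // Wait until (*addr & value) != 0.
    public static bool And(uint current, uint value) => (current & value) != 0;

    // Wait until ~(*addr | value) != 0 (NOR; requires device support, see above).
    public static bool Nor(uint current, uint value) => ~(current | value) != 0;
}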
- + - Shared memory available per block in bytes + Indicates that the external memory object is a dedicated resource - + - Constant memory available on device in bytes + No flags - + - Warp size in threads. Also called SIMD width. + Indicates that the external memory object is a dedicated resource - + - Maximum pitch in bytes allowed by the memory copy functions that involve memory regions allocated through - . + parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS - + - 32-bit registers available per block + When the /p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS + contains this flag, it indicates that signaling an external semaphore object + should skip performing appropriate memory synchronization operations over all + the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + which otherwise are performed by default to ensure data coherency with other + importers of the same NvSciBuf memory objects. - + - Clock frequency in kilohertz + When the /p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS + contains this flag, it indicates that waiting on an external semaphore object + should skip performing appropriate memory synchronization operations over all + the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + which otherwise are performed by default to ensure data coherency with other + importers of the same NvSciBuf memory objects. - + - Alignment requirement for textures. texture base addresses that are aligned to textureAlign bytes do not - need an offset applied to texture fetches. + flags of ::cuDeviceGetNvSciSyncAttributes - + - 2D memory copy parameters + When /p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + it indicates that application needs signaler specific NvSciSyncAttr + to be filled by ::cuDeviceGetNvSciSyncAttributes. - + - Source X in bytes + When /p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + it indicates that application needs waiter specific NvSciSyncAttr + to be filled by ::cuDeviceGetNvSciSyncAttributes. - + - Source Y + Flags for specifying particular handle types - + - Source memory type (host, device, array) + Does not allow any export mechanism. - + - Source host pointer + Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) - + - Source device pointer + Allows a Win32 NT handle to be used for exporting. (HANDLE) - + - Source array reference + Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) - + - Source pitch (ignored when src is array) + Allows a fabric handle to be used for exporting. (CUmemFabricHandle) - + - Destination X in bytes + Specifies the memory protection flags for mapping. - + - Destination Y + Default, make the address range not accessible - + - Destination memory type (host, device, array) + Make the address range read accessible - + - Destination host pointer + Make the address range read-write accessible - + - Destination device pointer + Flag for requesting different optimal and required granularities for an allocation. - + - Destination array reference + Minimum required granularity for allocation - + - Destination pitch (ignored when dst is array) + Recommended granularity for allocation for best performance - + - Width of 2D memory copy in bytes + Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS - + + + + - Height of 2D memory copy + ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. 
- + - 3D memory copy parameters + The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. - + - Source X in bytes + The additional write options for ::cuGraphDebugDotPrint - + + + + - Source Y + Output all debug data as if every debug flag is enabled - + - Source Z + Use CUDA Runtime structures for output - + - Source LOD + Adds CUDA_KERNEL_NODE_PARAMS values to output - + - Source memory type (host, device, array) + Adds CUDA_MEMCPY3D values to output - + - Source host pointer + Adds CUDA_MEMSET_NODE_PARAMS values to output - + - Source device pointer + Adds CUDA_HOST_NODE_PARAMS values to output - + - Source array reference + Adds CUevent handle from record and wait nodes to output - + - Must be NULL + Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output - + - Source pitch (ignored when src is array) + Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output - + - Source height (ignored when src is array; may be 0 if Depth==1) + Adds CUkernelNodeAttrValue values to output - + - Destination X in bytes + Adds node handles and every kernel function handle to output - + - Destination Y + Adds memory alloc node parameters to output - + - Destination Z + Adds memory free node parameters to output - + - Destination LOD + Adds batch mem op node parameters to output - + - Destination memory type (host, device, array) + Adds edge numbering information - + - Destination host pointer + Adds conditional node parameters to output - + - Destination device pointer + Flags for user objects for graphs - - - Destination array reference - + + - + - Must be NULL + Indicates the destructor execution is not synchronized by any CUDA handle. - + - Destination pitch (ignored when dst is array) + Flags for retaining user object references for graphs - + + + + - Destination height (ignored when dst is array; may be 0 if Depth==1) + Transfer references from the caller rather than creating new references. - + - Width of 3D memory copy in bytes + Flags for instantiating a graph - + + + + - Height of 3D memory copy + Automatically free memory allocated in a graph before relaunching. - + - Depth of 3D memory copy + Automatically upload the graph after instantiaton. - + - 3D memory copy parameters + Instantiate the graph to be launchable from the device. - + - Source X in bytes + Run the graph using the per-node priority attributes rather than the priority of the stream it is launched into. - + - Source Y + Flags for querying different granularities for a multicast object - + - Source Z + Minimum required granularity - + - Source LOD + Recommended granularity for best performance - + - Source memory type (host, device, array) + Flags for CUgreenCtxCreate - + - Source host pointer + None - + - Source device pointer + Required. Creates a default stream to use inside the green context - + - Source array reference + - + - Source context (ignored with srcMemoryType is array) + None - + - Source pitch (ignored when src is array) + - + - Source height (ignored when src is array; may be 0 if Depth==1) + - + - Destination X in bytes + Flags for controlling coredump contents - + + + + + + + + + + + + + + + + + + + + + + + + + - Destination Y + Flag for requesting handle type for address range. - + - Destination Z + Indicates that DMA_BUF handle should be mapped via PCIe BAR1 - + - Destination LOD + CUDA stream callback + The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. + CUDA_SUCCESS or any persistent error on the stream. 
+ User parameter provided at registration. - + - Destination memory type (host, device, array) + Block size to per-block dynamic shared memory mapping for a certain + kernel. + e.g.: + If no dynamic shared memory is used: x => 0 + If 4 bytes shared memory per thread is used: x = 4 * x + block size + The dynamic shared memory needed by a block - + - Destination host pointer + CUDA host function + Argument value passed to the function - + - Destination device pointer + CUDA async notification callback + Information describing what actions to take as a result of this trim notification. + Pointer to user defined data provided at registration. + The callback handle associated with this specific callback. - + - Destination array reference + Texture reference addressing modes - + - Destination context (ignored with dstMemoryType is array) + Wrapping address mode - + - Destination pitch (ignored when dst is array) + Clamp to edge address mode - + - Destination height (ignored when dst is array; may be 0 if Depth==1) + Mirror address mode - + - Width of 3D memory copy in bytes + Border address mode - + - Height of 3D memory copy + Array formats - + - Depth of 3D memory copy + Unsigned 8-bit integers - + - Array descriptor + Unsigned 16-bit integers - + - Width of array + Unsigned 32-bit integers - + - Height of array + Signed 8-bit integers - + - Array format + Signed 16-bit integers - + - Channels per array element + Signed 32-bit integers - + - 3D array descriptor + 16-bit floating point - + - Width of 3D array + 32-bit floating point - + - Height of 3D array + 8-bit YUV planar format, with 4:2:0 sampling - + - Depth of 3D array + 1 channel unsigned 8-bit normalized integer - + - Array format + 2 channel unsigned 8-bit normalized integer - + - Channels per array element + 4 channel unsigned 8-bit normalized integer - + - Flags + 1 channel unsigned 16-bit normalized integer - + - Idea of a SizeT type from http://blogs.hoopoe-cloud.com/index.php/tag/cudanet/, entry from Tuesday, September 15th, 2009 + 2 channel unsigned 16-bit normalized integer - + - + 4 channel unsigned 16-bit normalized integer - - + - + 1 channel signed 8-bit normalized integer - - + - + 2 channel signed 8-bit normalized integer - - + - + 4 channel signed 8-bit normalized integer - - + - + 1 channel signed 16-bit normalized integer - - + - + 2 channel signed 16-bit normalized integer - - + - + 4 channel signed 16-bit normalized integer - - - + - + 4 channel unsigned normalized block-compressed (BC1 compression) format - - - + - + 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding - - - + - + 4 channel unsigned normalized block-compressed (BC2 compression) format - - - + - + 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding - - - + - + 4 channel unsigned normalized block-compressed (BC3 compression) format - - - + - + 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding - - - + - + 1 channel unsigned normalized block-compressed (BC4 compression) format - - - + - + 1 channel signed normalized block-compressed (BC4 compression) format - - - + - + 2 channel unsigned normalized block-compressed (BC5 compression) format - - - + - + 2 channel signed normalized block-compressed (BC5 compression) format - - - + - + 3 channel unsigned half-float block-compressed (BC6H compression) format - - - + - + 3 channel signed half-float block-compressed (BC6H compression) format - - - - + - + 4 channel unsigned normalized 
block-compressed (BC7 compression) format - - - - + - Define operator + on converted to ulong values to avoid fall back to int + 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding - - - - + - Define operator + on converted to ulong values to avoid fall back to int + 10-bit YUV planar format, with 4:2:0 sampling - - - - + - Define operator + on converted to ulong values to avoid fall back to int + 16-bit YUV planar format, with 4:2:0 sampling - - - - + - Define operator + on converted to ulong values to avoid fall back to int + 8-bit YUV planar format, with 4:2:2 sampling - - - - + - Define operator + on converted to ulong values to avoid fall back to int + 10-bit YUV planar format, with 4:2:2 sampling - - - - + - Define operator - on converted to ulong values to avoid fall back to int + 16-bit YUV planar format, with 4:2:2 sampling - - - - + - Define operator - on converted to ulong values to avoid fall back to int + 2 channel, 8-bit YUV packed planar format, with 4:2:2 sampling - - - - + - Define operator - on converted to ulong values to avoid fall back to int + 2 channel, 10-bit YUV packed planar format, with 4:2:2 sampling - - - - + - Define operator - on converted to ulong values to avoid fall back to int + 2 channel, 16-bit YUV packed planar format, with 4:2:2 sampling - - - - + - Define operator - on converted to ulong values to avoid fall back to int + 4 channel, 8-bit YUV packed planar format, with 4:4:4 sampling - - - - + - Define operator * on converted to ulong values to avoid fall back to int + 10-bit YUV packed planar format, with 4:4:4 sampling - - - - + - Define operator * on converted to ulong values to avoid fall back to int + 4 channel, 12-bit YUV packed planar format, with 4:4:4 sampling - - - - + - Define operator * on converted to ulong values to avoid fall back to int + 3 channel 8-bit YUV planar format, with 4:4:4 sampling - - - - + - Define operator * on converted to ulong values to avoid fall back to int + 3 channel 10-bit YUV planar format, with 4:4:4 sampling - - - - + - Define operator * on converted to ulong values to avoid fall back to int + 3 channel 8-bit YUV semi-planar format, with 4:4:4 sampling - - - - + - Define operator / on converted to ulong values to avoid fall back to int + 3 channel 16-bit YUV semi-planar format, with 4:4:4 sampling - - - - + - Define operator / on converted to ulong values to avoid fall back to int + 4 channel unorm R10G10B10A2 RGB format - - - - + - Define operator / on converted to ulong values to avoid fall back to int + Compute mode that device is currently in. - - - - + - Define operator / on converted to ulong values to avoid fall back to int + Default mode - Device is not restricted and can have multiple CUDA + contexts present at a single time. - - - - + - Define operator / on converted to ulong values to avoid fall back to int + Compute-prohibited mode - Device is prohibited from creating + new CUDA contexts. 
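Most scalar entries in the format list above have a fixed per-channel byte width, which is what array allocations are sized from; the block-compressed and YUV entries do not. A helper sketch, assuming ManagedCuda's CUArrayFormat member names match these descriptions:

using System;
using ManagedCuda.BasicTypes;

public static class ArrayFormatInfo
{
    // Per-channel byte width for the simple scalar formats listed above.
    public static int BytesPerChannel(CUArrayFormat format)
    {
        switch (format)
        {
            case CUArrayFormat.UnsignedInt8:
            case CUArrayFormat.SignedInt8:
                return 1;
            case CUArrayFormat.UnsignedInt16:
            case CUArrayFormat.SignedInt16:
            case CUArrayFormat.Half:
                return 2;
            case CUArrayFormat.UnsignedInt32:
            case CUArrayFormat.SignedInt32:
            case CUArrayFormat.Float:
                return 4;
            default:
                throw new ArgumentException("No fixed per-channel size.", nameof(format));
        }
    }
}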
- - - - + - Define operator > on converted to ulong values to avoid fall back to int + Compute-exclusive-process mode (Only one context used by a + single process can be present on this device at a time) - - - - + - Define operator > on converted to ulong values to avoid fall back to int + Memory advise values - - - - + - Define operator > on converted to ulong values to avoid fall back to int + Data will mostly be read and only occasionally be written to - - - - + - Define operator > on converted to ulong values to avoid fall back to int + Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY - - - - + - Define operator > on converted to ulong values to avoid fall back to int + Set the preferred location for the data as the specified device - - - - + - Define operator < on converted to ulong values to avoid fall back to int + Clear the preferred location for the data - - - - + - Define operator < on converted to ulong values to avoid fall back to int + Data will be accessed by the specified device, so prevent page faults as much as possible - - - - + - Define operator < on converted to ulong values to avoid fall back to int + Let the Unified Memory subsystem decide on the page faulting policy for the specified device - - - - + - Define operator < on converted to ulong values to avoid fall back to int + Context Attach flags - - - - + - Define operator < on converted to ulong values to avoid fall back to int + None - - - - + - + Device properties - - - + - returns this.value.ToString() + Maximum number of threads per block - - + - Returns this.value.GetHashCode() + Maximum block dimension X - - + - Inner struct for CudaResourceDesc + Maximum block dimension Y - + - Device pointer + Maximum block dimension Z - + - Array format + Maximum grid dimension X - + - Channels per array element + Maximum grid dimension Y - + - Size in bytes + Maximum grid dimension Z - + - Inner struct for CudaResourceDesc + Maximum amount of shared memory + available to a thread block in bytes; this amount is shared by all thread blocks simultaneously resident on a + multiprocessor - + - Device pointer + Deprecated, use MaxSharedMemoryPerBlock - + - Array format + Memory available on device for __constant__ variables in a CUDA C kernel in bytes - + - Channels per array element + Warp size in threads - + - Width of the array in elements + Maximum pitch in bytes allowed by the memory copy functions + that involve memory regions allocated through - + - Height of the array in elements + Deprecated, use MaxRegistersPerBlock - + - Pitch between two rows in bytes + Maximum number of 32-bit registers available + to a thread block; this number is shared by all thread blocks simultaneously resident on a multiprocessor - + - Mimics the union "CUDA_RESOURCE_DESC.res" in cuda.h + Typical clock frequency in kilohertz - + - CUDA array + Alignment requirement; texture base addresses + aligned to textureAlign bytes do not need an offset applied to texture fetches - + - CUDA mipmapped array + 1 if the device can concurrently copy memory between host + and device while executing a kernel, or 0 if not - + - Linear memory + Number of multiprocessors on device - + - Linear pitched 2D memory + Specifies whether there is a run time limit on kernels. + 1 if there is a run time limit for kernels executed on the device, or 0 if not - + - CUDA Resource descriptor + Device is integrated with host memory. 1 if the device is integrated with the memory subsystem, or 0 if not - + - + Device can map host memory into CUDA address space. 
1 if the device can map host memory into the + CUDA address space, or 0 if not - - + - + Compute mode (See for details) - - + - + Maximum 1D texture width - - + - + Maximum 2D texture width - - + - + Maximum 2D texture height - - + - + Maximum 3D texture width - - + - + Maximum 3D texture height - - + - + Maximum 3D texture depth - - + - + Maximum texture array width - - + - + Maximum texture array height - - + - + Maximum slices in a texture array - - + - + Alignment requirement for surfaces - - + - + Device can possibly execute multiple kernels concurrently. + 1 if the device supports executing multiple kernels + within the same context simultaneously, or 0 if not. It is not guaranteed that multiple kernels will be resident on + the device concurrently so this feature should not be relied upon for correctness. - - + - + Device has ECC support enabled. 1 if error correction is enabled on the device, 0 if error correction + is disabled or not supported by the device. - - + - + PCI bus ID of the device - - + - + PCI device ID of the device - - + - + Device is using TCC driver model - - + - + Peak memory clock frequency in kilohertz - - + - + Global memory bus width in bits - - + - + Size of L2 cache in bytes - - + - + Maximum resident threads per multiprocessor - - + - + Number of asynchronous engines - - + - + Device shares a unified address space with the host - - + - + Maximum 1D layered texture width - - + - + Maximum layers in a 1D layered texture - - + - + PCI domain ID of the device - - + - + Pitch alignment requirement for textures - - + - + Maximum cubemap texture width/height - - + - + Maximum cubemap layered texture width/height - - + - + Maximum layers in a cubemap layered texture - - + - + Maximum 1D surface width - - + - + Maximum 2D surface width - - + - + Maximum 2D surface height - - + - + Maximum 3D surface width - - + - + Maximum 3D surface height - - + - + Maximum 3D surface depth - - + - + Maximum 1D layered surface width - - + - + Maximum layers in a 1D layered surface - - + - + Maximum 2D layered surface width - - + - + Maximum 2D layered surface height - - + - + Maximum layers in a 2D layered surface - - + - + Maximum cubemap surface width - - + - + Maximum cubemap layered surface width - - + - + Maximum layers in a cubemap layered surface - - + - + Maximum 1D linear texture width - - + - + Maximum 2D linear texture width - - + - + Maximum 2D linear texture height - - + - + Maximum 2D linear texture pitch in bytes - - + - Resource type + Maximum mipmapped 2D texture width - + - Mimics the union in C++ + Maximum mipmapped 2D texture height - + - Flags (must be zero) + Major compute capability version number - + - Texture descriptor + Minor compute capability version number - + - Creates a new CudaTextureDescriptor + Maximum mipmapped 1D texture width - Address modes for all dimensions - Filter mode - Flags - + - Creates a new CudaTextureDescriptor + Device supports stream priorities - Address modes for all dimensions - Filter mode - Flags - borderColor (array of size 4) - + - Creates a new CudaTextureDescriptor + Device supports caching globals in L1 - Address modes for dimension 0 - Address modes for dimension 1 - Address modes for dimension 2 - Filter mode - Flags - + - Creates a new CudaTextureDescriptor + Device supports caching locals in L1 - Address modes for dimension 0 - Address modes for dimension 1 - Address modes for dimension 2 - Filter mode - Flags - borderColor (array of size 4) - + - Creates a new CudaTextureDescriptor + Maximum shared memory available 
per multiprocessor in bytes - Address modes for all dimensions - Filter mode - Flags - Maximum anisotropy ratio. Specifies the maximum anistropy ratio to be used when doing anisotropic - filtering. This value will be clamped to the range [1,16]. - Mipmap filter mode. Specifies the filter mode when the calculated mipmap level lies between - two defined mipmap levels. - Mipmap level bias. Specifies the offset to be applied to the calculated mipmap level. - Mipmap minimum level clamp. Specifies the lower end of the mipmap level range to clamp access to. - Mipmap maximum level clamp. Specifies the upper end of the mipmap level range to clamp access to. - + - Creates a new CudaTextureDescriptor + Maximum number of 32-bit registers available per multiprocessor - Address modes for all dimensions - Filter mode - Flags - Maximum anisotropy ratio. Specifies the maximum anistropy ratio to be used when doing anisotropic - filtering. This value will be clamped to the range [1,16]. - Mipmap filter mode. Specifies the filter mode when the calculated mipmap level lies between - two defined mipmap levels. - Mipmap level bias. Specifies the offset to be applied to the calculated mipmap level. - Mipmap minimum level clamp. Specifies the lower end of the mipmap level range to clamp access to. - Mipmap maximum level clamp. Specifies the upper end of the mipmap level range to clamp access to. - borderColor (array of size 4) - + - Creates a new CudaTextureDescriptor + Device can allocate managed memory on this system - Address modes for dimension 0 - Address modes for dimension 1 - Address modes for dimension 2 - Filter mode - Flags - Maximum anisotropy ratio. Specifies the maximum anistropy ratio to be used when doing anisotropic - filtering. This value will be clamped to the range [1,16]. - Mipmap filter mode. Specifies the filter mode when the calculated mipmap level lies between - two defined mipmap levels. - Mipmap level bias. Specifies the offset to be applied to the calculated mipmap level. - Mipmap minimum level clamp. Specifies the lower end of the mipmap level range to clamp access to. - Mipmap maximum level clamp. Specifies the upper end of the mipmap level range to clamp access to. - + - Creates a new CudaTextureDescriptor + Device is on a multi-GPU board - Address modes for dimension 0 - Address modes for dimension 1 - Address modes for dimension 2 - Filter mode - Flags - Maximum anisotropy ratio. Specifies the maximum anistropy ratio to be used when doing anisotropic - filtering. This value will be clamped to the range [1,16]. - Mipmap filter mode. Specifies the filter mode when the calculated mipmap level lies between - two defined mipmap levels. - Mipmap level bias. Specifies the offset to be applied to the calculated mipmap level. - Mipmap minimum level clamp. Specifies the lower end of the mipmap level range to clamp access to. - Mipmap maximum level clamp. Specifies the upper end of the mipmap level range to clamp access to. - borderColor (array of size 4) - + - Address modes + Unique id for a group of devices on the same multi-GPU board - + - Filter mode + Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware) - + - Flags + Ratio of single precision performance (in floating-point operations per second) to double precision performance - + - Maximum anisotropy ratio. Specifies the maximum anistropy ratio to be used when doing anisotropic - filtering. This value will be clamped to the range [1,16]. 
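A quick way to inspect the attributes described in this block on a concrete GPU is ManagedCuda's device-info helper. A hedged sketch; the exact CudaDeviceProperties property names are assumptions derived from these descriptions and should be checked against the wrapper:

using System;
using ManagedCuda;

public class DeviceAttributeDump
{
    public static void Main()
    {
        var props = CudaContext.GetDeviceInfo(0); // query device 0
        Console.WriteLine("Multiprocessors:  " + props.MultiProcessorCount);
        Console.WriteLine("Clock rate (kHz): " + props.ClockRate);
        Console.WriteLine("Managed memory:   " + props.ManagedMemory);
    }
}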
+ Device supports coherently accessing pageable memory without calling cudaHostRegister on it - + - Mipmap filter mode. Specifies the filter mode when the calculated mipmap level lies between - two defined mipmap levels. + Device can coherently access managed memory concurrently with the CPU - + - Mipmap level bias. Specifies the offset to be applied to the calculated mipmap level. + Device supports compute preemption. - + - Mipmap minimum level clamp. Specifies the lower end of the mipmap level range to clamp access to. + Device can access host registered memory at the same virtual address as the CPU. - + - Mipmap maximum level clamp. Specifies the upper end of the mipmap level range to clamp access to. + ::cuStreamBatchMemOp and related APIs are supported. - + - Border Color + 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. - + - Resource view descriptor + ::CU_STREAM_WAIT_VALUE_NOR is supported. - + - Resource view format + Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel - + - Width of the resource view + Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice - + - Height of the resource view + Maximum optin shared memory per block - + - Depth of the resource view + Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. - + - First defined mipmap level + Device supports host memory registration via ::cudaHostRegister. - + - Last defined mipmap level + Device accesses pageable memory via the host's page tables. - + - First layer index + The host can directly access managed memory on the device without migration. - + - Last layer index + Deprecated, Use VirtualMemoryManagementSupported - + - GPU Direct v3 tokens + Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs - + - + Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - + - + Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate - + - Per-operation parameters for ::cuStreamBatchMemOp + Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested ::cuMemCreate - - - - - - - - - - - - - - - - - - - - - - + - For driver internal use. Initial value is unimportant. + Maximum number of blocks per multiprocessor - - - - - - - - - - - - - - - - + - For driver internal use. Initial value is unimportant. + Device supports compression of memory - - - - - - - - - - - - - - - - - - - - - - - - - + - Kernel launch parameters + Device's maximum L2 persisting lines capacity setting in bytes - + - Kernel to launch + The maximum value of CUaccessPolicyWindow::num_bytes. 
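In practice these attributes act as capability gates: query once at startup, then choose a code path. A self-contained sketch of the pattern; the stub and the attribute name stand in for the real cuDeviceGetAttribute query and are illustrative only:

using System;

public class CapabilityGate
{
    // Stand-in for a cuDeviceGetAttribute-style query (illustrative only).
    private static int QueryAttribute(string attributeName)
    {
        return 1; // pretend the capability is present
    }

    public static void Main()
    {
        bool vmm = QueryAttribute("VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED") != 0;
        Console.WriteLine(vmm
            ? "Use the cuMemCreate/cuMemMap address-range path."
            : "Fall back to plain device allocations.");
    }
}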
- + - Width of grid in blocks + Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate - + - Height of grid in blocks + Shared memory reserved by CUDA driver per block in bytes - + - Depth of grid in blocks + Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays - + - X dimension of each thread block + Device supports using the ::cuMemHostRegister flag CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU - + - Y dimension of each thread block + External timeline semaphore interop is supported on the device - + - Z dimension of each thread block + Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs - + - Dynamic shared-memory size per thread block in bytes + Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) - + - Stream identifier + The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum - + - Array of pointers to kernel parameters + GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. - + - GPU kernel node parameters + Handle types supported with mempool based IPC - + - Kernel to launch + Indicates device supports cluster launch - + - Width of grid in blocks + Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays - + - Height of grid in blocks + 64-bit operations are supported in ::cuStreamBatchMemOp and related MemOp APIs. - + - Depth of grid in blocks + ::CU_STREAM_WAIT_VALUE_NOR is supported by MemOp APIs. - + - X dimension of each thread block + Device supports buffer sharing with dma_buf mechanism. - + - Y dimension of each thread block + Device supports IPC Events. - + - Z dimension of each thread block + Number of memory domains the device supports. - + - Dynamic shared-memory size per thread block in bytes + Device supports accessing memory using Tensor Map. - + - Array of pointers to kernel parameters + Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate() - + - Extra options + Device supports unified function pointers. - + - Memset node parameters + NUMA configuration of a device: value is of type ::CUdeviceNumaConfig enum - + - Destination device pointer + NUMA node ID of the GPU memory - + - Pitch of destination device pointer. Unused if height is 1 + Device supports switch multicast and reduction operations. - + - Value to be set + Indicates if contexts created on this device will be shared via MPS - + - Size of each element in bytes. Must be 1, 2, or 4. + NUMA ID of the host node closest to the device. Returns -1 when system does not support NUMA. - + - Width in bytes, of the row + Device supports CIG with D3D12. - + - Number of rows + The returned valued shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum. - + - Initialieses the struct + The returned valued is the maximum length in bytes of a single decompress operation that is allowed. - - - - - + - Initialieses the struct + The combined 16-bit PCI device ID and 16-bit PCI vendor ID. - - - - - + - Host node parameters + The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID. 
- + - The function to call when the node executes + Device supports HOST_NUMA location IPC between nodes in a multi-node system. - + - Argument to pass to the function + Max elems... - + - Win32 handle referencing the semaphore object. Valid when - type is one of the following: - - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 - - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT - - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP - - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE - Exactly one of 'handle' and 'name' must be non-NULL. If - type is - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT - then 'name' must be NULL. + Texture reference filtering modes - + - Valid NT handle. Must be NULL if 'name' is non-NULL + Point filter mode - + - Name of a valid memory object. Must be NULL if 'handle' is non-NULL. + Linear filter mode - + - External memory handle descriptor + Function properties - + - Type of the handle + The number of threads beyond which a launch of the function would fail. + This number depends on both the function and the device on which the + function is currently loaded. - + - File descriptor referencing the memory object. Valid when type is CUDA_EXTERNAL_MEMORY_DEDICATED + The size in bytes of statically-allocated shared memory required by + this function. This does not include dynamically-allocated shared + memory requested by the user at runtime. - + - Win32 handle referencing the semaphore object. + The size in bytes of statically-allocated shared memory required by + this function. This does not include dynamically-allocated shared + memory requested by the user at runtime. - + - Size of the memory allocation + The size in bytes of thread local memory used by this function. - + - Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED + The number of registers used by each thread of this function. - + - External semaphore handle descriptor + The PTX virtual architecture version for which the function was + compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function + would return the value 13. Note that this may return the undefined value of 0 for cubins compiled prior to CUDA + 3.0. - + - Type of the handle + The binary version for which the function was compiled. This + value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function would return + the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary + architecture version. - + - File descriptor referencing the semaphore object. Valid when type is CUDA_EXTERNAL_MEMORY_DEDICATED + The attribute to indicate whether the function has been compiled with + user specified option "-Xptxas --dlcm=ca" set. - + - Win32 handle referencing the semaphore object. + The maximum size in bytes of dynamically-allocated shared memory that can be used by + this function. If the user-specified dynamic shared memory size is larger than this + value, the launch will fail. - + - Flags reserved for the future. Must be zero. + On devices where the L1 cache and shared memory use the same hardware resources, + this sets the shared memory carveout preference, in percent of the total resources. + This is only a hint, and the driver can choose a different ratio if required to execute the function. - + - External memory buffer descriptor + If this attribute is set, the kernel must launch with a valid cluster size specified. 
+ See ::cuFuncSetAttribute, ::cuKernelSetAttribute - + - Offset into the memory object where the buffer's base is + The required cluster width in blocks. The values must either all be 0 or all be positive. + The validity of the cluster dimensions is otherwise checked at launch time. + If the value is set during compile time, it cannot be set at runtime. + Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute - + - Size of the buffer + The required cluster height in blocks. The values must either all be 0 or + all be positive. The validity of the cluster dimensions is otherwise + checked at launch time. + If the value is set during compile time, it cannot be set at runtime. + Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute - + - Flags reserved for future use. Must be zero. + The required cluster depth in blocks. The values must either all be 0 or + all be positive. The validity of the cluster dimensions is otherwise + checked at launch time. + If the value is set during compile time, it cannot be set at runtime. + Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute - + - External memory mipmap descriptor + Whether the function can be launched with non-portable cluster size. 1 is + allowed, 0 is disallowed. A non-portable cluster size may only function + on the specific SKUs the program is tested on. The launch might fail if + the program is run on a different hardware platform. + CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking + whether the desired size can be launched on the current device. + Portable Cluster Size + A portable cluster size is guaranteed to be functional on all compute + capabilities higher than the target compute capability. The portable + cluster size for sm_90 is 8 blocks per cluster. This value may increase + for future compute capabilities. + The specific hardware unit may support higher cluster sizes that's not + guaranteed to be portable. + See ::cuFuncSetAttribute, ::cuKernelSetAttribute - + - Offset into the memory object where the base level of the mipmap chain is. + The block scheduling policy of a function. The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. + See ::cuFuncSetAttribute, ::cuKernelSetAttribute - + - Format, dimension and type of base level of the mipmap chain + No descritption found... - + - Total number of levels in the mipmap chain + Function cache configurations - + - External semaphore signal parameters + No preference for shared memory or L1 (default) - + - Parameters for fence objects + Function prefers larger shared memory and smaller L1 cache. - + - Value of fence to be signaled + Function prefers larger L1 cache and smaller shared memory. - + - Value of fence to be signaled + Function prefers equal sized L1 cache and shared memory. - + - Flags reserved for the future. Must be zero. + Cubin matching fallback strategies - + - External semaphore wait parameters + Prefer to compile ptx if exact binary match not found - + - Parameters for fence objects + Prefer to fall back to compatible binary code if exact binary match not found - + - Value of fence to be waited on + Online compiler options - + - Value of fence to be waited on + Max number of registers that a thread may use. + Option type: unsigned int + Applies to: compiler only - + - Flags reserved for the future. Must be zero. 
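The three required-cluster-dimension attributes above share one rule: width, height and depth must either all be 0 or all be positive. A validation helper that mirrors that wording:

public static class ClusterShape
{
    // All zero means no required cluster size; otherwise all must be positive.
    public static bool IsValid(int width, int height, int depth)
    {
        bool allZero = width == 0 && height == 0 && depth == 0;
        bool allPositive = width > 0 && height > 0 && depth > 0;
        return allZero || allPositive;
    }
}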
+ IN: Specifies minimum number of threads per block to target compilation + for + OUT: Returns the number of threads the compiler actually targeted. + This restricts the resource utilization of the compiler (e.g. max + registers) such that a block with the given number of threads should be + able to launch based on register limitations. Note, this option does not + currently take into account any other resource limitations, such as + shared memory utilization. + Option type: unsigned int + Applies to: compiler only - + - Texture reference addressing modes + Returns a float value in the option of the wall clock time, in + milliseconds, spent creating the cubin + Option type: float + Applies to: compiler and linker - + - Wrapping address mode + Pointer to a buffer in which to print any log messsages from PTXAS + that are informational in nature (the buffer size is specified via + option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) + Option type: char* + Applies to: compiler and linker - + - Clamp to edge address mode + IN: Log buffer size in bytes. Log messages will be capped at this size + (including null terminator) + OUT: Amount of log buffer filled with messages + Option type: unsigned int + Applies to: compiler and linker - + - Mirror address mode + Pointer to a buffer in which to print any log messages from PTXAS that + reflect errors (the buffer size is specified via option + ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES) + Option type: char* + Applies to: compiler and linker - + - Border address mode + IN: Log buffer size in bytes. Log messages will be capped at this size + (including null terminator) + OUT: Amount of log buffer filled with messages + Option type: unsigned int + Applies to: compiler and linker - + - Array formats + Level of optimizations to apply to generated code (0 - 4), with 4 + being the default and highest level of optimizations. + Option type: unsigned int + Applies to: compiler only - + - Unsigned 8-bit integers + No option value required. Determines the target based on the current + attached context (default) + Option type: No option value needed + Applies to: compiler and linker - + - Unsigned 16-bit integers + Target is chosen based on supplied ::CUjit_target_enum. This option cannot be + used with cuLink* APIs as the linker requires exact matches. + Option type: unsigned int for enumerated type ::CUjit_target_enum + Applies to: compiler and linker - + - Unsigned 32-bit integers + Specifies choice of fallback strategy if matching cubin is not found. + Choice is based on supplied ::CUjit_fallback_enum. + Option type: unsigned int for enumerated type ::CUjit_fallback_enum + Applies to: compiler only - + - Signed 8-bit integers + Specifies whether to create debug information in output (-g) (0: false, default) + Option type: int + Applies to: compiler and linker - + - Signed 16-bit integers + Generate verbose log messages (0: false, default) + Option type: int + Applies to: compiler and linker - + - Signed 32-bit integers + Generate line number information (-lineinfo) (0: false, default) + Option type: int + Applies to: compiler only - + - 16-bit floating point + Specifies whether to enable caching explicitly (-dlcm) + Choice is based on supplied ::CUjit_cacheMode_enum. + Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum + Applies to: compiler only - + - 32-bit floating point + The below jit options are used for internal purposes only, in this version of CUDA - + - Compute mode that device is currently in. + This jit option is used for internal purpose only. 
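The log-buffer options above always travel as pointer-plus-size pairs: the size goes in as the buffer capacity and comes back as the number of bytes the driver wrote. A host-side model of that contract, with a stub in place of the actual JIT step:

using System;
using System.Text;

public class JitLogBufferModel
{
    public static void Main()
    {
        var infoLog = new byte[8192];     // CU_JIT_INFO_LOG_BUFFER
        int capacity = infoLog.Length;    // ..._SIZE_BYTES on the way in
        int used = CompileStub(infoLog);  // ..._SIZE_BYTES on the way out
        Console.WriteLine(Encoding.ASCII.GetString(infoLog, 0, Math.Min(used, capacity)));
    }

    // Stand-in for the JIT step; the real driver fills the buffer itself.
    private static int CompileStub(byte[] log)
    {
        byte[] msg = Encoding.ASCII.GetBytes("ptxas info: compile succeeded");
        Array.Copy(msg, log, msg.Length);
        return msg.Length;
    }
}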
- removed: stale summaries for device compute modes, memory-advise values (set/unset read-mostly, set/clear preferred location, accessed-by device), context attach flags and the first block of device-property summaries (max threads per block and block/grid dimensions through concurrent kernels and ECC support)
+ CUjit_option (continued):
+   CU_JIT_GLOBAL_SYMBOL_NAMES: array of device symbol names that will be relocated to the corresponding host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES; must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries. When loading a device module, the driver relocates all encountered unresolved symbols to the host addresses. Only symbols corresponding to unresolved global variables may be registered, and it is illegal to register the same device symbol at multiple addresses. Option type: const char**. Applies to: dynamic linker only.
+   CU_JIT_GLOBAL_SYMBOL_ADDRESSES: array of host addresses used to relocate the corresponding device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES; must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries. Option type: void**. Applies to: dynamic linker only.
+   CU_JIT_GLOBAL_SYMBOL_COUNT: number of entries in the two arrays above. Option type: unsigned int. Applies to: dynamic linker only.
+   CU_JIT_LTO: enable link-time optimization (-dlto) for device code (0: false, default). Option type: int. Applies to: compiler and linker.
+   CU_JIT_FTZ: control single-precision denormals support (-ftz); 1 flushes denormal values to zero, 0 preserves them (0: default). Option type: int. Applies to: link-time optimization specified with CU_JIT_LTO.
+   CU_JIT_PREC_DIV: control single-precision floating-point division and reciprocals (-prec-div); 1 enables the IEEE round-to-nearest mode (default), 0 enables the fast approximation mode. Option type: int. Applies to: link-time optimization specified with CU_JIT_LTO.
+   CU_JIT_PREC_SQRT: control single-precision floating-point square root (-prec-sqrt); 1 enables the IEEE round-to-nearest mode (default), 0 enables the fast approximation mode. Option type: int. Applies to: link-time optimization specified with CU_JIT_LTO.
+   CU_JIT_FMA: enable/disable contraction of floating-point multiplies and adds/subtracts into multiply-add operations (-fma) (1: enable, default; 0: disable). Option type: int. Applies to: link-time optimization specified with CU_JIT_LTO.
+   CU_JIT_REFERENCED_KERNEL_NAMES / CU_JIT_REFERENCED_KERNEL_COUNT: array (and entry count) of kernel names that should be preserved at link time while others can be removed. Names mangled by the compiler must be given in mangled form. The wildcard "*" stands for zero or more characters and is also added implicitly, so "foo" matches "foobaz", "barfoo" and "barfoobaz" and preserves all of them; use a more specific name such as "barfoobaz" to avoid this. Option types: const char** / unsigned int. Apply to: dynamic linker only. Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0.
+   CU_JIT_REFERENCED_VARIABLE_NAMES / CU_JIT_REFERENCED_VARIABLE_COUNT: the same mechanism, including the implicit wildcard, for __device__ and/or __constant__ variable names. Option types: const char** / unsigned int. Apply to: link-time optimization specified with CU_JIT_LTO. Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0.
+   CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES: a hint allowing the JIT compiler/linker to remove __constant__ and __device__ variables unreferenced in device code (disabled by default). Host references to such variables via APIs like ::cuModuleGetGlobal() may then result in undefined behavior unless the variables are listed in ::CU_JIT_REFERENCED_VARIABLE_NAMES. Option type: int. Applies to: link-time optimization specified with CU_JIT_LTO. Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0.
+   CU_JIT_POSITION_INDEPENDENT_CODE: generate position independent code (0: false). Option type: int. Applies to: compiler only.
+   CU_JIT_MIN_CTA_PER_SM: hints to the JIT compiler the minimum number of CTAs from the kernel's grid to be mapped to an SM. Optimizations based on this option also need the maximum number of threads per block to be specified. Ignored when used together with ::CU_JIT_MAX_REGISTERS or ::CU_JIT_THREADS_PER_BLOCK. Option type: unsigned int. Applies to: compiler only.
+   CU_JIT_MAX_THREADS_PER_BLOCK: maximum number of threads in a thread block, computed as the product of the maximum extent specified for each dimension of the block. This limit is guaranteed not to be exceeded in any invocation of the kernel; exceeding it results in a runtime error or kernel launch failure. For kernels already carrying the PTX directive .maxntid this option is ignored by default; use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES to let it take precedence over the directive. Option type: int. Applies to: compiler only.
+   CU_JIT_OVERRIDE_DIRECTIVE_VALUES: lets the values specified via ::CU_JIT_MAX_REGISTERS, ::CU_JIT_THREADS_PER_BLOCK, ::CU_JIT_MAX_THREADS_PER_BLOCK and ::CU_JIT_MIN_CTA_PER_SM take precedence over any PTX directives (0: disable, default; 1: enable). Option type: int. Applies to: compiler only.
+ Online compilation targets (::CUjit_target_enum): compute device classes 3.0, 3.2, 3.5, 3.7, 5.0, 5.2, 5.3, 6.0, 6.1, 6.2, 7.0, 7.2, 7.5, 8.0, 8.6, 8.7, 8.9, 9.0, 10.0, 10.1 and 12.0, plus 9.0, 10.0, 10.1 and 12.0 variants with accelerated features.
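The "dynamic linker only" options above are passed to the cuLink* family rather than to module load. A minimal C# sketch of that path, linking PTX fragments into one cubin (assumptions: the nvcuda library name and the unversioned cuLinkCreate/cuLinkAddData exports, which newer headers alias to _v2 entry points; error checking elided):

using System;
using System.Runtime.InteropServices;
using System.Text;

static class DeviceLinker
{
    const int CU_JIT_INPUT_PTX = 1; // CUjitInputType from cuda.h (assumed)

    [DllImport("nvcuda")] static extern int cuLinkCreate(uint numOptions, int[] options, IntPtr[] values, out IntPtr state);
    [DllImport("nvcuda")] static extern int cuLinkAddData(IntPtr state, int type, byte[] data, UIntPtr size,
                                                          string name, uint numOptions, int[] options, IntPtr[] values);
    [DllImport("nvcuda")] static extern int cuLinkComplete(IntPtr state, out IntPtr image, out UIntPtr size);
    [DllImport("nvcuda")] static extern int cuLinkDestroy(IntPtr state);

    public static byte[] Link(params string[] ptxFragments)
    {
        cuLinkCreate(0, null, null, out var state);
        foreach (var ptx in ptxFragments)
        {
            var data = Encoding.ASCII.GetBytes(ptx + "\0");
            cuLinkAddData(state, CU_JIT_INPUT_PTX, data, (UIntPtr)data.Length, "fragment.ptx", 0, null, null);
        }
        cuLinkComplete(state, out var image, out var size); // image is owned by the link state
        var cubin = new byte[(int)(ulong)size];
        Marshal.Copy(image, cubin, 0, cubin.Length);
        cuLinkDestroy(state); // frees image, hence the copy above
        return cubin;
    }
}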
- removed: stale device-attribute summaries (PCI bus/device/domain IDs, TCC driver model, memory clock, bus width, L2 size, threads per multiprocessor, async engine count, unified addressing, layered/cubemap/linear/mipmapped texture and surface limits, compute capability, stream priorities, global/local L1 caching, shared memory and registers per multiprocessor, managed memory, multi-GPU board attributes)
+ Online compilation optimization levels: no optimization; optimization levels 1, 2 and 3; best (the default).
+ Caching modes for dlcm: compile with no -dlcm flag specified; compile with L1 cache disabled; compile with L1 cache enabled.
+ Device code formats:
+   compiled device-class-specific device code (applicable options: none);
+   PTX source code (applicable options: PTX compiler options);
+   bundle of multiple cubins and/or PTX of some device code (PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY);
+   host object with embedded device code (PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY);
+   archive of host objects with embedded device code (PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY);
+   high-level intermediate code for link-time optimization (NVVM compiler options, PTX compiler options).
+ Array indices for cube faces: positive X, negative X, positive Y, negative Y, positive Z, negative Z.
+ Limits:
+   GPU thread stack size; GPU printf FIFO size; GPU malloc heap size; GPU device runtime launch synchronize depth; GPU device runtime pending launch count;
+   maximum L2 fetch granularity: a value between 0 and 128 bytes (a hint);
+   L2 persisting lines cache size in bytes;
+   maximum shared memory available to CUDA kernels on a CIG context (can only be queried, cannot be set);
+   non-zero if this CUDA context is a CIG-enabled context (can only be queried, cannot be set);
+   when set to zero, CUDA will fail to launch a kernel on a CIG context, instead of using the fallback path, if the kernel uses more shared memory than available.
+ Memory types: host memory; device memory; array memory; unified device or host memory.
+ Resource types: array resource; mipmapped array resource; linear resource; pitch 2D resource.
+ Error codes returned by CUDA driver API calls (first part):
+   no errors; invalid value; out of memory; driver not initialized; driver deinitialized;
+   profiler not initialized for this run (can happen when running under external profiling tools such as the visual profiler);
+   three profiler returns deprecated as of CUDA 5.0: enabling/disabling profiling via ::cuProfilerStart or ::cuProfilerStop without initialization, starting when already enabled, or stopping when already disabled are no longer errors;
+   the loaded CUDA driver is a stub library (running against the stub rather than a real driver returns this error);
+   the requested CUDA device is unavailable at the current time, often due to ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED;
+   no CUDA-capable device available; invalid device; the Grid license is not applied; invalid kernel image.
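The limit values above are set per context through cuCtxSetLimit before the first launch that needs them. A hedged C# sketch (assumes nvcuda and the CUlimit value from cuda.h; a context must already be current on the calling thread):

using System;
using System.Runtime.InteropServices;

static class DeviceLimits
{
    const int CU_LIMIT_MALLOC_HEAP_SIZE = 2; // CUlimit from cuda.h (assumed)

    [DllImport("nvcuda")] static extern int cuCtxSetLimit(int limit, UIntPtr value);
    [DllImport("nvcuda")] static extern int cuCtxGetLimit(out UIntPtr value, int limit);

    // Grow the device-side malloc heap to 64 MiB; must happen before kernels that call malloc().
    public static ulong GrowMallocHeap()
    {
        cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, (UIntPtr)(64UL * 1024 * 1024));
        cuCtxGetLimit(out var actual, CU_LIMIT_MALLOC_HEAP_SIZE);
        return (ulong)actual; // the driver may round the requested size
    }
}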
- removed: stale summaries for cooperative-kernel and mem-op device attributes, texture filtering modes, function properties (max threads per block, static shared/const/local memory sizes, register count, PTX and binary versions, -Xptxas --dlcm=ca, max dynamic shared memory, shared memory carveout), function cache configurations, cubin fallback strategies, and the old typo-ridden copies of the CUjit_option summaries
+ Error codes (continued):
+   invalid context; context already current; map failed; unmap failed; array is mapped; already mapped; no binary for GPU; already acquired; not mapped; mapped resource not available for access as an array; mapped resource not available for access as a pointer; uncorrectable ECC error detected; CULimit not supported by device;
+   the context passed to the API call can only be bound to a single CPU thread at a time but is already bound to one;
+   peer access is not supported across the given devices; a PTX JIT compilation failed; an error with the OpenGL or DirectX context; an uncorrectable NVLink error was detected during execution; the PTX JIT compiler library was not found; the provided PTX was compiled with an unsupported toolchain; PTX JIT compilation was disabled; the ::CUexecAffinityType passed to the API call is not supported by the active device;
+   the code to be compiled by the PTX JIT contains an unsupported call to cudaDeviceSynchronize;
+   an exception occurred on the device that is now contained by the GPU's error containment capability; common causes are (a) certain types of invalid accesses of peer GPU memory over NVLink and (b) certain classes of hardware errors; this leaves the process in an inconsistent state, and it must be terminated and relaunched to continue using CUDA;
+   the device kernel source is invalid, including compilation/linker errors encountered in device code or user error; file not found; link to a shared object failed to resolve; shared object initialization failed; OS call failed; invalid handle;
+   a resource required by the API call is not in a valid state to perform the requested operation;
+   an attempt was made to introspect an object in a way that would discard semantically important information, either because the object uses functionality newer than the API version used to introspect it or because optional return arguments were omitted;
+   not found; CUDA not ready;
+   sticky launch failures (each leaves the process in an inconsistent state; it must be terminated and relaunched): a load or store instruction on an invalid memory address; launch exceeded resources; the kernel took too long to execute (only with timeouts enabled; see ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT); launch with incompatible texturing;
+   a call re-enabling peer access to a context that already has it enabled; disabling peer access that has not been enabled yet; the primary context for the specified device has already been initialized; the context current to the calling thread has been destroyed, or is a primary context that has not yet been initialized;
+   a device-side assert triggered during kernel execution: the context cannot be used anymore and must be destroyed, and all existing device allocations from it are invalid and must be reconstructed;
+   hardware resources required to enable peer access exhausted for one or more devices passed to ::cuCtxEnablePeerAccess(); the memory range passed to ::cuMemHostRegister() is already registered; the pointer passed to ::cuMemHostUnregister() does not correspond to any registered region;
+   further sticky device faults (terminate and relaunch): a stack error, due to stack corruption or exceeding the stack size limit; an illegal instruction; a misaligned load or store address; an instruction restricted to certain address spaces (global, shared, or local) supplied an address outside the allowed spaces; the device program counter wrapped its address space; a generic device exception while executing a kernel, commonly dereferencing an invalid device pointer or out-of-bounds shared memory access.
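Given how long this error list is, the practical pattern is to funnel every driver call through one checker that asks the driver itself for the name and description. A short sketch (cuGetErrorName/cuGetErrorString have existed since CUDA 6.0; the returned strings are driver-owned and must not be freed):

using System;
using System.Runtime.InteropServices;

static class CuCheck
{
    [DllImport("nvcuda")] static extern int cuGetErrorName(int error, out IntPtr name);
    [DllImport("nvcuda")] static extern int cuGetErrorString(int error, out IntPtr description);

    public static void Check(int result)
    {
        if (result == 0) return; // CUDA_SUCCESS
        cuGetErrorName(result, out var name);
        cuGetErrorString(result, out var text);
        throw new Exception(
            $"{Marshal.PtrToStringAnsi(name)} ({result}): {Marshal.PtrToStringAnsi(text)}");
    }
}

With this in place, the sticky errors above (illegal address, hardware stack error, and so on) surface with the driver's own wording instead of a bare integer.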
- removed: stale summaries for online compilation targets 2.0 through 7.5, optimization levels, dlcm caching modes, device code formats, cube-face indices and the first CULimit entries
+ Error codes (continued):
+   cooperative launch too large: the number of blocks per grid for a kernel launched via ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice exceeds ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or the WithFlags variant) times the device's ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT;
+   a kernel exited using tensor memory without completely deallocating it (sticky; the process must be terminated and relaunched);
+   the attempted operation is not permitted; the attempted operation is not supported on the current system or device;
+   the system is not yet ready to start CUDA work: verify the system configuration is valid and all required driver daemons are running;
+   a mismatch between the display driver and CUDA driver versions (see the compatibility documentation);
+   the system was upgraded for forward compatibility but the visible hardware does not support this configuration: consult the supported hardware matrix or restrict visibility via the CUDA_VISIBLE_DEVICES environment variable;
+   MPS failures: the client failed to connect to the MPS control daemon or server; the remote procedure call between MPS server and client failed; the MPS server is not ready to accept new client requests (e.g. while recovering from a fatal failure); hardware resources for MPS clients exhausted; hardware resources for device connections exhausted; the MPS client was terminated by the server (terminate and relaunch);
+   the module uses CUDA Dynamic Parallelism but the current configuration, like MPS, does not support it; a module contains an unsupported interaction between different versions of CUDA Dynamic Parallelism;
+   stream-capture failures: the operation is not permitted while the stream is capturing; the current capture sequence was invalidated by a previous error; the operation would merge two independent capture sequences; the capture was not initiated in this stream; the capture sequence contains a fork not joined to the primary stream; a dependency would cross the capture sequence boundary (only implicit in-stream ordering dependencies may cross it); a disallowed implicit dependency on a current capture from cudaStreamLegacy; the operation is not permitted on an event last recorded in a capturing stream; a capture not begun with ::CU_STREAM_CAPTURE_MODE_RELAXED was passed to ::cuStreamEndCapture in a different thread;
+   the timeout specified for the wait operation has lapsed; the graph update was not performed because it included changes violating constraints specific to instantiated graph update;
+   an async error occurred in an external device CUDA was waiting on before consuming shared data, and the device signaled the data invalid (sticky; terminate and relaunch);
+   a kernel launch error due to cluster misconfiguration; a function handle is not loaded when calling an API that requires a loaded function; one or more resources passed in are not valid resource types for the operation; one or more resources are insufficient or non-applicable for the operation; an error happened during the key rotation sequence; unknown error.
+ P2P Attributes: a relative value indicating the performance of the link between two devices; P2P access is enabled; atomic operations over the link are supported; (deprecated, use CudaArrayAccessAccessSupported instead); accessing CUDA arrays over the link is supported.
+ CUTexRefSetArrayFlags: override the texref format with a format inferred from the array.
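Several of the capture errors above (unmatched/unjoined captures, the wrong-thread rule around ::CU_STREAM_CAPTURE_MODE_RELAXED) only show up once stream capture is in play. A minimal capture-to-graph sketch (assumptions: nvcuda; cuStreamBeginCapture_v2 is the mode-taking export, and cuGraphInstantiateWithFlags exists from CUDA 11.4; error checking elided):

using System;
using System.Runtime.InteropServices;

static class GraphCapture
{
    const int CU_STREAM_CAPTURE_MODE_RELAXED = 2; // CUstreamCaptureMode (assumed)

    [DllImport("nvcuda", EntryPoint = "cuStreamBeginCapture_v2")]
    static extern int cuStreamBeginCapture(IntPtr stream, int mode);
    [DllImport("nvcuda")] static extern int cuStreamEndCapture(IntPtr stream, out IntPtr graph);
    [DllImport("nvcuda")] static extern int cuGraphInstantiateWithFlags(out IntPtr graphExec, IntPtr graph, ulong flags);

    // Record the async work that `enqueue` pushes onto `stream` and bake it into a replayable graph.
    public static IntPtr Capture(IntPtr stream, Action enqueue)
    {
        cuStreamBeginCapture(stream, CU_STREAM_CAPTURE_MODE_RELAXED);
        enqueue(); // launches/memcpys on `stream` are recorded, not executed
        cuStreamEndCapture(stream, out var graph); // RELAXED allows ending from another thread
        cuGraphInstantiateWithFlags(out var exec, graph, 0);
        return exec; // replay via cuGraphLaunch(exec, stream)
    }
}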
- removed: stale summaries for the remaining CULimit entries, memory types, resource types and the old error-code text (no errors through peer-access enable/disable, primary context, device-side assert, host registration and the sticky device faults)
+ CUParameterTexRef: for texture references loaded into the module, use the default texunit from the texture reference.
+ CUSurfRefSetFlags: currently no CUSurfRefSetFlags flags are defined.
+ Pointer information:
+   the context on which the pointer was allocated or registered; the memory type describing its physical location; the address at which its memory may be accessed on the device; the address at which its memory may be accessed on the host; a pair of tokens for use with the nv-p2p.h Linux kernel interface; synchronize every synchronous memory operation initiated on this region; a process-wide unique ID for the allocated memory region; whether the pointer points to managed memory; the ordinal of the device on which the pointer was allocated or registered; 1 if the pointer maps to an allocation suitable for ::cudaIpcGetMemHandle, 0 otherwise; the starting address and size of the pointer's address range; 1 if the pointer lies in a valid address range mapped to a backing allocation, 0 otherwise; a bitmask of allowed ::CUmemAllocationHandleType values for the allocation; 1 if the referenced memory can be used with the GPUDirect RDMA API; the access flags the current context's device has on the referenced memory; the mempool handle if the allocation came from a mempool, otherwise NULL; the size and start address of the actual underlying mapping the pointer belongs to; a process-wide unique id of the physical allocation the pointer belongs to; a boolean indicating whether the memory is capable of hardware-accelerated decompression.
+ CUDA devices corresponding to a D3D11, D3D10 or D3D9 device: all GPUs used by the D3D11 device; the GPUs used in its currently rendering frame (in SLI); the GPUs to be used in the next frame (in SLI).
+ CUDA devices corresponding to an OpenGL device: all GPUs used by the current OpenGL context; the GPUs used in its currently rendering frame; the GPUs to be used in the next frame.
+ Shared memory configurations: set default shared memory bank size; set shared memory bank width to four bytes; set shared memory bank width to eight bytes.
+ CUipcMem_flags: automatically enable peer access between remote devices as needed.
+ Resource view formats: no resource view format (use the underlying resource format); 1/2/4-channel unsigned and signed 8-bit, 16-bit and 32-bit integers; 1/2/4-channel 16-bit floating point; 1/2/4-channel 32-bit floating point.
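The pointer-information queries above all go through one entry point, cuPointerGetAttribute, with the attribute id selecting the out-type. A hedged C# sketch classifying an arbitrary pointer (assumes nvcuda and the CUpointer_attribute/CUmemorytype values from cuda.h):

using System;
using System.Runtime.InteropServices;

static class PointerInfo
{
    const int CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2; // CUpointer_attribute (assumed)

    [DllImport("nvcuda")]
    static extern int cuPointerGetAttribute(out uint data, int attribute, IntPtr ptr);

    public static string Classify(IntPtr ptr)
    {
        // Fails cleanly (rather than crashing) for pointers the driver does not know about.
        if (cuPointerGetAttribute(out var type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr) != 0)
            return "not a CUDA pointer";
        return type switch // CUmemorytype
        {
            1 => "host",
            2 => "device",
            3 => "array",
            4 => "unified",
            _ => "unknown",
        };
    }
}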
- removed: stale summaries for the old error-code tail (cooperative launch through unknown error), P2P attributes, texref/surfref flags, pointer information, D3D11/OpenGL device lists, shared memory configurations, IPC flags and resource view formats
+ Resource view formats (continued): block compressed 1, 2 and 3; block compressed 4 unsigned and signed; block compressed 5 unsigned and signed; block compressed 6 unsigned and signed half-float; block compressed 7.
+ Profiler Output Modes: key-value pair format; comma-separated values format.
+ CUDA Mem Attach Flags: memory can be accessed by any stream on any device; memory cannot be accessed by any stream on any device; memory can only be accessed by a single stream on the associated device.
+ Occupancy calculator flags: default behavior; assume global caching is enabled and cannot be automatically turned off.
+ cudaDataType: 16-, 32- and 64-bit real and complex floating point; 8-bit real and complex signed and unsigned integers; real and complex nv_bfloat16; 4-, 16-, 32- and 64-bit real and complex signed and unsigned integers.
+ Operations for ::cuStreamBatchMemOp: a ::cuStreamWaitValue32 operation; a ::cuStreamWriteValue32 operation; a ::cuStreamWaitValue64 operation; a ::cuStreamWriteValue64 operation; insert a memory barrier of the specified type; a standalone operation with the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH.
+ Memory range attributes: whether the range will mostly be read and only occasionally be written to; the preferred location of the range; whether the range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for a specified device; the last location to which the range was prefetched; the preferred location type and id of the range; the last location type and id to which the range was prefetched.
+ Shared memory carveout configurations: no preference for shared memory or L1 (default); prefer maximum available shared memory, minimum L1 cache; prefer maximum available L1 cache, minimum shared memory.
+ Graph node types: GPU kernel node; memcpy node; memset node; host (executable) node; node which executes an embedded graph; empty (no-op) node; external event wait node; external event record node; external semaphore signal node; external semaphore wait node; memory allocation node; memory free node; batch mem-op node;
+   conditional node: may be used to implement a conditional execution path or loop inside a graph; the graph(s) contained within the body of the conditional node can be selectively executed or iterated upon based on the value of a conditional variable. Handles must be created in advance of creating the node using ::cuGraphConditionalHandleCreate. Restrictions on graphs containing conditional nodes: the graph cannot be used in a child node; only one instantiation of the graph may exist at any point in time; the graph cannot be cloned. To set the control value, call cudaGraphSetConditional(CUgraphConditionalHandle handle, unsigned int value) from a kernel at an appropriate location in the graph, or supply a default value when creating the handle.
+ Possible stream capture statuses returned by ::cuStreamIsCapturing: stream is not capturing; stream is actively capturing; stream is part of a capture sequence that has been invalidated, but not terminated.
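The shared-memory carveout summarized above is applied per function (or per graph kernel node) as an attribute, in percent of the combined L1/shared resource. A short sketch (assumes nvcuda and the CUfunction_attribute/carveout constants from cuda.h; the hfunc handle would come from cuModuleGetFunction):

using System;
using System.Runtime.InteropServices;

static class Carveout
{
    const int CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9; // assumed per cuda.h
    const int CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100; // percent; 0 = max L1, -1 = default

    [DllImport("nvcuda")] static extern int cuFuncSetAttribute(IntPtr hfunc, int attrib, int value);

    // Hint the driver to favor shared memory over L1 for this kernel. It is only a hint:
    // the driver may pick a different split if required to execute the function.
    public static void PreferSharedMemory(IntPtr hfunc) =>
        cuFuncSetAttribute(hfunc, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT,
                           CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
}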
For more details see ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode - + - Block compressed 6 unsigned half-float + - + - Block compressed 6 signed half-float + - + - Block compressed 7 + - + - Profiler Output Modes + External memory handle types - + - Output mode Key-Value pair format. + Handle is an opaque file descriptor - + - Output mode Comma separated values format. + Handle is an opaque shared NT handle - + - CUDA Mem Attach Flags + Handle is an opaque, globally shared handle - + - Memory can be accessed by any stream on any device + Handle is a D3D12 heap object - + - Memory cannot be accessed by any stream on any device + Handle is a D3D12 committed resource - + - Memory can only be accessed by a single stream on the associated device + Handle is a shared NT handle to a D3D11 resource - + - Occupancy calculator flag + Handle is a globally shared handle to a D3D11 resource - + - Default behavior + Handle is an NvSciBuf object - + - Assume global caching is enabled and cannot be automatically turned off + External semaphore handle types - + - cudaDataType + Handle is an opaque file descriptor - + - 16 bit real + Handle is an opaque shared NT handle - + - 16 bit complex + Handle is an opaque, globally shared handle - + - 32 bit real + Handle is a shared NT handle referencing a D3D12 fence object - + - 32 bit complex + Handle is a shared NT handle referencing a D3D11 fence object - + - 64 bit real + Opaque handle to NvSciSync Object - + - 64 bit complex + Handle is a shared NT handle referencing a D3D11 keyed mutex object - + - 8 bit real as a signed integer + Handle is a globally shared handle referencing a D3D11 keyed mutex object - + - 8 bit complex as a pair of signed integers + Handle is an opaque file descriptor referencing a timeline semaphore - + - 8 bit real as a signed integer + Handle is an opaque shared NT handle referencing a timeline semaphore - + - 8 bit complex as a pair of signed integers + Specifies the type of location - - - - - - - - - - - - - + - Operations for ::cuStreamBatchMemOp + - + - Represents a ::cuStreamWaitValue32 operation + Location is a device location, thus id is a device ordinal - + - Represents a ::cuStreamWriteValue32 operation + Location is host, id is ignored - + - Represents a ::cuStreamWaitValue64 operation + Location is a host NUMA node, thus id is a host NUMA node id - + - Represents a ::cuStreamWriteValue64 operation + Location is a host NUMA node of the current thread, id is ignored - + - This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a standalone operation. + Defines the allocation types available - + - + - Whether the range will mostly be read and only occassionally be written to + This allocation type is 'pinned', i.e. 
cannot migrate from its current + location while the application is actively using it - + - The preferred location of the range + - + - Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device + The update succeeded - + - The last location to which the range was prefetched + The update failed for an unexpected reason which is described in the return value of the function - + - Shared memory carveout configurations + The update failed because the topology changed - + - no preference for shared memory or L1 (default) + The update failed because a node type changed - + - prefer maximum available shared memory, minimum L1 cache + The update failed because the function of a kernel node changed - + - prefer maximum available L1 cache, minimum shared memory + The update failed because the parameters changed in a way that is not supported - + - Graph node types + The update failed because something about the node is not supported - + - GPU kernel node + The update failed because the function of a kernel node changed in an unsupported way - + - Memcpy node + The update failed because the node attributes changed in a way that is not supported - + - Memset node + Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members - + - Host (executable) node + Normal cache persistence. - + - Node which executes an embedded graph + Streaming access is less likely to persist from cache. - + - Empty (no-op) node + Persisting access is more likely to persist in cache. - + - - - Possible stream capture statuses returned by ::cuStreamIsCapturing - + + - - - Stream is not capturing - + + + + + - + + + + - Stream is actively capturing + Graph kernel node Attributes - + - Stream is part of a capture sequence that has been invalidated, but not terminated + Identifier for ::CUkernelNodeAttrValue::accessPolicyWindow. - + - External memory handle types + Allows a kernel node to be cooperative (see ::cuLaunchCooperativeKernel). - + + + + + + + + + + + + + + + + + + + + + + - Handle is an opaque file descriptor + - + - Handle is an opaque shared NT handle + Identifier for ::CUstreamAttrValue::accessPolicyWindow. - + - Handle is an opaque, globally shared handle + ::CUsynchronizationPolicy for work queued up in this stream - + - Handle is a D3D12 heap object + Specifies compression attribute for an allocation. - + - Handle is a D3D12 committed resource + Allocating non-compressible memory - + - External semaphore handle types + Allocating compressible memory - + - Handle is an opaque file descriptor + - + - Handle is an opaque shared NT handle + - + - Handle is an opaque, globally shared handle + This flag, if set, indicates that the memory will be used as a tile pool. - + - Handle is a shared NT handle referencing a D3D12 fence object + This flag, if set, indicates that the memory will be used as a buffer for hardware accelerated decompression. - + - Flags to register a graphics resource + Access flags that specify the level of access the current context's device has + on the memory referenced. - + - Specifies no hints about how this resource will be used. - It is therefore assumed that this resource will be read - from and written to by CUDA. This is the default value. + No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations - + - Specifies that CUDA will not write to this resource.
+ Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. - + - Specifies that CUDA will not read from this resource and - will write over the entire contents of the resource, so - none of the data previously stored in the resource will - be preserved. + Read-write access, the device has full read-write access to the memory - + - Specifies that CUDA will bind this resource to a surface reference. + Sparse subresource types - + - + - Flags for mapping and unmapping graphics interop resources + - + - Specifies no hints about how this resource will be used. - It is therefore assumed that this resource will be read from and written to by CUDA. This is the default value. + Memory operation types - + - Specifies that CUDA will not write to this resource. + - + - Specifies that CUDA will not read from - this resource and will write over the entire contents of the resource, so none of the data previously stored in the - resource will be preserved. + - + - CUTexRefSetFlags + Memory handle types - + - + - Read the texture as integers rather than promoting the values to floats in the range [0,1]. - Flag for + - - - Use normalized texture coordinates in the range [0,1) instead of [0,dim). - Flag for - + + - - - Perform sRGB -> linear conversion during texture read. - + + - - - CUDA driver API initialization flags - + + - - - Currently no initialization flags are defined. - + + - - - CUDA driver API Context Enable Peer Access flags - + + - - - Currently no flags are defined. - + + - - - CUDA stream flags - + + - - - For compatibilty with pre Cuda 5.0, equal to Default - + + - + - Default stream flag + Flags for ::cuStreamUpdateCaptureDependencies - + - Stream does not synchronize with stream 0 (the NULL stream) + Add new nodes to the dependency set - + - CudaCooperativeLaunchMultiDeviceFlags + Replace the dependency set with the new nodes - + - No flags + Flags to specify search options. For more details see ::cuGetProcAddress - + - If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only - waits for prior work in the stream corresponding to that GPU to complete before the - kernel begins execution. + Default search mode for driver symbols. - + - If set, any subsequent work pushed in a stream that participated in a call to - ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on - the GPU corresponding to that stream to complete before it begins execution. + Search for legacy versions of driver symbols. - + - CUDAArray3DFlags + Search for per-thread versions of driver symbols. - + - No flags + Platform native ordering for GPUDirect RDMA writes - + - if set, the CUDA array contains an array of 2D slices and - the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the - number of slices, not the depth of a 3D array. + The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. - + - if set, the CUDA array contains an array of layers where each layer is either a 1D - or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number - of layers, not the depth of a 3D array. + Natively, the device can consistently consume remote writes, although other CUDA devices may not. - + - this flag must be set in order to bind a surface reference - to the CUDA array + Any CUDA device in the system can consistently consume remote writes to this device. 
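The enums added in this hunk (memory access flags, GPUDirect RDMA write ordering) surface as per-device capabilities. Below is a minimal sketch of the usual starting point for such capability checks through ManagedCuda's device-info wrapper; the CudaDeviceProperties property names are recalled from the library and should be treated as assumptions:

    using System;
    using ManagedCuda;

    class DeviceCapabilities
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            {
                // GetDeviceInfo() aggregates the driver's CU_DEVICE_ATTRIBUTE_* values
                // into a CudaDeviceProperties object.
                var props = ctx.GetDeviceInfo();
                Console.WriteLine($"{props.DeviceName}: {props.MultiProcessorCount} SMs");
            }
        }
    }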
- + - If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The - width of such a CUDA array must be equal to its height, and Depth must be six. - If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps - and Depth must be a multiple of six. + The scopes for ::cuFlushGPUDirectRDMAWrites - + - This flag must be set in order to perform texture gather operations on a CUDA array. + Blocks until remote writes are visible to the CUDA device context owning the data. - + - This flag if set indicates that the CUDA array is a DEPTH_TEXTURE. + Blocks until remote writes are visible to all CUDA device contexts. - + - This flag indicates that the CUDA array may be bound as a color target in an external graphics API + The targets for ::cuFlushGPUDirectRDMAWrites - + - CUMemHostAllocFlags. All of these flags are orthogonal to one another: a developer may allocate memory that is portable, mapped and/or - write-combined with no restrictions. + Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. - + - No flags + Execution Affinity Types - + - The memory returned by this call will be considered as pinned memory - by all CUDA contexts, not just the one that performed the allocation. + Create a context with limited SMs. - - - Maps the allocation into the CUDA address space. The device pointer - to the memory may be obtained by calling . This feature is available only on - GPUs with compute capability greater than or equal to 1.1. - + + - + + + + - Allocates the memory as write-combined (WC). WC memory - can be transferred across the PCI Express bus more quickly on some system configurations, but cannot be read - efficiently by most CPUs. WC memory is a good option for buffers that will be written by the CPU and read by - the GPU via mapped pinned memory or host->device transfers. - If set, host memory is allocated as write-combined - fast to write, - faster to DMA, slow to read except via SSE4 streaming load instruction - (MOVNTDQA). + (value type = cuuint64_t) + Amount of memory, in bytes, currently associated with graphs - + - Context creation flags. - The two LSBs of the flags parameter can be used to control how the OS thread, which owns the CUDA context at - the time of an API call, interacts with the OS scheduler when waiting for results from the GPU. + (value type = cuuint64_t) + High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero. - + - The default value if the flags parameter is zero, uses a heuristic based on the - number of active CUDA contexts in the process C and the number of logical processors in the system P. If C > - P, then CUDA will yield to other OS threads when waiting for the GPU, otherwise CUDA will not yield while - waiting for results and actively spin on the processor. + (value type = cuuint64_t) + Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - + - Instruct CUDA to actively spin when waiting for results from the GPU. This can decrease - latency when waiting for the GPU, but may lower the performance of CPU threads if they are performing - work in parallel with the CUDA thread. + (value type = cuuint64_t) + High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - + - Instruct CUDA to yield its thread when waiting for results from the GPU. 
This can - increase latency when waiting for the GPU, but can increase the performance of CPU threads performing work - in parallel with the GPU. + CUDA Lazy Loading status - + - Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the GPU to finish work. + Lazy Kernel Loading is not enabled - + - No description found... + Lazy Kernel Loading is enabled - + - Instruct CUDA to support mapped pinned allocations. This flag must be set in order to allocate pinned host memory that is accessible to the GPU. + Specifies the handle type for address range - + + + + + + + - Instruct CUDA to not reduce local memory after resizing local memory - for a kernel. This can prevent thrashing by local memory allocations when launching many kernels with high - local memory usage at the cost of potentially increased memory usage. + Tensor map data type - - - No description found... + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tensor map interleave layout type - + + + + + + + + + + - CUMemHostRegisterFlags. All of these flags are orthogonal to one another: a developer may allocate memory that is portable or mapped - with no restrictions. + Tensor map swizzling mode of shared memory banks - + + + + + + + + + + + + + + + + + + + + + + - No flags + Tensor map L2 promotion type - + + + + + + + + + + + + + - The memory returned by this call will be considered as pinned memory - by all CUDA contexts, not just the one that performed the allocation. + Tensor map out-of-bounds fill type - + + + + + + + - Maps the allocation into the CUDA address space. The device pointer - to the memory may be obtained by calling . This feature is available only on - GPUs with compute capability greater than or equal to 1.1. + Tensor map Im2Col wide mode - + + + + + + + - If set, the passed memory pointer is treated as pointing to some - memory-mapped I/O space, e.g. belonging to a third-party PCIe device. - On Windows the flag is a no-op. - On Linux that memory is marked as non cache-coherent for the GPU and - is expected to be physically contiguous. - On all other platforms, it is not supported and CUDA_ERROR_INVALID_VALUE - is returned. + Flags for ::cuStreamMemoryBarrier - + - Flag for cuStreamAddCallback() + System-wide memory barrier. - + - No flags + Limit memory barrier scope to the GPU. - + - Event creation flags + Graph instantiation results - + - Default event creation flag. + Instantiation succeeded - + - Specifies that event should use blocking synchronization. A CPU thread - that uses to wait on an event created with this flag will block until the event has actually - been recorded. + Instantiation failed for an unexpected reason which is described in the return value of the function - + - Event will not record timing data + Instantiation failed due to invalid structure, such as cycles - + - Event is suitable for interprocess use. CUEventFlags.DisableTiming must be set + Instantiation for device launch failed because the graph contained an unsupported operation - + - Flags for ::cuStreamWaitValue32 + Instantiation for device launch failed due to the nodes belonging to different contexts - + - Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit values). Note this is a cyclic comparison which ignores wraparound. (Default behavior.) + One or more conditional handles are not associated with conditional nodes - + - Wait until *addr == value. + Cluster scheduling policies. 
These may be passed to ::cuFuncSetAttribute or ::cuKernelSetAttribute - + - Wait until (*addr & value) != 0. + the default policy - + - Wait until ~(*addr | value) != 0. Support for this operation can be - queried with ::cuDeviceGetAttribute() and ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. - Generally, this requires compute capability 7.0 or greater. + spread the blocks within a cluster to the SMs - + - Follow the wait operation with a flush of outstanding remote writes. This - means that, if a remote write operation is guaranteed to have reached the - device before the wait can be satisfied, that write is guaranteed to be - visible to downstream device work. The device is permitted to reorder - remote writes internally. For example, this flag would be required if - two remote writes arrive in a defined order, the wait is satisfied by the - second write, and downstream work needs to observe the first write. + allow the hardware to load-balance the blocks in a cluster to the SMs - + - Flags for ::cuStreamWriteValue32 + - + - Default behavior + - + - Permits the write to be reordered with writes which were issued - before it, as a performance optimization. Normally, ::cuStreamWriteValue32 will provide a memory fence before the - write, which has similar semantics to __threadfence_system() but is scoped to the stream rather than a CUDA thread. + - + - Indicates that the external memory object is a dedicated resource + Launch attributes enum; used as id field of ::CUlaunchAttribute - + - No flags + Ignored entry, for convenient composition - + - Indicates that the external memory object is a dedicated resource + Valid for streams, graph nodes, launches. - + - CUDA stream callback + Valid for graph nodes, launches. - The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. - CUDA_SUCCESS or any persistent error on the stream. - User parameter provided at registration. - + - Block size to per-block dynamic shared memory mapping for a certain - kernel. - e.g.: - If no dynamic shared memory is used: x => 0 - If 4 bytes shared memory per thread is used: x = 4 * x + Valid for streams. - block size - The dynamic shared memory needed by a block - + - CUDA host function + Valid for graph nodes, launches. - Argument value passed to the function - + - An abstraction layer for the CUDA driver API + Valid for graph nodes, launches. - + - Specifies the directX version to use with a cuda context, if necessary + Valid for launches. Setting programmaticStreamSerializationAllowed to non-0 + signals that the kernel will use programmatic means to resolve its stream dependency, so that + the CUDA runtime should opportunistically allow the grid's execution to overlap with the previous + kernel in the stream, if that kernel requests the overlap. The dependent launches can choose to wait + on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). - + - DirectX9 + Valid for launches. Event recorded through this launch attribute is guaranteed to only trigger + after all blocks in the associated kernel trigger the event. A block can trigger the event through + PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger + can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. + The dependent launches can choose to wait on the dependency using the programmatic sync + (cudaGridDependencySynchronize() or equivalent PTX instructions).
Note that dependents (including the + CPU thread calling cuEventSynchronize()) are not guaranteed to observe the release precisely when it is + released. For example, cuEventSynchronize() may only observe the event trigger long after the associated + kernel has completed. This recording type is primarily meant for establishing programmatic dependency + between device tasks. The event supplied must not be an interprocess or interop event. The event must + disable timing (i.e. created with ::CU_EVENT_DISABLE_TIMING flag set). - + - DirectX10 + Valid for streams, graph nodes, launches. - + - DirectX11 + Valid for streams, graph nodes, launches. See + ::CUlaunchAttributeValue::memSyncDomainMap + + + + + Valid for streams, graph nodes, launches. See + ::CUlaunchAttributeValue::memSyncDomain. + + + + + Valid for graph nodes, launches. Set + ::CUlaunchAttributeValue::preferredClusterDim + to allow the kernel launch to specify a preferred substitute + cluster dimension. Blocks may be grouped according to either + the dimensions specified with this attribute (grouped into a + "preferred substitute cluster"), or the one specified with + ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION attribute (grouped + into a "regular cluster"). The cluster dimensions of a + "preferred substitute cluster" shall be an integer multiple + greater than zero of the regular cluster dimensions. The + device will attempt - on a best-effort basis - to group + thread blocks into preferred clusters over grouping them + into regular clusters. When it deems necessary (primarily + when the device temporarily runs out of physical resources + to launch the larger preferred clusters), the device may + switch to launch the regular clusters instead to attempt to + utilize as much of the physical device resources as possible. + + Each type of cluster will have its enumeration / coordinate + setup as if the grid consists solely of its type of cluster. + For example, if the preferred substitute cluster dimensions + double the regular cluster dimensions, there might be + simultaneously a regular cluster indexed at (1,0,0), and a + preferred cluster indexed at (1,0,0). In this example, the + preferred substitute cluster (1,0,0) replaces regular + clusters (2,0,0) and (3,0,0) and groups their blocks. + + This attribute will only take effect when a regular cluster + dimension has been specified. The preferred substitute + cluster dimension must be an integer multiple greater than + zero of the regular cluster dimension and must divide the + grid. It must also be no more than `maxBlocksPerCluster`, if + it is set in the kernel's `__launch_bounds__`. Otherwise it + must be less than the maximum value the driver can support. + Otherwise, setting this attribute to a value physically + unable to fit on any particular device is permitted. + + + + + Valid for launches. Set + ::CUlaunchAttributeValue::launchCompletionEvent to record the event. + + Nominally, the event is triggered once all blocks of the kernel + have begun execution. Currently this is a best effort. If a kernel + B has a launch completion dependency on a kernel A, B may wait + until A is complete. Alternatively, blocks of B may begin before + all blocks of A have begun, for example if B can claim execution + resources unavailable to A (e.g. they run on different GPUs) or + if B is a higher priority than A. + Exercise caution if such an ordering inversion could lead + to deadlock. 
+ + A launch completion event is nominally similar to a programmatic + event with \c triggerAtBlockStart set except that it is not + visible to \c cudaGridDependencySynchronize() and can be used with + compute capability less than 9.0. + + The event supplied must not be an interprocess or interop + event. The event must disable timing (i.e. must be created + with the ::CU_EVENT_DISABLE_TIMING flag set). + + + + + Valid for graph nodes, launches. This attribute is graphs-only, + and passing it to a launch in a non-capturing stream will result + in an error. + ::CUlaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can + only be set to 0 or 1. Setting the field to 1 indicates that the + corresponding kernel node should be device-updatable. On success, a handle + will be returned via + ::CUlaunchAttributeValue::deviceUpdatableKernelNode::devNode which can be + passed to the various device-side update functions to update the node's + kernel parameters from within another kernel. For more information on the + types of device updates that can be made, as well as the relevant limitations + thereof, see ::cudaGraphKernelNodeUpdatesApply. + + Nodes which are device-updatable have additional restrictions compared to + regular kernel nodes. Firstly, device-updatable nodes cannot be removed + from their graph via ::cuGraphDestroyNode. Additionally, once opted-in + to this functionality, a node cannot opt out, and any attempt to set the + deviceUpdatable attribute to 0 will result in an error. Device-updatable + kernel nodes also cannot have their attributes copied to/from another kernel + node via ::cuGraphKernelNodeCopyAttributes. Graphs containing one or more + device-updatable nodes also do not allow multiple instantiation, and neither + the graph nor its instantiated version can be passed to ::cuGraphExecUpdate. + + If a graph contains device-updatable nodes and updates those nodes from the device + from within the graph, the graph must be uploaded with ::cuGraphUpload before it + is launched. For such a graph, if host-side executable graph updates are made to the + device-updatable nodes, the graph must be uploaded before it is launched again. + + + + + Valid for launches. On devices where the L1 cache and shared memory use the + same hardware resources, setting ::CUlaunchAttributeValue::sharedMemCarveout to a + percentage between 0-100 signals the CUDA driver to set the shared memory carveout + preference, in percent of the total shared memory for that kernel launch. + This attribute takes precedence over ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT. + This is only a hint, and the CUDA driver can choose a different configuration if + required for the launch. + + + + + Library options to be specified with ::cuLibraryLoadData() or ::cuLibraryLoadFromFile() + + + + + + + + + + Specifies that the argument \p code passed to ::cuLibraryLoadData() will be preserved. + Specifying this option will let the driver know that \p code can be accessed at any point + until ::cuLibraryUnload(). The default behavior is for the driver to allocate and + maintain its own copy of \p code. Note that this is only a memory usage optimization + hint and the driver can choose to ignore it if required. + Specifying this option with ::cuLibraryLoadFromFile() is invalid and + will return ::CUDA_ERROR_INVALID_VALUE.
- + - Defines for GPU Architecture types (using the SM version to determine the # of cores per SM - + - 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version - + - - + - - + - - + - - + - - + - + + + + + + + - Create a new instace of managed Cuda. Creates a new cuda context. - Using device with ID 0 and + CUDA device NUMA configuration - + - Create a new instace of managed Cuda. - If createNew is true, a new cuda context will be created. - If createNew is false, the CudaContext is bound to an existing cuda context. Creates a new context if no context exists. - Using device with ID 0 and + The GPU is not a NUMA node - - + - Create a new instace of managed Cuda. Creates a new cuda context. - Using + The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID - DeviceID - + - Create a new instace of managed Cuda. - If createNew is true, a new cuda context will be created. - If createNew is false, the CudaContext bounds to an existing cuda context. Creates a new context if no context exists. + CUDA Process States - DeviceID - - + - Create a new instace of managed Cuda. Creates a new cuda context. + Default process state - DeviceID. - Context creation flags. - + - Create a new instace of a cuda context from the given CudaStream + CUDA API locks are taken so further CUDA API calls will block - The stream to query - + - Create a new instace of managed Cuda + Application memory contents have been checkpointed and underlying allocations and device handles have been released - DeviceID. - Context creation flags. - Create a new CUDA context or use an exiting context for the calling thread. Creates a new context if no context exists. - + - Create a new instance of managed CUDA for a given Direct3DX-device. - Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. + Application entered an uncorrectable error during the checkpoint/restore process - Direct3D device - Context creation flags - DirectX Version to bind this context to (9, 10, 11) - + - Create a new instance of managed CUDA for a given Direct3DX-device. - Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. - Use to obtain a list of possible values for cudaDevice. + CUGraphCondAssign - CUdevice to map this context to. Use to obtain a list of possible values - Direct3D device. - Context creation flags - DirectX (9, 10, 11) Version to bind this context to - + - As the normal context constructor has the same arguments, the OpenGL-constructor is private with inverse arguement order. - It has to be called from a static method. - Create a new instance of managed CUDA for a OpenGL-device. - OpenGL resources from this device may be registered and mapped through the lifetime of this CUDA context. + - CUdevice to map this context to. - Context creation flags - + - Create a new instace of managed Cuda, performing no CUDA API calls. Needed for inheritance. + Default value is applied when graph is launched. - Additional constructor parameter to differentiate direct constructor call or inherited call, i.e. called by primaryContext class. - DeviceID. - + - For dispose + Conditional node types - + - Dispose + Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If \p size == 2, an optional ELSE graph is created and this is executed if the condition is zero. - + - For IDisposable.
- Note: If this instance created the wrapped CUcontext, it will be destroyed and can't be accessed by other threads anymore. - If this instance only was bound to an existing CUcontext, the wrapped CUcontext won't be destroyed. + Conditional 'while' Node. Body executed repeatedly while condition value is non-zero. - - + - Make sure the kernel image arrays are zero terminated by appending a zero + Conditional 'switch' Node. Body[n] is executed once, where 'n' is the value of the condition. If the condition does not match a body index, no body is launched. - + - Gets the context's API version + Type annotations that can be applied to graph edges as part of ::CUgraphEdgeData. - Version - + - Blocks until the device has completed all preceding requested tasks. Throws a if one of the - preceding tasks failed. If the context was created with the flag, the CPU thread will - block until the GPU context has finished its work. + This is an ordinary dependency. - + - Push the CUDA context + This dependency type allows the downstream node to + use cudaGridDependencySynchronize(). It may only be used + between kernel nodes, and must be used with either the + ::CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC or + ::CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER outgoing port. - + - Pop the CUDA context + Type annotations that can be applied to graph edges as part of ::CUgraphEdgeData. - + - Binds this CUDA context to the calling CPU thread + This port activates when the kernel has finished executing. - + - Sets the shared memory configuration for the current context. - On devices with configurable shared memory banks, this function will set - the context's shared memory bank size which is used for subsequent kernel - launches. - Changed the shared memory configuration between launches may insert a device - side synchronization point between those launches. - Changing the shared memory bank size will not increase shared memory usage - or affect occupancy of kernels, but may have major effects on performance. - Larger bank sizes will allow for greater potential bandwidth to shared memory, - but will change what kinds of accesses to shared memory will result in bank - conflicts. - This function will do nothing on devices with fixed shared memory bank size. - - The supported bank configurations are: - - : set bank width to the default initial - setting (currently, four bytes). - - : set shared memory bank width to - be natively four bytes. - - : set shared memory bank width to - be natively eight bytes. + This port activates when all blocks of the kernel have performed cudaTriggerProgrammaticLaunchCompletion() + or have terminated. It must be used with edge type ::CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC. See also + ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT. - + - Returns the current shared memory configuration for the current context. + This port activates when all blocks of the kernel have begun execution. See also + ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT. 
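The entries removed above document the CudaContext lifecycle: constructors taking a device ID and creation flags, Synchronize, and Dispose. A minimal usage sketch against ManagedCuda, assuming the (deviceID, flags) constructor overload and the CUCtxFlags enum described in those entries:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class ContextLifecycle
    {
        static void Main()
        {
            // BlockingSync asks the driver to block the CPU thread on a
            // synchronization primitive instead of spinning while waiting.
            using (var ctx = new CudaContext(0, CUCtxFlags.BlockingSync))
            {
                // ... queue kernels and copies here ...

                // Blocks until the device has completed all preceding work,
                // throwing if any of the preceding tasks failed.
                ctx.Synchronize();
            } // Dispose destroys the CUcontext because this instance created it.
        }
    }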
- + - Load a CUBIN-module from file + Types of async notification that can be sent - - - + - Load a PTX module from file + - - - - - + - Load a PTX module from file + - - Collection of linker and compiler options - - + - Load a PTX module from file + - - - + - Load a ptx module from image as byte[] + - - Collection of linker and compiler options - - + - Load a ptx module from image as byte[] + Type of resource - - - - - + - Load a ptx module from image as stream + - - - - - + - Load a ptx module from image as stream + Streaming multiprocessors related information - - Collection of linker and compiler options - - + - Load a ptx module from image as byte[] + - - - + - Load a ptx module from image as stream + D3D12 Command Queue Handle - - - + - Load a CUBIN-module from file and return directly a wrapped CudaKernel + Flags to specify for copies within a batch. For more details see ::cuMemcpyBatchAsync. - Path and name of the module file - The kernel name as defined in the *.cu file - - + - Load a PTX module from file and return directly a wrapped CudaKernel + Default - Path and name of the ptx-module file - The kernel name as defined in the *.cu file - JIT-compile options. Only if module image is a ptx module - JIT-compile options values. Only if module image is a ptx module - - + - Load a PTX module from file and return directly a wrapped CudaKernel + Hint to the driver to try and overlap the copy with compute work on the SMs. - Path and name of the ptx-module file - The kernel name as defined in the *.cu file - Collection of linker and compiler options. Only if module image is a ptx module - - + - Load a PTX module from file and return directly a wrapped CudaKernel + These flags allow applications to convey the source access ordering CUDA must maintain. + The destination will always be accessed in stream order. - Path and name of the ptx-module file - The kernel name as defined in the *.cu file - - + - Load a ptx module from image as byte[] and return directly a wrapped CudaKernel + Default invalid. - Module image (cubin or PTX) as byte[] - The kernel name as defined in the *.cu file - JIT-compile options. Only if module image is a ptx module - JIT-compilt options values. Only if module image is a ptx module - - + - Load a ptx module from image as byte[] and return directly a wrapped CudaKernel + Indicates that access to the source pointer must be in stream order. - Module image (cubin or PTX) as byte[] - The kernel name as defined in the *.cu file - Collection of linker and compiler options. Only if module image is a ptx module - - + - Load a ptx module from image as stream and return directly a wrapped CudaKernel + Indicates that access to the source pointer can be out of stream order and + all accesses must be complete before the API call returns. This flag is suited for + ephemeral sources (ex., stack variables) when it's known that no prior operations + in the stream can be accessing the memory and also that the lifetime of the memory + is limited to the scope that the source variable was declared in. Specifying + this flag allows the driver to optimize the copy and removes the need for the user + to synchronize the stream after the API call. - Module image (cubin or PTX) as stream - The kernel name as defined in the *.cu file - JIT-compile options. Only if module image is a ptx module - JIT-compilt options values. 
Only if module image is a ptx module - - + - Load a ptx module from image as stream and return directly a wrapped CudaKernel + Indicates that access to the source pointer can be out of stream order and the accesses + can happen even after the API call returns. This flag is suited for host pointers + allocated outside CUDA (ex., via malloc) when it's known that no prior operations + in the stream can be accessing the memory. Specifying this flag allows the driver + to optimize the copy on certain platforms. - Module image (cubin or PTX) as stream - The kernel name as defined in the *.cu file - Collection of linker and compiler options. Only if module image is a ptx module - - + - Load a ptx module from image as byte[] and return directly a wrapped CudaKernel + These flags allow applications to convey the operand type for individual copies specified in ::cuMemcpy3DBatchAsync. - Module image (cubin or PTX) as byte[] - The kernel name as defined in the *.cu file - - + - Load a ptx module from image as stream and return directly a wrapped CudaKernel + Memcpy operand is a valid pointer. - Module image (cubin or PTX) as stream - The kernel name as defined in the *.cu file - - + - Load a FatBinary module from image as byte[] + Memcpy operand is a CUarray. - - - + - Load a FatBinary module from image as stream + Bitmasks for CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK. - - - + - Load a FatBinary module from image as byte[] and return directly a wrapped CudaKernel + Decompression is unsupported. - Module image (fat binary) as byte[] - The kernel name as defined in the *.cu file - - + - Load a FatBinary module from image as stream and return directly a wrapped CudaKernel + Deflate is supported. - Module image (fat binary) as stream - The kernel name as defined in the *.cu file - - + - unload module + Snappy is supported. - - + - unload kernel + CUDA array - - + - Allocate memory on the device + - - - + - SetMemory (cuMemsetD8) + Returns the memory requirements of a CUDA array - - - - + - SetMemory (cuMemsetD16) + CUDA linker - - - - + - SetMemory (cuMemsetD32) + - - - - + - SetMemory (cuMemset2DD8) + CUDA mipmapped array - - - - - - + - SetMemory (cuMemset2DD16) + - - - - - - + - SetMemory (cuMemset2DD32) + Returns the memory requirements of a CUDA array - - - - - - + - SetMemory (cuMemsetD8) + Cuda context - - - - - + - SetMemory (cuMemsetD16) + - - - - - + - SetMemory (cuMemsetD32) + Get context resources + Get the \p type resources available to the context represented by \p hCtx + Note: The API is not supported on 32-bit platforms. - - - - - + - SetMemory (cuMemset2DD8) + Cuda device - - - - - - - + - SetMemory (cuMemset2DD16) + - - - - - - - + - SetMemory (cuMemset2DD32) + - - - - - - - + - Free device memory + Device that represents the CPU - - + - Returns the total device memory in bytes + Device that represents an invalid device - - + - Returns the free available device memory in bytes + Sets the current memory pool of a device + The memory pool must be local to the specified device. + ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. + By default, a device's current memory pool is its default memory pool. + + note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different than the one the stream runs on. - - + - Queries if a device may directly access a peer device's memory + Gets the current memory pool of the CUdevice. 
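The module-loading entries deleted above (load a PTX module from file, byte[], or stream and get a wrapped CudaKernel back) are easiest to read next to a usage sketch. The file and kernel names below are placeholders, and LoadKernelPTX is the overload name as I recall it in ManagedCuda; treat both as assumptions:

    using ManagedCuda;

    class KernelLoad
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            {
                // Load a PTX module from file and wrap one of its kernels.
                CudaKernel kernel = ctx.LoadKernelPTX("vectorAdd.ptx", "VecAdd");

                int n = 1024;
                kernel.BlockDimensions = 256;
                kernel.GridDimensions = (n + 255) / 256;

                var a = new CudaDeviceVariable<float>(n);
                var b = new CudaDeviceVariable<float>(n);
                var c = new CudaDeviceVariable<float>(n);

                // Argument order must match the kernel's signature in the .cu file.
                kernel.Run(a.DevicePointer, b.DevicePointer, c.DevicePointer, n);
            }
        }
    }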
- - + - On devices where the L1 cache and shared memory use the same hardware - resources, this returns the preferred cache configuration - for the current context. This is only a preference. The driver will use - the requested configuration if possible, but it is free to choose a different - configuration if required to execute functions. - This will return on devices - where the size of the L1 cache and shared memory are fixed. + Gets the default memory pool of the CUdevice. - - + - On devices where the L1 cache and shared memory use the same hardware - resources, this sets through cacheConfig the preferred cache configuration for - the current context. This is only a preference. The driver will use - the requested configuration if possible, but it is free to choose a different - configuration if required to execute the function. Any function preference - set via will be preferred over this context-wide - setting. Setting the context-wide cache configuration to - will cause subsequent kernel launches to prefer - to not change the cache configuration unless required to launch the kernel. - This setting does nothing on devices where the size of the L1 cache and - shared memory are fixed. - Launching a kernel with a different preference than the most recent - preference setting may insert a device-side synchronization point. + Returns a UUID for the device (11.4+) + Returns 16-octets identifying the device \p dev in the structure + pointed by the \p uuid. If the device is in MIG mode, returns its + MIG UUID which uniquely identifies the subscribed MIG compute instance. + Returns 16-octets identifying the device \p dev in the structure pointed by the \p uuid. - + - Copy data from host to device memory + Returns information about the execution affinity support of the device. + Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. + The supported types are: + - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, + or 0 if not; - Destination CUdeviceptr (Pointer to device memory) - Source array - Number of bytes to copy - + - Copy data from host to device memory + Free unused memory that was cached on the specified device for use with graphs back to the OS. + Blocks which are not in use by a graph that is either currently executing or scheduled to execute are freed back to the operating system. - T must be of value type, i.e. a struct - Destination CUdeviceptr (Pointer to device memory) - Source pointer to host memory - + - Copy data from host to device memory + Set asynchronous allocation attributes related to graphs + Valid attributes are: + - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero. + - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - T must be of value type, i.e. a struct - Destination CUdeviceptr (Pointer to device memory) - Source pointer to host memory - + - Copy data from host to device memory + Query asynchronous allocation attributes related to graphs + Valid attributes are: + - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs + - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero.
+ - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. + - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator. - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + + - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + + - + - Copy data from host to device memory + Returns true if both objects are of type CUdevice and if both Pointer member are equal. - Destination CUdeviceptr (Pointer to device memory) - Source array + + - + - Copy data from host to device memory + Overrides object.GetHashCode() - Destination CUdeviceptr (Pointer to device memory) - Source array + - + - Copy data from host to device memory + override ToString() - Destination CUdeviceptr (Pointer to device memory) - Source array + - + - Copy data from host to device memory + Get device resources + Get the \p type resources available to the \p device. + This may often be the starting point for further partitioning or configuring of resources. + Note: The API is not supported on 32-bit platforms. - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Pointer to CUDA device memory - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + + - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + + - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + + - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + + + - + - Copy data from host to device memory + Returns true if both objects are of type CUdeviceptr and if both Pointer member is equal. - Destination CUdeviceptr (Pointer to device memory) - Source array + + - + - Copy data from host to device memory + Overrides object.GetHashCode() - Destination CUdeviceptr (Pointer to device memory) - Source array + - + - Copy data from host to device memory + override ToString() - Destination CUdeviceptr (Pointer to device memory) - Source array + - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array + - + - Copy data from host to device memory + The on which a pointer was allocated or registered - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + The describing the physical location of a pointer - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. 
- Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + The address at which a pointer's memory may be accessed on the host - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + A pair of tokens for use with the nv-p2p.h Linux kernel interface - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Synchronize every synchronous memory operation initiated on this region - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + A process-wide unique ID for an allocated memory region - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Indicates if the pointer points to managed memory - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + A device ordinal of a device on which a pointer was allocated or registered - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Starting address for this requested pointer - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Size of the address range for this requested pointer - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Bitmask of allowed ::CUmemAllocationHandleType for this allocation - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Size of the actual underlying mapping that the pointer belongs to - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + The start address of the mapping that the pointer belongs to - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + A process-wide unique id corresponding to the physical allocation the pointer belongs to - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns a boolean that indicates whether the pointer points to memory that is capable of being used for hardware accelerated decompression.
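The copy entries being removed in this region all funnel into the same raw-pointer pattern that the surrounding doc text describes: allocate with AllocateMemory, move data with CopyToDevice/CopyToHost, and release with FreeMemory. A round-trip sketch using those documented CudaContext members:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class CopyRoundTrip
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            {
                var host = new float[256];
                for (int i = 0; i < host.Length; i++) host[i] = i;

                // Raw device allocation; the size is given in bytes.
                CUdeviceptr devPtr = ctx.AllocateMemory(host.Length * sizeof(float));

                ctx.CopyToDevice(devPtr, host);   // host -> device
                var back = new float[host.Length];
                ctx.CopyToHost(back, devPtr);     // device -> host

                ctx.FreeMemory(devPtr);
            }
        }
    }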
- Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Cuda event - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Cuda function / kernel - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns a module handle + Returns in \p *hmod the handle of the module that function \p hfunc + is located in. The lifetime of the module corresponds to the lifetime of + the context it was loaded in or until the module is explicitly unloaded. + The CUDA runtime manages its own modules loaded into the primary context. + If the handle returned by this API refers to a module loaded by the CUDA runtime, + calling ::cuModuleUnload() on that module will result in undefined behavior. - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns the function name for a ::CUfunction handle + Returns in \p **name the function name associated with the function handle \p hfunc. + The function name is returned as a null-terminated string. The returned name is only + valid when the function handle is valid. If the module is unloaded or reloaded, one + must call the API again to get the updated name. This API may return a mangled name if + the function is not declared as having C linkage. If either \p **name or \p hfunc + is NULL, ::CUDA_ERROR_INVALID_VALUE is returned. - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns if the function is loaded - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Loads a function + Finalizes function loading for \p function. Calling this API with a fully loaded function has no effect. - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Cuda module - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Query lazy loading mode + Returns lazy loading mode. Module loading mode is controlled by CUDA_MODULE_LOADING env variable - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns the number of functions within the module - Destination CUdeviceptr (Pointer to device memory) - Source array - + - Copy data from host to device memory + Returns the function handles within a module. + Returns in \p functions a maximum number of \p numFunctions function handles within \p mod. When + function loading mode is set to LAZY the function retrieved may be partially loaded. The loading + state of a function can be queried using ::cuFunctionIsLoaded. CUDA APIs may load the function + automatically when called with partially loaded function handle which may incur additional + latency. Alternatively, ::cuFunctionLoad can be used to explicitly load a function. The returned + function handles become invalid when the module is unloaded.
- Destination CUdeviceptr (Pointer to device memory) - Source array + Buffer where the function handles are returned to + Maximum number of function handles may be returned to the buffer - + - Copy data from host to device memory + Returns all the function handles within a module. + When + function loading mode is set to LAZY the function retrieved may be partially loaded. The loading + state of a function can be queried using ::cuFunctionIsLoaded. CUDA APIs may load the function + automatically when called with partially loaded function handle which may incur additional + latency.Alternatively, ::cuFunctionLoad can be used to explicitly load a function. The returned + function handles become invalid when the module is unloaded. - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Cuda stream - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Returns the CUDA NULL stream (0) - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Stream handle that can be passed as a CUstream to use an implicit stream + with legacy synchronization behavior. - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Stream handle that can be passed as a CUstream to use an implicit stream + with per-thread synchronization behavior. - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Returns the unique Id associated with the stream handle - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Query the green context associated with a stream + + Returns the CUDA green context that the stream is associated with, or NULL if the stream + is not associated with any green context. + + The stream handle \p hStream can refer to any of the following: + + - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate. + If during stream creation the context that was active in the calling thread was obtained + with cuCtxFromGreenCtx, that green context is returned in \p phCtx. + Otherwise, \p *phCtx is set to NULL instead. + + - special stream such as the NULL stream or ::CU_STREAM_LEGACY. + In that case if context that is active in the calling thread was obtained + with cuCtxFromGreenCtx, that green context is returned. + Otherwise, \p *phCtx is set to NULL instead. + + Passing an invalid handle will result in undefined behavior. 
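For the stream entries added above (the NULL stream versus explicitly created streams with legacy or per-thread synchronization behavior), here is a short sketch of launching on a non-default stream. RunAsync and the Stream property are the ManagedCuda member names as I recall them, and the kernel names are the placeholders from the earlier sketch:

    using ManagedCuda;

    class StreamLaunch
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            using (var stream = new CudaStream())
            {
                CudaKernel kernel = ctx.LoadKernelPTX("vectorAdd.ptx", "VecAdd");
                kernel.GridDimensions = 4;
                kernel.BlockDimensions = 256;

                var data = new CudaDeviceVariable<float>(1024);

                // Enqueue on an explicit stream rather than the NULL stream,
                // so the launch can overlap with work queued on other streams.
                kernel.RunAsync(stream.Stream, data.DevicePointer, 1024);

                // Block the host until this stream's queued work has finished.
                stream.Synchronize();
            }
        }
    }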
- Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA texture reference - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA surface reference - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA graphics interop resource (DirectX / OpenGL) - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA texture object - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA surface object - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA definition of UUID - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Interprocess Handle for Events - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Fabric handle - An opaque handle representing a memory allocation + that can be exported to processes in different nodes connected + to the exporting node via the NVSwitch fabric. 
- Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Interprocess Handle for Memory - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + half precision floating point - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + two half precision floating point (x,y) - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + bfloat16 floating point - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + two bfloat16 floating point (x,y) - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA external memory - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA external semaphore - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA graph - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + CUDA graph node - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Returns the type of the Node - Destination CUdeviceptr (Pointer to device memory) - Source value - + - Copy data from host to device memory + Sets the parameters of host node nodeParams. - Destination CUdeviceptr (Pointer to device memory) - Source value + - + - Copy data from host to device memory + Sets the parameters of kernel node nodeParams. - Destination CUdeviceptr (Pointer to device memory) - Source value + - + - Copy data from host to device memory + Sets the parameters of memcpy node nodeParams.
-            Copy data from device to host memory
-            T must be of value type, i.e. a struct
-            Destination data in host memory
-            Destination array in host memory
-            Destination value in host memory
-            Destination pointer to host memory
-            Number of bytes to copy
-            Source CUdeviceptr (Pointer to device memory)
+            Sets the parameters of host node nodeParams.
+            Sets the parameters of kernel node nodeParams.
+            Sets the parameters of memcpy node nodeParams.
+            Sets the parameters of memset node nodeParams.
+            Sets an external semaphore signal node's parameters.
+            Sets an external semaphore wait node's parameters.
+            Sets a batch mem op node's parameters.
+            Updates a graph node's parameters.
+            Sets the parameters of graph node \p hNode to \p nodeParams. The node type specified by
+            \p nodeParams->type must match the type of \p hNode. \p nodeParams must be fully
+            initialized and all unused bytes (reserved, padding) zeroed.
+            Modifying parameters is not supported for node types CU_GRAPH_NODE_TYPE_MEM_ALLOC and
+            CU_GRAPH_NODE_TYPE_MEM_FREE.
+            Gets the parameters of host node.
+            Gets the parameters of kernel node.
+            Gets the parameters of memcpy node.
+            Gets the parameters of memset node.
+            Gets the external semaphore signal node's parameters.
+            Gets the external semaphore wait node's parameters.
+            Returns a memory alloc node's parameters.
+            Returns a memory free node's parameters.
+            Returns a batch mem op node's parameters.
+            Only for ChildGraphNodes
+            Returns a node's dependencies.
+            Returns a node's dependent nodes.
+            Copies attributes from source node to destination node.
+            Copies attributes from source node \p src to destination node \p dst. Both nodes must have the same context.
+            Destination node
+            Queries node attribute.
+            Queries attribute \p attr from node \p hNode and stores it in the corresponding member of \p value_out.
+            Sets node attribute.
+            Sets attribute \p attr on node \p hNode from the corresponding attribute of \p value.
+            Returns the event associated with an event record node.
+            Sets an event record node's event.
+            Returns the event associated with an event wait node.
+            Sets an event wait node's event.
+            CUDA executable graph
+            CUDA memory pool
+            CUDA user object for graphs
+            CUDA graph conditional handle
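To make the node-type query concrete, a minimal hand-rolled binding might look like the following. The nvcuda export cuGraphNodeGetType is part of the driver API, but the C# declaration and the enum subset shown here are assumptions, and a wrapper library's own binding should be preferred:

    using System;
    using System.Runtime.InteropServices;

    // Subset of CU_GRAPH_NODE_TYPE_* values, assumed from the driver API headers.
    public enum GraphNodeType { Kernel = 0, Memcpy = 1, Memset = 2, Host = 3, Graph = 4, Empty = 5 }

    public static class DriverApi
    {
        // Assumed binding: takes a graph-node handle, returns its type; 0 == CUDA_SUCCESS.
        [DllImport("nvcuda", EntryPoint = "cuGraphNodeGetType")]
        public static extern int GraphNodeGetType(IntPtr hNode, out GraphNodeType type);
    }

    // Usage sketch: dispatch on the node type before touching type-specific parameters.
    // if (DriverApi.GraphNodeGetType(node, out var t) == 0 && t == GraphNodeType.Kernel) { ... }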
+            CUlibrary
+            Returns the number of kernels within the library.
+            Retrieve the kernel handles within a library.
+            Returns in \p kernels a maximum number of \p numKernels kernel handles within \p lib.
+            The returned kernel handles become invalid when the library is unloaded.
+            Buffer where the kernel handles are returned
+            Maximum number of kernel handles that may be returned to the buffer
+            Retrieve all the kernel handles within a library.
+            The returned kernel handles become invalid when the library is unloaded.
+            CUkernel
+            Allows explicit casting from CUkernel to CUfunction to call context-less kernels.
+            Get the corresponding CUfunction handle using cuKernelGetFunction.
+            The number of threads beyond which a launch of the function would fail.
+            This number depends on both the function and the device on which the
+            function is currently loaded.
+            The size in bytes of statically-allocated shared memory required by
+            this function. This does not include dynamically-allocated shared
+            memory requested by the user at runtime.
+            The size in bytes of thread local memory used by this function.
+            The number of registers used by each thread of this function.
+            The PTX virtual architecture version for which the function was
+            compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3
+            function would return the value 13. Note that this may return the undefined value of 0 for cubins
+            compiled prior to CUDA 3.0.
+            The binary version for which the function was compiled. This
+            value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function
+            would return the value 13.
+            Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded
+            binary architecture version.
+            The attribute to indicate whether the function has been compiled with the user-specified option
+            "-Xptxas --dlcm=ca" set.
+            The maximum size in bytes of dynamically-allocated shared memory. The value should contain the
+            requested maximum size of dynamically-allocated shared memory. The sum of this value and the
+            function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the device attribute
+            ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. The maximal size of requestable dynamic
+            shared memory may differ by GPU architecture.
+            On devices where the L1 cache and shared memory use the same hardware resources, this sets the
+            shared memory carveout preference, in percent of the total resources. This is only a hint, and the
+            driver can choose a different ratio if required to execute the function.
+            If this attribute is set, the kernel must launch with a valid cluster size specified.
+            See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+            The required cluster width in blocks. The values must either all be 0 or all be positive.
+            The validity of the cluster dimensions is otherwise checked at launch time.
+            If the value is set during compile time, it cannot be set at runtime.
+            Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+            The required cluster height in blocks. The values must either all be 0 or all be positive.
+            The validity of the cluster dimensions is otherwise checked at launch time.
+            If the value is set during compile time, it cannot be set at runtime.
+            Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+            The required cluster depth in blocks. The values must either all be 0 or all be positive.
+            The validity of the cluster dimensions is otherwise checked at launch time.
+            If the value is set during compile time, it cannot be set at runtime.
+            Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+            Whether the function can be launched with a non-portable cluster size. 1 is
+            allowed, 0 is disallowed. A non-portable cluster size may only function
+            on the specific SKUs the program is tested on. The launch might fail if
+            the program is run on a different hardware platform.
+            The CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking
+            whether the desired size can be launched on the current device.
+            Portable Cluster Size:
+            A portable cluster size is guaranteed to be functional on all compute
+            capabilities higher than the target compute capability. The portable
+            cluster size for sm_90 is 8 blocks per cluster. This value may increase
+            for future compute capabilities.
+            The specific hardware unit may support higher cluster sizes that are not
+            guaranteed to be portable.
+            See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+            The block scheduling policy of a function. The value type is
+            CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
+            See ::cuFuncSetAttribute, ::cuKernelSetAttribute
+            Sets the preferred cache configuration for a device kernel.
+            On devices where the L1 cache and shared memory use the same hardware
+            resources, this sets through \p config the preferred cache configuration for
+            the device kernel \p kernel on the requested device \p dev. This is only a preference.
+            The driver will use the requested configuration if possible, but it is free to choose a different
+            configuration if required to execute \p kernel. Any context-wide preference
+            set via ::cuCtxSetCacheConfig() will be overridden by this per-kernel setting.
+            Note that attributes set using ::cuFuncSetCacheConfig() will override the attribute
+            set by this API irrespective of whether the call to ::cuFuncSetCacheConfig() is made
+            before or after this API call.
+            This setting does nothing on devices where the sizes of the L1 cache and
+            shared memory are fixed.
+            Launching a kernel with a different preference than the most recent
+            preference setting may insert a device-side synchronization point.
+            The supported cache configurations are:
+            - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+            - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+            - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+            - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+            \note The API has stricter locking requirements in comparison to its legacy counterpart
+            ::cuFuncSetCacheConfig() due to device-wide semantics. If multiple threads are trying to
+            set a config on the same device simultaneously, the cache config setting will depend
+            on the interleavings chosen by the OS scheduler and memory consistency.
+            Requested cache configuration
+            Device to set the attribute on
+            Returns the function name for a ::CUkernel handle.
+            Returns in \p **name the function name associated with the kernel handle \p hfunc.
+            The function name is returned as a null-terminated string. The returned name is only
+            valid when the kernel handle is valid. If the library is unloaded or reloaded, one
+            must call the API again to get the updated name. This API may return a mangled name if
+            the function is not declared as having C linkage. If either \p **name or \p hfunc
+            is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
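Assuming managedCuda-style wrappers (CudaContext.SetCacheConfig and CudaKernel.MaxThreadsPerBlock; the module and kernel names are illustrative), the attribute and cache-configuration documentation above translates to roughly:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    // Sketch: query a function-level limit, then state a cache preference
    // mirroring the CU_FUNC_CACHE_* options listed above.
    var ctx = new CudaContext(0);
    CudaKernel kernel = ctx.LoadKernel("kernels.ptx", "myKernel");   // illustrative names

    int maxThreads = kernel.MaxThreadsPerBlock;    // CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
    ctx.SetCacheConfig(CUFuncCache.PreferShared);  // trade L1 for a larger shared-memory carveout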
+            Returns a library handle.
+            Returns in \p pLib the handle of the library for the requested kernel \p kernel.
+            CUDA graph device node handle
+            CUDA async notification callback handle
+            A green context handle. This handle can be used safely from only one CPU thread at a time.
+            An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources.
+            Generate a resource descriptor.
+            Generates a resource descriptor with the set of resources specified in \p resources.
+            The generated resource descriptor is necessary for the creation of green contexts via the
+            ::cuGreenCtxCreate API. The API expects \p nbResources == 1, as there is only one type of resource
+            and merging the same types of resource is currently not supported.
+            Note: The API is not supported on 32-bit platforms.
+            Legacy device properties
+            Maximum number of threads per block
+            Maximum size of each dimension of a block
+            Maximum size of each dimension of a grid
+            Shared memory available per block in bytes
+            Constant memory available on device in bytes
+            Warp size in threads. Also called SIMD width.
+            Maximum pitch in bytes allowed by the memory copy functions that involve memory regions
+            allocated through cuMemAllocPitch().
+            32-bit registers available per block
+            Clock frequency in kilohertz
+            Alignment requirement for textures. Texture base addresses that are aligned to textureAlign
+            bytes do not need an offset applied to texture fetches.
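A short sketch of querying these per-device properties, assuming the static CudaContext helpers referenced in the old documentation (GetDeviceCount, GetDeviceInfo) are available:

    using System;
    using ManagedCuda;

    // Sketch: the legacy CUdevprop fields listed above surface through the
    // per-device properties query.
    int deviceCount = CudaContext.GetDeviceCount();
    for (int id = 0; id < deviceCount; id++)
    {
        var props = CudaContext.GetDeviceInfo(id);   // wraps the per-device attribute queries
        Console.WriteLine($"{id}: {props.DeviceName}, warp size {props.WarpSize}");
    }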
-            Returns the device name of the device bound to the actual context
-            Returns the device's compute capability of the device bound to the actual context
-            Retrieve device properties
-            Returns numerical values that correspond to the least and greatest stream priorities.
-            Returns the current size of limit.
-            Setting limit to value is a request by the application to update the current limit maintained
-            by the context.
-            Create a new instance of managed CUDA for an OpenGL-device. OpenGL resources from this device
-            may be registered and mapped through the lifetime of this CUDA context.
-            Gets the CUDA devices associated with the current OpenGL context
-            Returns a list of possible CUDA devices to use for a given DirectX device
-            Returns the Direct3D device against which the CUDA context, bound to the calling thread, was created.
-            Returns the device name of the device with ID deviceID
-            GPU Architecture definitions
-            Returns the best GPU (with maximum GFLOPS)
-            Returns the device's compute capability of the device with ID deviceID
-            Returns the version number of the installed cuda driver
-            Get the number of CUDA capable devices
-            If both the current context (current to the calling thread) and peerContext are on devices which
-            support unified addressing, then on success all allocations from peerContext will immediately be
-            accessible by the current context. Note that access granted by this call is unidirectional.
-            Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
-            Fills the CudaDeviceProperties structure
-            Gets the CUdevice for a given device ordinal number
-            Initialize the profiling. Using this API the user can initialize the CUDA profiler by specifying
-            the configuration file, output file and output file format.
-            Enable profiling.
-            Disables profile collection by the active profiling tool for the current context.
-            Gets the Cuda context bound to this managed Cuda object
-            Gets the Cuda device allocated to the Cuda Context
-            Gets the Id of the Cuda device.
-            Indicates if the CudaContext instance created the wrapped cuda context or was bound to an existing one.
-            Number of channels in array
-            A one dimensional CUDA array
-            Creates a new CUDA array.
-            Creates a new CUDA array from an existing CUarray.
-            Copy data from host to array memory
+            2D memory copy parameters
+            Source X in bytes
+            Source Y
+            Source memory type (host, device, array)
+            Source host pointer
+            Source device pointer
+            Source array reference
+            Source pitch (ignored when src is array)
+            Destination X in bytes
+            Destination Y
+            Destination memory type (host, device, array)
+            Destination host pointer
+            Destination device pointer
+            Destination array reference
+            Destination pitch (ignored when dst is array)
+            Width of 2D memory copy in bytes
+            Height of 2D memory copy
+            3D memory copy parameters
+            Source X in bytes
+            Source Y
+            Source Z
+            Source LOD
+            Source memory type (host, device, array)
+            Source host pointer
+            Source device pointer
+            Source array reference
+            Must be NULL
+            Source pitch (ignored when src is array)
+            Source height (ignored when src is array; may be 0 if Depth==1)
+            Destination X in bytes
+            Destination Y
+            Destination Z
+            Destination LOD
+            Destination memory type (host, device, array)
+            Destination host pointer
+            Destination device pointer
+            Destination array reference
+            Must be NULL
+            Destination pitch (ignored when dst is array)
+            Destination height (ignored when dst is array; may be 0 if Depth==1)
+            Width of 3D memory copy in bytes
+            Height of 3D memory copy
+            Depth of 3D memory copy
+            3D memory copy parameters
+            Source X in bytes
+            Source Y
+            Source Z
+            Source LOD
+            Source memory type (host, device, array)
+            Source host pointer
+            Source device pointer
+            Source array reference
+            Source context (ignored when srcMemoryType is array)
+            Source pitch (ignored when src is array)
+            Source height (ignored when src is array; may be 0 if Depth==1)
+            Destination X in bytes
+            Destination Y
+            Destination Z
+            Destination LOD
+            Destination memory type (host, device, array)
+            Destination host pointer
+            Destination device pointer
+            Destination array reference
+            Destination context (ignored when dstMemoryType is array)
+            Destination pitch (ignored when dst is array)
+            Destination height (ignored when dst is array; may be 0 if Depth==1)
+            Width of 3D memory copy in bytes
+            Height of 3D memory copy
+            Depth of 3D memory copy
+            Array descriptor
+            Width of array
+            Height of array
+            Array format
+            Channels per array element
+            3D array descriptor
+            Width of 3D array
+            Height of 3D array
+            Depth of 3D array
+            Array format
+            Channels per array element
+            Flags
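The Width/WidthInBytes/Pitch distinction in the copy parameters above is easiest to see with a pitched 2D allocation; a sketch assuming managedCuda's CudaPitchedDeviceVariable:

    using ManagedCuda;

    // Sketch: widths are counted in elements, copy widths in bytes, and the
    // device row stride (Pitch) may be rounded up past WidthInBytes for alignment.
    const int width = 640, height = 480;              // elements, not bytes
    var ctx = new CudaContext(0);
    var dev2D = new CudaPitchedDeviceVariable<float>(width, height);

    // WidthInBytes = width * sizeof(float); dev2D.Pitch is the actual row stride.
    var host = new float[width * height];
    dev2D.CopyToDevice(host);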
-            Copy data from host to array memory
-            Offset in bytes of destination array
-            source array
-            Copy data from array to host memory
-            Destination pointer to host memory
-            Number of bytes to copy
+            Idea of a SizeT type from http://blogs.hoopoe-cloud.com/index.php/tag/cudanet/,
+            entry from Tuesday, September 15th, 2009
+            Define operator + on values converted to ulong to avoid falling back to int
+            Define operator - on values converted to ulong to avoid falling back to int
+            Define operator * on values converted to ulong to avoid falling back to int
+            Define operator / on values converted to ulong to avoid falling back to int
+            Define operator > on values converted to ulong to avoid falling back to int
+            Define operator < on values converted to ulong to avoid falling back to int
+            Returns this.value.ToString()
+            Returns this.value.GetHashCode()
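A minimal sketch of the SizeT idea documented above, with arithmetic routed through ulong so that mixed int operands cannot silently fall back to 32-bit int arithmetic; this is an illustration, not the wrapper's actual implementation:

    using System;

    // Sketch: a platform-width size wrapper; operators convert both operands
    // to ulong first, mirroring "avoid fall back to int" from the docs.
    public struct SizeT
    {
        private UIntPtr value;

        public SizeT(ulong v) { value = (UIntPtr)v; }

        public static implicit operator ulong(SizeT s) => s.value.ToUInt64();
        public static implicit operator SizeT(ulong v) => new SizeT(v);
        public static implicit operator SizeT(int v) => new SizeT((ulong)v);

        public static SizeT operator +(SizeT a, SizeT b) => new SizeT((ulong)a + (ulong)b);
        public static SizeT operator *(SizeT a, SizeT b) => new SizeT((ulong)a * (ulong)b);

        public override string ToString() => value.ToString();
        public override int GetHashCode() => value.GetHashCode();
    }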
+            Inner struct for CudaResourceDesc (linear memory)
+            Device pointer
+            Array format
+            Channels per array element
+            Size in bytes
+            Inner struct for CudaResourceDesc (pitched 2D memory)
+            Device pointer
+            Array format
+            Channels per array element
+            Width of the array in elements
+            Height of the array in elements
+            Pitch between two rows in bytes
+            Mimics the union "CUDA_RESOURCE_DESC.res" in cuda.h
+            CUDA array
+            CUDA mipmapped array
+            Linear memory
+            Linear pitched 2D memory
+            CUDA Resource descriptor
host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to host memory + - Offset in bytes of destination array - Destination array + - + - Copy data from array to array + - Destination array - source array - Size of memory copy in bytes - Offset in bytes of destination array - Offset in bytes of source array + - + - Copy data from array to array + - Destination array - Size of memory copy in bytes - Offset in bytes of destination array - Offset in bytes of source array + - + - Copy data from array to array + - Destination array - Size of memory copy in bytes - Offset in bytes of destination array - Offset in bytes of source array + - + - Copy data from array to device + - DevicePointer to copy data to - number of bytes to copy - Offset in bytes of source array + - + - Copy data from device to array + - DevicePointer to copy data from - number of bytes to copy - Offset in bytes of source array + - + - Returns the array width in elements + + - + - Returns the array width in bytes + + - + - Returns the wrapped CUarray + + - + - Returns the wrapped CUDAArrayDescriptor + + - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + + - + - Number of channels in array + + - + - One channel, e.g. float1, int1, float, int + + - + - Two channels, e.g. float2, int2 + + - + - Four channels, e.g. float4, int4 + + - + - A two dimensional CUDA array + + - + - Creates a new CUDA array. + - - In elements - In elements - + - + - Creates a new CUDA array from an existing CUarray. - The CUarray won't be destroyed when disposing. - Array properties are obtained by cuArrayGetDescriptor + - + - + - Creates a new CUDA array from an existing CUarray. 
[XML documentation hunks follow; the member tags were lost in extraction, leaving interleaved comment text. Each hunk below is condensed to its recoverable content.]
- Removed: doc comments for the CudaArray2D wrapper: ownership and dispose semantics (the cuArray is destroyed on dispose only if the wrapper owns it), raw aligned and unaligned copy methods ("2D copy parameters"), host/array copy overloads (IntPtr and generic base-type variants), pitched device-variable copies in both directions, array-to-array copy, accessors for the wrapped CUarray and CUDAArrayDescriptor, height and width in elements and in bytes, and the channel-count notes (one, two or four channels, e.g. float1, float2, float4). Also removed: the start of the CudaArray3D wrapper, with constructors taking width, height and depth in elements or wrapping an existing CUarray whose properties are obtained by cuArrayGetDescriptor.
+ Added: doc comments for the resource-descriptor fields (resource type, "mimics the union in C++", flags that must be zero) and for CudaTextureDescriptor: constructor overloads taking one address mode for all dimensions or one per dimension, a filter mode, flags, an optional border color (array of size 4), and an optional mipmap set consisting of the maximum anisotropy ratio (clamped to the range [1,16]), the mipmap filter mode used when the calculated level lies between two defined levels, the mipmap level bias, and the minimum and maximum mipmap level clamps; followed by per-field docs for the same members.
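For orientation, a minimal C# sketch of the simplest constructor overload described above. The enum and namespace names used here (CUAddressMode, CUFilterMode, CUTexRefSetFlags, ManagedCuda.BasicTypes) are assumptions inferred from the doc text, not confirmed by this diff:

```csharp
using ManagedCuda.BasicTypes;

class TextureDescriptorSketch
{
    static void Main()
    {
        // First overload documented above: one address mode for all
        // dimensions, a filter mode and flags.
        var desc = new CudaTextureDescriptor(
            CUAddressMode.Clamp,     // clamp out-of-range coordinates
            CUFilterMode.Linear,     // interpolate between texels
            CUTexRefSetFlags.None);  // no normalized coordinates etc.
    }
}
```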
- Removed: the CudaArray3D copy methods ("3D copy parameters"; host/array copies for IntPtr and generic base types; pitched device-variable copies in both directions, with and without an explicit pitch in bytes; array-to-array copy), accessors for the wrapped CUarray and CUDAArray3DDescriptor, the depth, height and width properties, the owner-dispose note, and the start of the CudaEvent wrapper ("wraps a CUevent handle"): construction with event-creation parameters, the dispose pattern, Record (the event is recorded after all preceding operations in the stream, or in the CUDA context for the NULL stream, have completed; the operation is asynchronous, so querying or synchronizing determines when it has actually been recorded; throws if a previous Record has not yet completed), and Synchronize (blocks the calling CPU thread until the event is recorded when the event was created with the blocking-sync flag).
+ Added: the border color field; the resource view descriptor (resource view format, width, height and depth of the view, first and last defined mipmap level, first and last layer index); GPU Direct v3 tokens; and the per-operation parameters for ::cuStreamBatchMemOp, including fields that are for driver internal use and whose initial value is unimportant.
- Removed: the rest of CudaEvent (the query returning whether the event has been recorded, and the elapsed time between two events in milliseconds with a resolution of about 0.5 microseconds; throws if either event has not been recorded, and the result is undefined if either was recorded on a non-zero stream) and the start of CudaGraph ("represents a CUDA graph; on Dispose() all graph nodes are destroyed too"): clone and dispose plumbing, AddEmptyNode (an empty node performs no operation but provides transitive ordering; a phased graph with two groups of n nodes needs one empty node and 2n dependency edges instead of n^2), and the AddMemsetNode overloads (dependencies may be null to place the node at the root and may not contain duplicates; the element size must be 1, 2 or 4 bytes; when the graph is launched the node performs the described memset, optionally on a device variable with a given value, using the supplied CUDA context).
+ Added: CudaBatchMemOpNodeParams (V1 and V2) and the kernel launch parameter struct: the kernel to launch, grid width, height and depth in blocks, block X, Y and Z dimensions, the dynamic shared-memory size per thread block in bytes, the stream identifier, and the array of pointers to kernel parameters.
- Removed: the remaining CudaGraph node factories: AddMemcpyNode (the node performs the copy described by ::cuMemcpy3D-style parameters; if any device reports zero for ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed-memory operands may not use ::CU_MEMORYTYPE_UNIFIED and are treated as resident on the host or the device according to the specified memory type), AddKernelNode (one overload taking a parameter struct, one taking a kernel plus an IntPtr[] of kernel parameters that must be pinned by the GC and extra data), AddChildGraphNode (the child graph is cloned into the node), and the opening of AddHostNode.
+ Added: the GPU kernel node parameters (V2 and V3): kernel, grid and block dimensions, dynamic shared-memory size, the kernel-parameter array, extra options, a kernel field that is only referenced if func is NULL, and a context field where NULL selects the current context and which is ignored when func is set; plus the opening of the memset node parameters (V1).
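Taken together, the removed CudaGraph docs and the added kernel-node structs describe one workflow: add a kernel node at the root, instantiate, launch on a stream. A rough sketch under the assumption that the method shapes match the doc text (AddKernelNode with nullable dependencies, Instantiate, Launch); the PTX file and kernel name are hypothetical:

```csharp
using System;
using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.VectorTypes;

class GraphSketch
{
    static void Main()
    {
        var ctx = new CudaContext(0);
        // Hypothetical module: any PTX with a parameterless __global__ works.
        CudaKernel kernel = ctx.LoadKernel("kernels.ptx", "fill");
        kernel.GridDimensions = new dim3(16, 1, 1);
        kernel.BlockDimensions = new dim3(256, 1, 1);

        var graph = new CudaGraph();
        // Null dependencies place the node at the root of the graph; the
        // parameter array would be IntPtr[] entries pinned by the GC.
        CUgraphNode node = graph.AddKernelNode(null, kernel, null, null);

        var stream = new CudaStream();
        using (CudaGraphExec exec = graph.Instantiate())
        {
            exec.Launch(stream); // ordered behind previous work in stream
            stream.Synchronize();
        }
    }
}
```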
- Removed: the tail of AddHostNode (when the graph is launched the node invokes the specified CPU function with user data that must be pinned by the GC), Clone (a deep copy; child graph nodes are recursively copied, and the original may be modified afterwards without affecting the clone), the lookup of a cloned node from its original, the getters for a graph's nodes, root nodes and dependency edges, AddDependencies and RemoveDependencies (elements of from and to at corresponding indices define an edge; both nodes must belong to the graph, and specifying an existing dependency returns an error), Instantiate (validates structural and intra-node constraints and returns a handle to the executable graph), the inner graph handle, and the CudaGraphExec wrapper (only one instance of a GraphExec may execute at a time; each launch is ordered behind previous work in the stream and previous launches of the same exec, so concurrent execution requires instantiating the graph multiple times), together with the opening of the JIT option collection (if the info or error log buffers are used, the collection should be used only once because CUDA overwrites the buffer size; data must be copied back from unmanaged to managed memory after the API call that produced output; at most 30 options).
+ Added: the memset node parameters, V1 and V2: destination device pointer, pitch of the destination (unused if height is 1), the value to be set, the element size in bytes (must be 1, 2 or 4), the row width in elements, the number of rows, struct initialisers, and, for V2, the context on which to run the node.
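A short sketch of filling the memset parameters just described and handing them to the graph. The struct and field names (CudaMemsetNodeParams, dst, elementSize, width, height, value) follow the doc text but are assumptions, as is the AddMemsetNode shape:

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class MemsetNodeSketch
{
    static void Main()
    {
        var ctx = new CudaContext(0);
        var buffer = new CudaDeviceVariable<int>(1024);

        var p = new CudaMemsetNodeParams
        {
            dst = buffer.DevicePointer, // destination device pointer
            elementSize = 4,            // must be 1, 2 or 4 bytes
            width = 1024,               // row width in elements
            height = 1,                 // pitch unused when height == 1
            value = 0                   // value written to every element
        };

        var graph = new CudaGraph();
        // Null dependencies: the node sits at the root; the memset runs
        // when the instantiated graph is launched.
        graph.AddMemsetNode(null, p, ctx);
    }
}
```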
- Removed: the JIT option collection members (adding a single option or several, copying returned data from unmanaged to managed memory, resetting the info and error buffer values, the dispose pattern), the option base type (the option value converted to void*), the max-registers-per-thread option, the threads-per-block option (IN: the minimum number of threads per block to target compilation for; OUT: the number the compiler actually targeted, which restricts resource utilization of the compiler, such as max registers, so that a block of that size can launch; other limits such as shared-memory utilization are not currently taken into account), and the opening of the wall-clock option.
+ Added: the memcpy node parameters (reserved fields that must be zero, the context on which to run the node, the copy parameters, and struct initialisers for device-to-device, array3d-to-device, array3d-to-host, host-to-device and device-to-host copies) and the host node parameters (V1 and V2): the function to call when the node executes and the argument to pass to it.
- Removed: the remaining JIT timing option (a float giving the wall-clock time in milliseconds spent creating the cubin, for both compiler and linker; valid only after a successful call) and the info and error log buffers (char* buffers receiving informational or error PTXAS messages, sized via ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; ManagedCuda allocates and pins a byte array to pass to CUDA, the internal buffer must be freed manually after use, and the string conversion of the buffer is valid only after a successful call).
+ Added: the external-handle descriptors: a Win32 handle referencing a semaphore or memory object (valid for the OPAQUE_WIN32, OPAQUE_WIN32_KMT, D3D12_HEAP and D3D12_RESOURCE handle types; exactly one of handle and name must be non-NULL, and name must be NULL for the KMT type), a valid NT handle or the name of a valid memory object, a file descriptor referencing the memory object, a handle representing an NvSciBuf object, and the external memory handle descriptor itself (type of the handle, size of the memory allocation, and flags that must be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED).
- Removed: the remaining JIT options: the optimization level applied to generated code (0 to 4, with 4 the default and highest), target determination from the current attached context (the default, needing no option value), an explicit target chosen from the supplied ::CUjit_target value, the fallback strategy applied when no matching cubin is found, debug information in the output (-g, default off), verbose log messages (default off), line-number information (-lineinfo, compiler only, default off), and explicit cache control (-dlcm).
+ Added: the external semaphore handle descriptor (type of the handle plus flags reserved for the future, which must be zero), the external memory buffer descriptor (offset into the memory object where the buffer's base is, size of the buffer, reserved flags), the external memory mipmap descriptor (offset of the base level of the mipmap chain, the format, dimension and type of the base level, and the total number of levels in the chain), and the start of the external semaphore signal parameters (the fence value to be signaled).
- Removed: the CudaLinker wrapper, a pending JIT linker invocation: constructors with an optional collection of linker and compiler options, a dispose pattern that destroys the linker state, AddInput overloads taking in-memory data (PTX must be NULL-terminated) or a file path, each with an input type, an optional name for log messages and options, and Complete, which finishes the pending linker action and returns the cubin image for use with ::cuModuleLoadData. Also removed: the start of CudaManagedMemory_byte ("a variable located in managed memory, type byte"): allocation in elements, construction from a variable defined in a cu-file via a module or a kernel, the dispose pattern, UIntPtr and CUdeviceptr accessors, size in bytes and in elements, the per-element indexer, and the owner-dispose note.
+ Added: the NvSciSyncFence pointer (valid when the handle type is ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC), the keyed-mutex parameters (the key value to acquire the mutex with), reserved flags that must be zero, the external semaphore wait parameters (the fence value to be waited on, the NvSciSync variant, and the keyed-mutex variant with a timeout in milliseconds), CUmemLocation (the location type, which modifies the meaning of id, and the identifier for that type), the compressible-memory allocation hint (on devices supporting Compute Data Compression, compressible memory can accelerate access to data with unstructured sparsity and other compressible patterns; applications should query the allocation obtained with ::cuMemCreate via ::cuMemGetAllocationPropertiesFromHandle to validate compressibility, and compressed memory may not be mappable on all devices), a bitmask indicating intended usage, and the header of the allocation-properties struct.
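A minimal sketch of the unified-memory wrapper just described: allocate in elements, touch it from the host through the indexer, pass the device pointer to kernels. The attach-flag enum name and the property names are assumptions taken from the doc text:

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class ManagedMemorySketch
{
    static void Main()
    {
        var ctx = new CudaContext(0);

        // Unified allocation, visible to both host and device.
        var buffer = new CudaManagedMemory_byte(1024, CUmemAttach_flags.Global);

        buffer[0] = 42;                             // per-element host access
        SizeT bytes = buffer.Size;                  // size in bytes
        CUdeviceptr devPtr = buffer.DevicePointer;  // usable in kernel launches

        buffer.Dispose(); // the owner destroys the allocation on dispose
    }
}
```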
- + - The describing the physical location of a pointer + Allocation type - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + requested ::CUmemAllocationHandleType - + - The address at which a pointer's memory may be accessed on the host + Location of allocation - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Windows-specific POBJECT_ATTRIBUTES required when + ::CU_MEM_HANDLE_TYPE_WIN32 is specified.This object attributes structure + includes security attributes that define + the scope of which exported allocations may be transferred to other + processes. In all other cases, this field is required to be zero. - + - Synchronize every synchronous memory operation initiated on this region + allocFlags - + - A process-wide unique ID for an allocated memory region + Memory access descriptor - + - Indicates if the pointer points to managed memory + Location on which the request is to change it's accessibility - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . 
- Removed: the tail of the attach documentation (destroying an associated stream removes the association and reverts the allocation to its default visibility; since stream destruction is asynchronous, the change happens only once all work in the stream has completed), PrefetchAsync (prefetches to the destination device, or to CPU memory when CU_DEVICE_CPU is passed; unpopulated regions are populated and mapped on the destination; under memory pressure the driver may evict pages from other regions or prefetch less than requested; mappings to the previous location are normally removed and new mappings set up only on the destination, with finer control available via ::cudaMemAdvise; asynchronous with respect to the host and all work on other devices), and the Advise documentation, which appears twice, once per overload: SET_READ_MOSTLY lets the driver keep read-only copies per processor, with a write invalidating all copies except the writer's, and UNSET_READ_MOSTLY frees duplicated copies no later than the next write; SET_PREFERRED_LOCATION guides migration on fault rather than migrating immediately (migration is avoided when a mapping can be established without it), does not prevent prefetching, can override the thrash-detection logic (a page thrashing between CPU and GPU memory is normally pinned to CPU memory, but keeps thrashing if the preferred location is GPU memory), steers eviction under oversubscription, and is ignored for ranges marked read-mostly, while UNSET_PREFERRED_LOCATION reverts the preference to none; SET_ACCESSED_BY keeps the data mapped in the given processor's page tables whenever possible to avoid faults without affecting data location, which suits infrequent peer-to-peer access where migration would cost more than it saves, noting that CPU access may still migrate the data to CPU memory and update accessed-by GPU mappings to point there, while UNSET_ACCESSED_BY allows those mappings to be removed at any time, causing page faults; CU_DEVICE_CPU targets the CPU, and the call is asynchronous with respect to the host and other devices. Also removed: the enumerator class for CudaManagedMemory_byte and the opening of the equivalent uchar1 wrapper.
+ Added: the remaining CUaccessPolicyWindow documentation (an access policy for a contiguous extent from base_ptr to base_ptr + num_bytes, with num_bytes limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE; the window is partitioned so that hit segments make up approximately hitRatio of it and apply hitProp, while miss segments make up the remainder and apply missProp, which must be NORMAL or STREAMING; segments and ratio are fitted to the architecture's capabilities, the driver may align base_ptr, and the maximum size and alignment may be restricted), the graph attribute union used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute (the access policy window, and a nonzero flag indicating a cooperative kernel as with ::cuLaunchCooperativeKernel), the stream attribute union used with ::cuStreamSetAttribute/::cuStreamGetAttribute (the access policy window and the value for ::CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY), the CUDA array sparse properties (sparse tile extent in width, height and depth elements, the first mip level at which the mip tail begins, the total size of the mip tail, and flags that are either zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL), and the header of the CUDA array / mipmapped array memory-mapping information struct.
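A sketch of the read-mostly pattern that the (twice-repeated) Advise text describes: hint, prefetch, then launch readers. The method names (Advise, PrefetchAsync), the CUmemAdvise enum members, and the way a CUdevice is obtained from the context are all assumptions inferred from the doc text:

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class AdviseSketch
{
    static void Main()
    {
        var ctx = new CudaContext(0);
        var buffer = new CudaManagedMemory_byte(1 << 20, CUmemAttach_flags.Global);
        var stream = new CudaStream();
        CUdevice device = ctx.Device; // assumption: context exposes its CUdevice

        // Read-mostly: the driver may keep a read-only copy per processor;
        // a write invalidates every copy except the writer's.
        buffer.Advise(CUmemAdvise.SetReadMostly, device);

        // Migrate the pages to the GPU ahead of the kernels that read them.
        buffer.PrefetchAsync(device, stream);
        stream.Synchronize();
    }
}
```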
- Removed: the CudaManagedMemory_uchar1 documentation, a verbatim repeat of the byte variant: host-value conversion of the first element, the pointer-attribute getters, StreamAttachMemAsync, PrefetchAsync and the opening of Advise.
+ Added: the CUarrayMapInfo fields: the resource union, the mip level (must be a valid level for mipmapped arrays and zero for plain arrays), the layer index (must be valid for layered arrays, zero otherwise), starting X, Y and Z offsets in elements, width, height and depth in elements, the offset within the mip tail and the extent in bytes, the resource type, the sparse subresource type, the memory operation type, the memory handle type, the offset within the memory object, the device ordinal bit mask, and flags plus reserved fields that must be zero for now.
On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Resource type - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. 
Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. 
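Taken together, the three removed members above describe the usual unified-memory workflow: allocate, attach to a stream, advise, prefetch. Below is a minimal C# sketch of that workflow against ManagedCuda's typed wrappers. The call shapes are inferred from the doc comments in this hunk, and the names CUmemAttach_flags, CUmemAdvise and the CUdevice.Pointer field are assumptions that should be checked against the actual assembly version.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class UnifiedMemoryExample
{
    static void Main()
    {
        var ctx = new CudaContext(0);        // device 0
        var stream = new CudaStream();

        // Allocate managed memory visible from any stream on any device.
        // (Constructor signature assumed from the doc comments above.)
        var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

        // Restrict access to this stream: length must be 0 (whole allocation),
        // and attaching Single to the NULL stream is illegal.
        data.AttachMemAsync(stream.Stream, 0, CUmemAttach_flags.Single);

        // Mostly-read data: allow the driver to keep read-only replicas.
        // (Assumption: CUdevice exposes its ordinal as a public Pointer field.)
        var gpu0 = new CUdevice { Pointer = 0 };
        data.MemAdvise(CUmemAdvise.SetReadMostly, gpu0);

        // Migrate the pages to device 0 before a kernel touches them.
        data.PrefetchAsync(gpu0, stream.Stream);

        stream.Synchronize();                // CPU access is legal once the
        float first = data[0];               // stream's work has completed

        data.Dispose();
        stream.Dispose();
        ctx.Dispose();
    }
}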
[Removed: the enumerator-class docs for CudaManagedMemory_uchar1 and the complete doc set for CudaManagedMemory_uchar2, a variable located in managed memory of type uchar2. Its members duplicate the uchar1 variant verbatim: constructors that allocate a number of elements on host/device or bind to a variable defined in a cu-file (looked up by module or by kernel), Dispose and finalizer plumbing, UIntPtr and CUdeviceptr accessors, size in bytes and in elements, a per-element indexer, the handle-ownership flag, implicit conversion of the first managed value to a host value, the pointer-attribute properties (allocating/registering context, physical location, device-side and host-side access addresses, the nv-p2p.h token pair, the synchronize-memops flag, the process-wide buffer ID, the is-managed flag), AttachMemAsync, PrefetchAsync and both MemAdvise overloads with text identical to the uchar1 copy consolidated above, plus the CudaManagedMemory_uchar2 enumerator class. The CudaManagedMemory_uchar3 doc set begins with the same constructor summaries.]

Added doc comments for the new driver-API structures:
- Sparse subresource type; memory operation type; memory handle type; offset within the memory; device ordinal bit mask; flags for future use, must be zero now; reserved fields, must be zero now.
- Semaphore signal node parameters (V1 and V2) and semaphore wait node parameters (V1 and V2).
- Memory pool properties: allocation type (currently must be CU_MEM_ALLOCATION_TYPE_PINNED); the handle types that allocations from the pool will support; the location where allocations should reside; the Windows-specific LPSECURITYATTRIBUTES required when ::CU_MEM_HANDLE_TYPE_WIN32 is specified, defining the scope to which exported allocations may be transferred (required to be zero in all other cases); the maximum pool size, where 0 selects a system-dependent default; a bitmask indicating intended usage for the pool.
- Execution affinity parameters, including the value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: the number of SMs the context is limited to use.
- Memory allocation node parameters (V1 and V2): in: the location where the allocation should reside (::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE; IPC is not supported); in: an array of memory access descriptors describing peer GPU access; in: the number of descriptors, which must not exceed the number of GPUs; in: the size in bytes of the requested allocation; out: the address of the allocation returned by CUDA. Memory free node parameters: in: the pointer to free.
- Result information returned by cuGraphExecUpdate, giving more specific detail when a CUDA graph update fails: the "to node" of the error edge when the topologies do not match, the error node when the error is associated with a specific node, NULL when the error is generic; and the "from node" of the error edge when the topologies do not match, otherwise NULL.
- Tensor map descriptor (requires compiler support for aligning to 64 bytes). CUDA array memory requirements: total required memory size and alignment requirement.
- Graph instantiation parameters: instantiation flags; upload stream; the node which caused instantiation to fail, if any; whether instantiation was successful and, if not, the reason why.
- Launch-attribute values: cluster dimensions for the kernel node; ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT, with the event to fire when all blocks trigger it (::CU_EVENT_RECORD_EXTERNAL is not accepted) and a flag that, when non-0, makes each block launch trigger the event automatically; ::CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT, the event to fire when the last block launches (::CU_EVENT_RECORD_EXTERNAL is not accepted); and ::CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION, the desired preferred cluster dimensions for the kernel: x must be a divisor of the grid X dimension and a multiple of clusterDim's x field, y likewise for the Y dimension and clusterDim's y, and z must equal clusterDim's z.
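The preferred-cluster-dimension attribute carries easy-to-miss validity rules, so a small self-contained C# checker makes them concrete. The Dim3 struct below is illustrative only, not the driver's opaque attribute type; the logic mirrors the constraints quoted above.

struct Dim3 { public uint X, Y, Z; }

static class PreferredClusterDim
{
    // Mirrors the constraints in the doc comment for
    // CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION above.
    public static bool IsValid(Dim3 grid, Dim3 cluster, Dim3 preferred)
    {
        if (preferred.X == 0 || preferred.Y == 0 || cluster.X == 0 || cluster.Y == 0)
            return false;                          // avoid division by zero
        bool xOk = grid.X % preferred.X == 0       // divisor of the grid X dimension
                && preferred.X % cluster.X == 0;   // multiple of clusterDim.x
        bool yOk = grid.Y % preferred.Y == 0       // divisor of the grid Y dimension
                && preferred.Y % cluster.Y == 0;   // multiple of clusterDim.y
        bool zOk = preferred.Z == cluster.Z;       // must equal clusterDim.z exactly
        return xOk && yOk && zOk;
    }
}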
Added, continuing the launch-attribute values:
- ::CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE: whether or not the resulting kernel node should be device-updatable, plus a returned handle to pass to the various device-side update functions.
- Attribute ::CUaccessPolicyWindow; a flag that is non-zero for a cooperative kernel (see ::cuLaunchCooperativeKernel); the ::CUsynchronizationPolicy for work queued up in the stream; cluster dimensions and the cluster scheduling policy preference for the kernel node; the execution priority of the kernel; the value for ::CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT; padding to 64 bytes.
- Two launch-configuration parameter blocks, each with: width, height and depth of the grid in blocks; X, Y and Z dimensions of each thread block; the dynamic shared-memory size per thread block in bytes; a stream identifier; an attribute array that is nullable if numAttrs == 0; and the number of attributes populated in attrs.

[Removed: the member docs for CudaManagedMemory_uchar3 (pointer attributes, AttachMemAsync, PrefetchAsync and both MemAdvise overloads, text identical to the uchar1 copy consolidated above), the CudaManagedMemory_uchar3 enumerator class, and the opening of the CudaManagedMemory_uchar4 doc set (class summary, constructors, dispose plumbing, accessors, indexer, pointer attributes and the start of its AttachMemAsync doc).]
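The launch-configuration fields listed above (grid size in blocks, block size in threads, dynamic shared memory, stream, and an attribute array that may be null when numAttrs is zero) map onto a plain struct. A self-contained sketch with an illustrative local type, not the driver's struct, showing a typical 1-D fill:

using System;

struct LaunchConfig
{
    public uint GridDimX, GridDimY, GridDimZ;     // grid size in blocks
    public uint BlockDimX, BlockDimY, BlockDimZ;  // block size in threads
    public uint SharedMemBytes;                   // dynamic shared memory per block
    public IntPtr Stream;                         // stream identifier
    public IntPtr Attrs;                          // may be null if NumAttrs == 0
    public uint NumAttrs;                         // attributes populated in Attrs
}

static class LaunchConfigExample
{
    // Covers n elements with 1-D blocks of blockSize threads.
    public static LaunchConfig For1D(uint n, uint blockSize = 256) => new LaunchConfig
    {
        GridDimX = (n + blockSize - 1) / blockSize,  // ceiling division
        GridDimY = 1,
        GridDimZ = 1,
        BlockDimX = blockSize,
        BlockDimY = 1,
        BlockDimZ = 1,
        SharedMemBytes = 0,
        Stream = IntPtr.Zero,  // NULL stream here; real code passes a CUstream
        Attrs = IntPtr.Zero,   // permitted because NumAttrs is zero
        NumAttrs = 0,
    };
}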
[Removed: the remainder of the CudaManagedMemory_uchar4 doc set (AttachMemAsync, PrefetchAsync and both MemAdvise overloads, text identical to the uchar1 copy consolidated above), the CudaManagedMemory_uchar4 enumerator class, and the opening of the CudaManagedMemory_sbyte doc set (class summary, constructors and dispose plumbing).]

Added doc comments:
- Child graph node parameters: the child graph to clone into the node for node creation, or a handle to the graph owned by the node for node query. Event record node parameters: the event to record when the node executes. Event wait node parameters: the event to wait on from the node.
- A note on the graph node parameter wrappers: not all fields are public; private fields must be set through the Set/Get methods, which allocate and free additional memory.
- Multicast object properties: the number of devices in the multicast team that will bind memory to the object; the maximum amount of memory that can be bound to the object per device; a bitmask of exportable handle types (see ::CUmemAllocationHandleType); flags for future use, must be zero now.
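The four multicast fields above are straightforward to populate, but the flags field is reserved. A hedged sketch using an illustrative local struct; the driver-side counterpart is CUmulticastObjectProp from CUDA 12, whose exact layout should be taken from the headers rather than from this stand-in.

struct MulticastProps
{
    public uint NumDevices;    // devices in the team that will bind memory
    public ulong Size;         // max bytes bound to this object per device
    public ulong HandleTypes;  // bitmask of exportable handle types
    public ulong Flags;        // reserved: must be zero for now
}

static class MulticastExample
{
    public static MulticastProps ForTeam(uint deviceCount, ulong bytesPerDevice) =>
        new MulticastProps
        {
            NumDevices = deviceCount,
            Size = bytesPerDevice,
            HandleTypes = 0,  // none exported; see ::CUmemAllocationHandleType
            Flags = 0,        // per the doc comment above: must be zero now
        };
}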
Added, the Set/Get docs for the node-parameter wrappers:
- Set fills the internal CudaHostNodeParams structure, which allocates additional memory; make sure that the delegate is not garbage collected by pinning it.
- Set likewise fills the internal CudaMemAllocNodeParams, CudaExtSemSignalNodeParams and CudaExtSemWaitNodeParams structures, each of which allocates additional memory; every Set call must be followed by a call to Get() in order to free the internally allocated memory.
- Get copies the data from the internal structure back to CudaMemAllocNodeParams, CudaBatchMemOpNodeParams, CudaExtSemSignalNodeParams or CudaExtSemWaitNodeParams and frees the internally allocated memory; if Set() has not been called on the output structure beforehand, the call might fail.
- Conditional node parameters: the conditional node handle, which must be created in advance of creating the node using ::cuGraphConditionalHandleCreate; the type of conditional node; the size of the graph output array (allowed values are 1 for CU_GRAPH_COND_TYPE_WHILE, 1 or 2 for CU_GRAPH_COND_TYPE_IF, and any value greater than zero for CU_GRAPH_COND_TYPE_SWITCH); and the CUDA-owned array populated with the conditional node's child graphs during node creation, valid for the lifetime of the node. The graphs' contents are constrained: only kernel nodes, empty nodes, child graphs, memsets, memcopies and conditionals are allowed, recursively including child graphs and conditional bodies; all kernels at any nesting level must belong to the same CUDA context; and the graphs may be populated using the graph node creation APIs or ::cuStreamBeginCaptureToGraph.

[Also removed here: the CudaManagedMemory_sbyte accessor and indexer docs, duplicates of the uchar1 set.]
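The Set/Get contract above is the part most likely to leak: every Set allocates native memory inside the wrapper, and only the matching Get releases it. The self-contained toy below enforces the same pairing discipline; it illustrates the contract and is not ManagedCuda's implementation.

using System;
using System.Runtime.InteropServices;

sealed class NodeParamsDemo
{
    private IntPtr _native = IntPtr.Zero;

    // "Fills the internal structure that allocates additional memory."
    public void Set(long value)
    {
        if (_native != IntPtr.Zero)
            throw new InvalidOperationException("Call Get() before calling Set() again.");
        _native = Marshal.AllocHGlobal(sizeof(long));
        Marshal.WriteInt64(_native, value);
    }

    // "Copies the data from the internal structure ... and frees the
    // internally allocated memory." Fails if Set() was never called.
    public void Get(out long value)
    {
        if (_native == IntPtr.Zero)
            throw new InvalidOperationException("Set() must be called before Get().");
        value = Marshal.ReadInt64(_native);
        Marshal.FreeHGlobal(_native);
        _native = IntPtr.Zero;
    }
}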
- + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Context on which to run the node. + Must match context used to create the handle and all body nodes. - + - Synchronize every synchronous memory operation initiated on this region + Optional annotation for edges in a CUDA graph. Note: all edges implicitly have annotations and + default to a zero-initialized value if not specified. A zero-initialized struct indicates a + standard full serialization of two nodes with memory visibility. - + - A process-wide unique ID for an allocated memory region + This indicates when the dependency is triggered from the upstream + node on the edge. The meaning is specific to the node type. A value + of 0 in all cases means full completion of the upstream node, with + memory visibility to the downstream node or portion thereof + (indicated by \c to_port). + Only kernel nodes define non-zero ports. A kernel node + can use the following output port types: + ::CU_GRAPH_KERNEL_NODE_PORT_DEFAULT, ::CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC, + or ::CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER. 
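As a concrete reading of the edge annotation documented above, a zero-initialized struct expresses the default full-serialization dependency, and only kernel nodes ever publish a non-zero from_port. The C# mirror below is a sketch: the field layout follows the CUDA header's CUgraphEdgeData, but the managed type name is illustrative.

```csharp
using System.Runtime.InteropServices;

// Illustrative C# mirror of the CUDA graph edge annotation (CUgraphEdgeData).
[StructLayout(LayoutKind.Sequential)]
struct GraphEdgeData
{
    public byte from_port; // 0 = trigger on full completion of the upstream node
    public byte to_port;   // 0 = the entire downstream node depends on the edge
    public byte type;      // a CUgraphDependencyType value (typed as a byte)
    public byte r0, r1, r2, r3, r4; // stands in for reserved[5]; must stay zeroed

    // Zero-initialized value = standard full serialization with memory visibility.
    public static GraphEdgeData Default => default;
}
```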
- - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + This should be populated with a value from ::CUgraphDependencyType. (It + is typed as char due to compiler-specific layout of bitfields.) See + ::CUgraphDependencyType. - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + These bytes are unused and must be zeroed. This ensures + compatibility if additional fields are added in the future. - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. 
Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. 
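Taken together, the advise and prefetch entry points above are typically combined as in the following sketch. The member names (MemAdvise, PrefetchAsync) match the surrounding documentation, but the exact overloads are assumptions and should be checked against the ManagedCuda version in use.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

// Sketch: mark a managed allocation as read-mostly, then prefetch it to device 0
// ahead of the kernels that will read it. Overload shapes are assumed.
var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);
var stream = new CudaStream();

data.MemAdvise(CUmemAdvise.SetReadMostly, new CUdevice(0)); // device is ignored for this advice
data.PrefetchAsync(new CUdevice(0), stream.Stream);         // migrate pages before first access

stream.Synchronize(); // prefetch is asynchronous with respect to the host
```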
+ Information passed to the user via the async notification callback - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. 
- This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for - + - Enumerator class for CudaManagedMemory_sbyte + - + - - + - + Information passed to the user via the async notification callback - + - + - + - + Data for SM-related resources - - + - A variable located in managed memory. - Type: char1 + The number of streaming multiprocessors available in this resource. This is an output parameter only, do not write to this field. - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + A tagged union describing different resources identified by the type field. This structure should not be directly modified outside of the API that created it. - In elements - - + - Creates a new CudaManagedMemory from definition in cu-file. + Type of resource; dictates which union field was last set - The module where the variable is defined in. - The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. + Resource corresponding to CU_DEV_RESOURCE_TYPE_SM \p type. - The kernel which module defines the variable. - The variable name as defined in the cu-file. - + - For dispose + Splits \p CU_DEV_RESOURCE_TYPE_SM resources. + Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount + and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the number of groups that + would be created in \p nbGroups. Otherwise, \p nbGroups must point to the number of elements in \p result and on return, + the API will overwrite \p nbGroups with the number actually created. The groups are written to the array in \p result. + \p nbGroups can be less than the total number if a smaller number of groups is needed. + This API is used to spatially partition the input resource. The input resource needs to come from one of + ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource. + A limitation of the API is that the output results cannot be split again without + first creating a descriptor and a green context with that descriptor. 
+ + When creating the groups, the API will take into account the performance and functional characteristics of the + input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups being created + than purely dividing the total SM count by the \p minCount due to cluster requirements or + alignment and granularity requirements for the minCount. + + The \p remainder set might not have the same functional or performance guarantees as the groups in \p result. + Its use should be carefully planned and future partitions of the \p remainder set are discouraged. + + A successful API call must either have: + - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional. + - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. + This queries the number of groups that would be created by the API. + + Note: The API is not supported on 32-bit platforms. - + - Dispose + Splits \p CU_DEV_RESOURCE_TYPE_SM resources. + Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount + and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the number of groups that + would be created in \p nbGroups. Otherwise, \p nbGroups must point to the number of elements in \p result and on return, + the API will overwrite \p nbGroups with the number actually created. The groups are written to the array in \p result. + \p nbGroups can be less than the total number if a smaller number of groups is needed. + This API is used to spatially partition the input resource. The input resource needs to come from one of + ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource. + A limitation of the API is that the output results cannot be split again without + first creating a descriptor and a green context with that descriptor. + + When creating the groups, the API will take into account the performance and functional characteristics of the + input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups being created + than purely dividing the total SM count by the \p minCount due to cluster requirements or + alignment and granularity requirements for the minCount. + + The \p remainder set might not have the same functional or performance guarantees as the groups in \p result. + Its use should be carefully planned and future partitions of the \p remainder set are discouraged. + + A successful API call must either have: + - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional. + - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. + This queries the number of groups that would be created by the API. + + Note: The API is not supported on 32-bit platforms. - + - For IDisposable + CIG Context Create Params - - + - UIntPtr to managed memory. 
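The SM-splitting API documented above follows the driver API's usual two-phase shape: call once with a null result array to learn how many groups the split would produce, then call again with a correctly sized array. The wrapper spellings below (GetDevResource, SmResourceSplitByCount) are assumptions used only to show the calling pattern around cuDevSmResourceSplitByCount.

```csharp
// Hedged sketch of the documented query-then-split pattern; names are hypothetical.
CUdevResource input = ctx.GetDevResource(CUdevResourceType.Sm);

uint nbGroups = 0;
// Phase 1: result == null simulates the split and reports the group count.
DevResourceApi.SmResourceSplitByCount(null, ref nbGroups, ref input, null, 0, minCount: 8);

// Phase 2: perform the real split into an array sized from the query.
var groups = new CUdevResource[nbGroups];
DevResourceApi.SmResourceSplitByCount(groups, ref nbGroups, ref input, null, 0, minCount: 8);
```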
+ - + - CUdeviceptr to managed memory. + - + - Size in bytes + Params for creating CUDA context. Exactly one of execAffinityParams and cigParams must be non-NULL. - + - Size in elements + - + - Access array per element. + - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + CUDA checkpoint optional lock arguments - managed variable - newly allocated host variable with value from managed memory - + - The on which a pointer was allocated or registered + Timeout in milliseconds to attempt to lock the process, 0 indicates no timeout - + - The describing the physical location of a pointer + Reserved for future use, must be zero - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Reserved for future use, must be zeroed - + - The address at which a pointer's memory may be accessed on the host + CUDA checkpoint optional checkpoint arguments - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Reserved for future use, must be zeroed - + - Synchronize every synchronous memory operation initiated on this region + CUDA checkpoint optional restore arguments - + - A process-wide unique ID for an allocated memory region + Reserved for future use, must be zeroed - + - Indicates if the pointer points to managed memory + CUDA checkpoint optional unlock arguments - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. 
- - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Reserved for future use, must be zeroed - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Attributes specific to copies within a batch. For more details on usage see ::cuMemcpyBatchAsync. - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. 
Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Source access ordering to be observed for copies with this attribute. 
- Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. 
- Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Hint location for the source operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for - + - Enumerator class for CudaManagedMemory_char1 + Hint location for the destination operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. - + - + Additional flags for copies with this attribute. See ::CUmemcpyFlags - - + - + Struct representing offset into a CUarray in elements - + + + + + + + + + + - + Struct representing width/height/depth of a CUarray in elements - + + + + + + + + + + - + Struct representing an operand for copy with ::cuMemcpy3DBatchAsync - + + + Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_POINTER + + + - - + - A variable located in managed memory. - Type: char2 + Length of each row in elements. - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Height of each layer in elements. - In elements - - + - Creates a new CudaManagedMemory from definition in cu-file. + Hint location for the operand. Ignored when the pointers are not managed memory or memory allocated outside CUDA. - The module where the variable is defined in. - The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. + Struct representing an operand when ::CUmemcpy3DOperand::type is ::CU_MEMCPY_OPERAND_TYPE_ARRAY - The kernel which module defines the variable. - The variable name as defined in the cu-file. - + + + + + + + + + + + + + + + + + + + - For dispose + Source memcpy operand. - + - Dispose + Destination memcpy operand. - + - For IDisposable + Extents of the memcpy between src and dst. The width, height and depth components must not be 0. - - + - UIntPtr to managed memory. + Source access ordering to be observed for copy from src to dst. - + - CUdeviceptr to managed memory. + Additional flags for copies with this attribute. See ::CUmemcpyFlags - + - Size in bytes + Structure describing the parameters that compose a single decompression operation. - + - Size in elements + The number of bytes to be read and decompressed from ::CUmemDecompressParams_st.src. - + - Access array per element. 
+ The number of bytes that the decompression operation will be expected to + write to ::CUmemDecompressParams_st.dst. This value is optional; if + present, it may be used by the CUDA driver as a heuristic for scheduling + the individual decompression operations. - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + After the decompression operation has completed, the actual number of + bytes written to ::CUmemDecompressParams.dst will be recorded as a 32-bit + unsigned integer in the memory at this address. - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Pointer to a buffer of at least ::CUmemDecompressParams_st.srcNumBytes compressed bytes. - managed variable - newly allocated host variable with value from managed memory - + - The on which a pointer was allocated or registered + Pointer to a buffer where the decompressed data will be written. The number of bytes + written to this location will be recorded in the memory + pointed to by ::CUmemDecompressParams_st.dstActBytes - + - The describing the physical location of a pointer + The decompression algorithm to use. - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + These 20 bytes are unused and must be zeroed. This ensures compatibility if additional fields are added in the future. - + - The address at which a pointer's memory may be accessed on the host + Translates from CudaDataType to .NET type and vice versa + + + + + + + - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + + + - + - Synchronize every synchronous memory operation initiated on this region + + + - + - A process-wide unique ID for an allocated memory region + + + - + - Indicates if the pointer points to managed memory + An abstraction layer for the CUDA driver API - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. 
In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Specifies the directX version to use with a cuda context, if necessary - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + DirectX9 - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. 
Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. 
- - Note that this function is asynchronous with respect to the host and all work - on other devices. + DirectX10 - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. 
- This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + DirectX11 - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for - + + + + + + + + + + + + + - Enumerator class for CudaManagedMemory_char2 + Create a new instance of managed Cuda. Creates a new cuda context. + Using device with ID 0 and - + - + Create a new instance of managed Cuda. + If createNew is true, a new cuda context will be created. + If createNew is false, the CudaContext is bound to an existing cuda context. Creates a new context if no context exists. + Using device with ID 0 and - + - + - + Create a new instance of managed Cuda. Creates a new cuda context. + Using + DeviceID - + - + Create a new instance of managed Cuda. + If createNew is true, a new cuda context will be created. + If createNew is false, the CudaContext is bound to an existing cuda context. Creates a new context if no context exists. + DeviceID + - + - + Create a new instance of managed Cuda. Creates a new cuda context. + DeviceID. + Context creation flags. - + - + Create a new instance of a cuda context from the given CudaStream - + The stream to query - + - A variable located in managed memory. - Type: char3 + Create a new instance of managed Cuda + DeviceID. + Context creation flags. + Create a new CUDA context or use an existing context for the calling thread. Creates a new context if no context exists. - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Create a new instance of managed Cuda with execution affinity - In elements - + DeviceID. + Context creation flags. + - + - Creates a new CudaManagedMemory from definition in cu-file. + Create a new instance of managed Cuda with execution affinity - The module where the variable is defined in. - The variable name as defined in the cu-file. + DeviceID. + Context creation flags. + Context creation parameters - + - Creates a new CudaManagedMemory from definition in cu-file. + Create a new instance of managed CUDA for a given Direct3DX-device. + Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. - The kernel which module defines the variable. - The variable name as defined in the cu-file. 
+ Direct3D device + Context creation flags + DirectX Version to bind this context to (9, 10, 11) - + + + Create a new instance of managed CUDA for a given Direct3DX-device. + Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. + Use to obtain a list of possible values for cudaDevice. + + CUdevice to map this context to. Use to obtain a list of possible values + Direct3D device. + Context creation flags + DirectX (9, 10, 11) Version to bind this context to + + + + As the normal context constructor has the same arguments, the OpenGL-constructor is private with inverse argument order. + It has to be called from a static method. + Create a new instance of managed CUDA for an OpenGL-device. + OpenGL resources from this device may be registered and mapped through the lifetime of this CUDA context. + + CUdevice to map this context to. + Context creation flags + + + + Create a new instance of managed Cuda, performing no CUDA API calls. Needed for inheritance. + + Additional constructor parameter to differentiate direct constructor call or inherited call, i.e. called by primaryContext class. + DeviceID. + + For dispose - + Dispose - + - For IDisposable + For IDisposable. + Note: If this instance created the wrapped CUcontext, it will be destroyed and can't be accessed by other threads anymore. + If this instance was only bound to an existing CUcontext, the wrapped CUcontext won't be destroyed. - - + - UIntPtr to managed memory. + Make sure the kernel image arrays are zero-terminated by appending a zero - + - CUdeviceptr to managed memory. + Gets the context's API version + Version - + - Size in bytes + Blocks until the device has completed all preceding requested tasks. Throws a if one of the + preceding tasks failed. If the context was created with the flag, the CPU thread will + block until the GPU context has finished its work. - + - Size in elements + Push the CUDA context - + - Access array per element. + Pop the CUDA context - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Binds this CUDA context to the calling CPU thread - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Sets the shared memory configuration for the current context. + On devices with configurable shared memory banks, this function will set + the context's shared memory bank size which is used for subsequent kernel + launches. + Changing the shared memory configuration between launches may insert a device + side synchronization point between those launches. + Changing the shared memory bank size will not increase shared memory usage + or affect occupancy of kernels, but may have major effects on performance. + Larger bank sizes will allow for greater potential bandwidth to shared memory, + but will change what kinds of accesses to shared memory will result in bank + conflicts. + This function will do nothing on devices with fixed shared memory bank size. + + The supported bank configurations are: + - : set bank width to the default initial + setting (currently, four bytes). + - : set shared memory bank width to + be natively four bytes. + - : set shared memory bank width to + be natively eight bytes. - managed variable - newly allocated host variable with value from managed memory - + - The on which a pointer was allocated or registered + Returns the current shared memory configuration for the current context. 
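A short usage sketch tying the context members above together. The member names (constructor, SetSharedMemConfig, LoadModulePTX, Synchronize, Dispose) follow this documentation; the enum spelling for the bank size is an assumption to verify against the build of ManagedCuda in use.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

// Create a context on device 0, widen the shared memory banks, load a PTX
// module, and block until the device is idle. Disposing destroys the context
// only if this instance created it (see the IDisposable note above).
using (var ctx = new CudaContext(0))
{
    ctx.SetSharedMemConfig(CUsharedconfig.EightByteBankSize);
    var module = ctx.LoadModulePTX("kernels.ptx"); // file name is illustrative
    // ... launch kernels from 'module' ...
    ctx.Synchronize(); // throws if a preceding task failed
}
```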
- + - The describing the physical location of a pointer + Load a CUBIN-module from file + + - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Load a PTX module from file + + + + - + - The address at which a pointer's memory may be accessed on the host + Load a PTX module from file + + Collection of linker and compiler options + - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Load a PTX module from file + + - + - Synchronize every synchronous memory operation initiated on this region + Load a ptx module from image as byte[] + + Collection of linker and compiler options + - + - A process-wide unique ID for an allocated memory region + Load a ptx module from image as byte[] + + + + - + - Indicates if the pointer points to managed memory + Load a ptx module from image as stream + + + + - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . 
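A sketch of the file-based module loading described above; the PTX overload name LoadModulePTX is assumed, and kernels.cubin/kernels.ptx are hypothetical files.

    using System.IO;
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);

    // Precompiled CUBIN straight from disk.
    CUmodule cubin = ctx.LoadModule("kernels.cubin");

    // PTX from disk (JIT-compiled by the driver), and the same image as byte[].
    CUmodule fromFile  = ctx.LoadModulePTX("kernels.ptx");
    CUmodule fromBytes = ctx.LoadModulePTX(File.ReadAllBytes("kernels.ptx"));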
Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Load a ptx module from image as stream - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of + + Collection of linker and compiler options - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Load a ptx module from image as byte[] - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + + - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. 
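The stream-association rules quoted in the removed comment above are easier to follow in code. A sketch: the member name StreamAttachMemAsync and the flag values are assumptions derived from the documented parameters (stream, length which must be zero, attach flag).

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);
    CudaStream stream = new CudaStream();
    CudaManagedMemory_float buf = new CudaManagedMemory_float(1024, CUmemAttach_flags.Global);

    // Associate the whole allocation with one stream; length must be zero
    // because partial re-association is not supported (see above).
    buf.StreamAttachMemAsync(stream.Stream, 0, CUmemAttach_flags.Single);

    // Once the stream has drained, the CPU may safely touch the buffer again.
    stream.Synchronize();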
It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Load a ptx module from image as stream - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + + - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. 
When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. 
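The advice values above, exercised on a managed allocation. The member and enum names (MemAdvise, CUmemoryAdvise.*) and the ctx.Device property are assumptions taken from the driver-API text, not verified wrapper API.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);
    CudaManagedMemory_float lut = new CudaManagedMemory_float(4096, CUmemAttach_flags.Global);
    CUdevice dev = ctx.Device;   // assumed property exposing the context's CUdevice

    // A mostly-read lookup table: allow read-only replicas on each processor.
    lut.MemAdvise(CUmemoryAdvise.SetReadMostly, dev);

    // Pin the home copy to this GPU and pre-map it there to avoid faults.
    lut.MemAdvise(CUmemoryAdvise.SetPreferredLocation, dev);
    lut.MemAdvise(CUmemoryAdvise.SetAccessedBy, dev);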
- - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Load a CUBIN-module from file and return directly a wrapped CudaKernel - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Path and name of the module file + The kernel name as defined in the *.cu file + - + - Enumerator class for CudaManagedMemory_char3 + Load a PTX module from file and return directly a wrapped CudaKernel + Path and name of the ptx-module file + The kernel name as defined in the *.cu file + JIT-compile options. Only if module image is a ptx module + JIT-compile options values. Only if module image is a ptx module + - + - + Load a PTX module from file and return directly a wrapped CudaKernel - + Path and name of the ptx-module file + The kernel name as defined in the *.cu file + Collection of linker and compiler options. Only if module image is a ptx module + - + - + Load a PTX module from file and return directly a wrapped CudaKernel + Path and name of the ptx-module file + The kernel name as defined in the *.cu file + - + - + Load a ptx module from image as byte[] and return directly a wrapped CudaKernel + Module image (cubin or PTX) as byte[] + The kernel name as defined in the *.cu file + JIT-compile options. Only if module image is a ptx module + JIT-compile options values. Only if module image is a ptx module + - + - + Load a ptx module from image as byte[] and return directly a wrapped CudaKernel + Module image (cubin or PTX) as byte[] + The kernel name as defined in the *.cu file + Collection of linker and compiler options. Only if module image is a ptx module + - + - + Load a ptx module from image as stream and return directly a wrapped CudaKernel + + Module image (cubin or PTX) as stream + The kernel name as defined in the *.cu file + JIT-compile options. Only if module image is a ptx module + JIT-compile options values. Only if module image is a ptx module + + + + + Load a ptx module from image as stream and return directly a wrapped CudaKernel + Module image (cubin or PTX) as stream + The kernel name as defined in the *.cu file + Collection of linker and compiler options. Only if module image is a ptx module
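The LoadKernel convenience overloads above collapse module loading and kernel lookup into a single call. A typical use (the PTX file and the vecAdd kernel name are hypothetical):

    using ManagedCuda;

    CudaContext ctx = new CudaContext(0);
    CudaKernel vecAdd = ctx.LoadKernel("kernels.ptx", "vecAdd");

    int n = 1 << 20;
    vecAdd.BlockDimensions = 256;
    vecAdd.GridDimensions = (n + 255) / 256;

    CudaDeviceVariable<float> a = new CudaDeviceVariable<float>(n);
    CudaDeviceVariable<float> b = new CudaDeviceVariable<float>(n);
    CudaDeviceVariable<float> c = new CudaDeviceVariable<float>(n);
    vecAdd.Run(a.DevicePointer, b.DevicePointer, c.DevicePointer, n);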
+ + - + - For dispose + Load a FatBinary module from image as byte[] and return directly a wrapped CudaKernel + Module image (fat binary) as byte[] + The kernel name as defined in the *.cu file + - + - Dispose + Load a FatBinary module from image as stream and return directly a wrapped CudaKernel + Module image (fat binary) as stream + The kernel name as defined in the *.cu file + - + - For IDisposable + unload module - + - + - UIntPtr to managed memory. + unload kernel + - + - CUdeviceptr to managed memory. + Allocate memory on the device + + - + - Size in bytes + SetMemory (cuMemsetD8) + + + - + - Size in elements + SetMemory (cuMemsetD16) + + + - + - Access array per element. + SetMemory (cuMemsetD32) - index in elements - + + + - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + SetMemory (cuMemset2DD8) + + + + + - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + SetMemory (cuMemset2DD16) - managed variable - newly allocated host variable with value from managed memory + + + + + - + - The on which a pointer was allocated or registered + SetMemory (cuMemset2DD32) + + + + + + + + + + SetMemory (cuMemsetD8) + + + + - + - The describing the physical location of a pointer + SetMemory (cuMemsetD16) + + + + - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + SetMemory (cuMemsetD32) + + + + - + - The address at which a pointer's memory may be accessed on the host + SetMemory (cuMemset2DD8) + + + + + + - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + SetMemory (cuMemset2DD16) + + + + + + - + - Synchronize every synchronous memory operation initiated on this region + SetMemory (cuMemset2DD32) + + + + + + - + - A process-wide unique ID for an allocated memory region + Free device memory + - + - Indicates if the pointer points to managed memory + Free device memory async + + + + - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. 
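The raw allocation and memset members above (cuMemAlloc and cuMemsetD8 behind the scenes), in order; the parameter order for SetMemory is assumed to be pointer, value, count.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);
    SizeT bytes = 1024 * sizeof(float);

    CUdeviceptr ptr = ctx.AllocateMemory(bytes);   // Allocate memory on the device
    ctx.SetMemory(ptr, (byte)0, bytes);            // cuMemsetD8: zero-fill the range
    ctx.FreeMemory(ptr);                           // Free device memory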
- - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Returns the total device memory in bytes - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Returns the free available device memory in bytes - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. 
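The two memory queries above, with member names assumed from their summaries:

    using System;
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);
    SizeT total = ctx.GetTotalDeviceMemorySize();
    SizeT free = ctx.GetFreeDeviceMemorySize();
    Console.WriteLine($"{(long)free >> 20} MiB free of {(long)total >> 20} MiB");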
When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. 
- - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Queries if a device may directly access a peer device's memory - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. 
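The cache-configuration preference above in code; the CUFuncCache values are the standard driver-API ones, and the getter/setter names are assumed from the summaries. The peer-access query documented above is a simple bool-returning call and is omitted here.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);

    // Ask for a larger L1 at the expense of shared memory; only a preference,
    // and a no-op where the split is fixed in hardware (see above).
    ctx.SetCacheConfig(CUFuncCache.PreferL1);
    CUFuncCache current = ctx.GetCacheConfig();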
- This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + On devices where the L1 cache and shared memory use the same hardware + resources, this returns the preferred cache configuration + for the current context. This is only a preference. The driver will use + the requested configuration if possible, but it is free to choose a different + configuration if required to execute functions. + This will return on devices + where the size of the L1 cache and shared memory are fixed. - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + - + - Enumerator class for CudaManagedMemory_char4 + On devices where the L1 cache and shared memory use the same hardware + resources, this sets through cacheConfig the preferred cache configuration for + the current context. This is only a preference. The driver will use + the requested configuration if possible, but it is free to choose a different + configuration if required to execute the function. Any function preference + set via will be preferred over this context-wide + setting. Setting the context-wide cache configuration to + will cause subsequent kernel launches to prefer + to not change the cache configuration unless required to launch the kernel. + This setting does nothing on devices where the size of the L1 cache and + shared memory are fixed. + Launching a kernel with a different preference than the most recent + preference setting may insert a device-side synchronization point. + - + - + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source array + Number of bytes to copy - + - + Copy data from host to device memory + T must be of value type, i.e. a struct + Destination CUdeviceptr (Pointer to device memory) + Source pointer to host memory - + - + Copy data from host to device memory + T must be of value type, i.e. 
a struct + Destination CUdeviceptr (Pointer to device memory) + Source pointer to host memory - + - + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - A variable located in managed memory. - Type: short + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Copy data from host to device memory - In elements - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to device memory - The module where the variable is defined in. - The variable name as defined in the cu-file. + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to device memory - The kernel which module defines the variable. - The variable name as defined in the cu-file. + Destination CUdeviceptr (Pointer to device memory) + Source array - + - For dispose + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Dispose + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - For IDisposable + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - UIntPtr to managed memory. + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - CUdeviceptr to managed memory. + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Size in bytes + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Size in elements + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Access array per element. + Copy data from host to device memory - index in elements - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Copy data from host to device memory - managed variable - newly allocated host variable with value from managed memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The on which a pointer was allocated or registered + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The describing the physical location of a pointer + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. 
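One round trip through the CopyToDevice overload family above; the host-to-device direction is documented here, while the CopyToHost counterpart is an assumed name.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);
    float[] host = new float[256];
    for (int i = 0; i < host.Length; i++) host[i] = i;

    CUdeviceptr dev = ctx.AllocateMemory(host.Length * sizeof(float));
    ctx.CopyToDevice(dev, host);        // array overload: destination pointer, source array

    float[] back = new float[host.Length];
    ctx.CopyToHost(back, dev);          // reverse direction; name assumed
    ctx.FreeMemory(dev);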
+ Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The address at which a pointer's memory may be accessed on the host + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Synchronize every synchronous memory operation initiated on this region + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - A process-wide unique ID for an allocated memory region + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Indicates if the pointer points to managed memory + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . 
Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Copy data from host to device memory - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Copy data from host to device memory - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. 
It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from host to device memory - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. 
Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. 
Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from host to device memory - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Enumerator class for CudaManagedMemory_short + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - A variable located in managed memory. - Type: short1 + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Copy data from host to device memory - In elements - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to device memory - The module where the variable is defined in. - The variable name as defined in the cu-file. + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to device memory - The kernel which module defines the variable. - The variable name as defined in the cu-file. + Destination CUdeviceptr (Pointer to device memory) + Source array - + - For dispose + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Dispose + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - For IDisposable + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - UIntPtr to managed memory. + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - CUdeviceptr to managed memory. + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Size in bytes + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Size in elements + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Access array per element. + Copy data from host to device memory - index in elements - + Destination CUdeviceptr (Pointer to device memory) + Source array - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
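The per-element accessor and ownership semantics documented for the CudaManagedMemory types, sketched for the short variant; DevicePointer is the assumed property behind the "CUdeviceptr to managed memory" summary.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);

    // Size is given in elements, not bytes (see the constructor summary).
    CudaManagedMemory_short vec = new CudaManagedMemory_short(16, CUmemAttach_flags.Global);

    vec[0] = 42;                              // host-side access via the indexer
    CUdeviceptr devPtr = vec.DevicePointer;   // device-side view of the same pages
    vec.Dispose();                            // owner of the handle: memory is freed here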
+ Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Copy data from host to device memory - managed variable - newly allocated host variable with value from managed memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The on which a pointer was allocated or registered + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The describing the physical location of a pointer + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - The address at which a pointer's memory may be accessed on the host + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source array - + - Synchronize every synchronous memory operation initiated on this region + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - A process-wide unique ID for an allocated memory region + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Indicates if the pointer points to managed memory + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. 
In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Copy data from host to device memory - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Copy data from host to device memory - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. 
- - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. 
- - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from host to device memory - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. 
Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from host to device memory - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Enumerator class for CudaManagedMemory_short1 + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source value - + - + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - + Copy data from host to device memory - + Destination CUdeviceptr (Pointer to device memory) + Source value - + - A variable located in managed memory. - Type: short2 + Copy data from host to device memory + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Copy data from host to device memory - In elements - + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to device memory - The module where the variable is defined in. - The variable name as defined in the cu-file. + Destination CUdeviceptr (Pointer to device memory) + Source value - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to device memory - The kernel which module defines the variable. - The variable name as defined in the cu-file. 
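The added entries describe the copy helpers only tersely, so here is a minimal sketch of the host-to-device path they cover, using ManagedCuda's CudaDeviceVariable<T> wrapper; the exact overload set behind the regenerated docs is not visible in this hunk, so read this as an illustration rather than the documented member list.

using ManagedCuda;

class HostToDeviceSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))                   // context on device 0
        using (var dev = new CudaDeviceVariable<float>(1024))  // 1024 floats on the device
        {
            var host = new float[1024];
            for (int i = 0; i < host.Length; i++) host[i] = i;

            // "Copy data from host to device memory": the destination is the
            // CUdeviceptr behind 'dev', the source is a host array.
            dev.CopyToDevice(host);
        }
    }
}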
[The same pattern continues for the rest of CudaManagedMemory_short2 and the start of CudaManagedMemory_short3: the member docs (disposal, pointer accessors, sizes, indexer, IsOwner, host-value conversion, pointer attributes) and another verbatim copy of the StreamAttachMemAsync/PrefetchAsync/MemAdvise remarks are removed. The added entries remain the terse copy docs; from the CUdeviceptr accessor onward they switch direction to "Copy data from device to host memory", first for single values ("T must be of value type, i.e. a struct", "Destination data in host memory"), then for host arrays ("Destination array in host memory", "Source CUdeviceptr (Pointer to device memory)"), including one raw-pointer overload with a "Number of bytes to copy" parameter.]
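For the reverse direction documented here ("Copy data from device to host memory", with the note that T must be a value type), a corresponding sketch:

using ManagedCuda;

class DeviceToHostSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var dev = new CudaDeviceVariable<int>(256))  // T must be a struct, as the docs require
        {
            dev.Memset(0u);             // zero-fill the device buffer
            var host = new int[256];

            // "Copy data from device to host memory": the destination is a host
            // array, the source is the CUdeviceptr behind 'dev'.
            dev.CopyToHost(host);
        }
    }
}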
[The hunk then repeats once more for the remainder of CudaManagedMemory_short3 and for CudaManagedMemory_short4: class summaries, constructors, disposal, pointer and size accessors, the indexer, the pointer-attribute properties, and the beginning of yet another verbatim copy of the StreamAttachMemAsync remarks are removed, with each line replaced by another "Copy data from device to host memory" entry carrying "Destination array in host memory" and "Source CUdeviceptr (Pointer to device memory)".]
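Since the de-documented classes are the CudaManagedMemory_* wrappers themselves, a minimal usage sketch may help; it assumes the float variant and a (size-in-elements, attach-flags) constructor, with CUmemAttach_flags.Global being an assumption about the flag the removed docs leave unnamed.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class ManagedMemorySketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        {
            // Size is in elements; the attach flag is an assumption about the
            // constructor the removed docs describe ("allocates the memory on
            // host/device ... In elements").
            var mem = new CudaManagedMemory_float(1024, CUmemAttach_flags.Global);

            // One allocation, visible to host and device: write from the CPU
            // through the per-element indexer...
            for (int i = 0; i < 1024; i++) mem[i] = i;

            // ...and pass the device view to a kernel via the property the
            // removed docs call "CUdeviceptr to managed memory".
            CUdeviceptr dptr = mem.DevicePointer;

            mem.Dispose();
        }
    }
}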
Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Copy data from device to host memory - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Copy data from device to host memory - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. 
It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from device to host memory - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. 
Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. 
Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from device to host memory - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - Enumerator class for CudaManagedMemory_short4 + Copy data from device to host memory + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - + Copy data from device to host memory - + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - + Copy data from device to host memory + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - + Copy data from device to host memory + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - + Copy data from device to host memory + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - + Copy data from device to host memory - + Destination array in host memory + Source CUdeviceptr (Pointer to device memory) - + - A variable located in managed memory. - Type: ushort + Copy data from device to host memory + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Copy data from device to host memory - In elements - + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from device to host memory - The module where the variable is defined in. - The variable name as defined in the cu-file. + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from device to host memory - The kernel which module defines the variable. - The variable name as defined in the cu-file. + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - For dispose + Copy data from device to host memory + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - Dispose + Copy data from device to host memory + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - For IDisposable + Copy data from device to host memory - + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - UIntPtr to managed memory. + Copy data from device to host memory + Destination value in host memory + Source CUdeviceptr (Pointer to device memory) - + - CUdeviceptr to managed memory. 
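The added entries document plain device-to-host copies. As a hedged point of reference — a minimal C sketch against the raw CUDA driver API that such wrapper overloads sit on, not ManagedCuda's own method surface — the underlying call is cuMemcpyDtoH (error checking elided):

    #include <cuda.h>
    #include <stdio.h>

    int main(void) {
        CUdevice dev; CUcontext ctx; CUdeviceptr src;
        unsigned short host[256] = {0};      /* destination array in host memory */

        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuMemAlloc(&src, sizeof(host));      /* source pointer in device memory */
        cuMemsetD16(src, 0x1234, 256);       /* fill 256 16-bit elements with a pattern */

        /* Synchronous device-to-host copy: host array <- device pointer. */
        cuMemcpyDtoH(host, src, sizeof(host));
        printf("host[0] = 0x%x\n", host[0]);

        cuMemFree(src);
        cuCtxDestroy(ctx);
        return 0;
    }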
[The same pattern repeats, collapsed: the remaining CudaManagedMemory_ushort member docs (Dispose/IDisposable, UIntPtr/CUdeviceptr accessors, sizes, indexer, ToHost helper, pointer attributes, AttachAsync/PrefetchAsync/MemAdvise, "Enumerator class for CudaManagedMemory_ushort") are removed, followed by the CudaManagedMemory_ushort1 docs ("A variable located in managed memory. Type: ushort1", the three constructors, and the same member set through its pointer attributes), each replaced by the "Copy data from device to host memory" / "Destination value in host memory" / "Source CUdeviceptr (Pointer to device memory)" entries.]
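The removed AttachAsync, PrefetchAsync and MemAdvise docs describe the standard unified-memory workflow at the driver level: attach a managed allocation to a single stream, advise the driver on the access pattern, and prefetch pages ahead of use. A minimal C sketch of that sequence against the raw driver API (error checking elided; this illustrates the documented semantics, not ManagedCuda's wrappers):

    #include <cuda.h>

    int main(void) {
        CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr p;
        size_t bytes = 1 << 20;

        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);

        /* Managed allocation, visible to host and device. */
        cuMemAllocManaged(&p, bytes, CU_MEM_ATTACH_GLOBAL);

        /* Associate the whole allocation with this stream
           (length must be 0: per-portion association is not supported). */
        cuStreamAttachMemAsync(stream, p, 0, CU_MEM_ATTACH_SINGLE);

        /* Hint: mostly read, occasionally written (device arg ignored here). */
        cuMemAdvise(p, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);

        /* Migrate the pages to the device ahead of the kernels in `stream`. */
        cuMemPrefetchAsync(p, bytes, dev, stream);

        cuStreamSynchronize(stream);
        cuMemFree(p);
        cuStreamDestroy(stream);
        cuCtxDestroy(ctx);
        return 0;
    }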
[Diff continues, collapsed: the CudaManagedMemory_ushort1 AttachAsync/PrefetchAsync/MemAdvise doc blocks are removed as above, while the first of the new CudaContext member docs are added:

+ Returns the device name of the device bound to the actual context. Returns: Device Name.
+ Returns the device's compute capability of the device bound to the actual context. Returns: Device compute capability.
+ Retrieve device properties. Returns: DeviceProperties.
+ Returns numerical values that correspond to the least and greatest stream priorities. Stream priorities follow a convention where lower numbers imply greater priorities; the range of meaningful priorities is [greatestPriority, leastPriority]. A priority outside this range passed at stream creation is clamped to the nearest bound (see cuStreamCreateWithPriority). NULL may be passed for either out-parameter if the value is not desired; both values are 0 if the current context's device does not support stream priorities (see cuDeviceGetAttribute). Parameters: pointers to ints receiving the least and greatest stream priority.
+ Returns the current size of a limit. Parameters: limit to query; returned size in bytes of the limit.
+ Setting a limit is a request by the application to update the current limit maintained by the context; the driver is free to modify the requested value to meet hardware requirements (clamping to minimum or maximum values, rounding up to the nearest element size, etc.), and the application can read back the effective value with the getter. Per-limit restrictions: the GPU thread stack size, the printf() FIFO size, and the malloc()/free() device heap size are only settable on compute capability 2.0 and higher; the printf() FIFO size must be set before loading any module that uses the printf() device system call, and the heap size before launching any kernel that uses the malloc()/free() device system calls, otherwise an error is returned.]
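The stream-priority and limit docs above correspond to the driver entry points cuCtxGetStreamPriorityRange, cuStreamCreateWithPriority, and cuCtxSetLimit/cuCtxGetLimit. A minimal C sketch of both queries (real driver calls; error handling elided):

    #include <cuda.h>
    #include <stdio.h>

    int main(void) {
        CUdevice dev; CUcontext ctx;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);

        /* Meaningful priorities span [greatest, least]; lower number = higher priority. */
        int least, greatest;
        cuCtxGetStreamPriorityRange(&least, &greatest);

        /* Create a stream at the highest priority the device supports. */
        CUstream hi;
        cuStreamCreateWithPriority(&hi, CU_STREAM_NON_BLOCKING, greatest);

        /* Request a larger device-side malloc heap, then read back the value
           the driver actually set (it may clamp or round the request). */
        cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, 64 << 20);
        size_t heap;
        cuCtxGetLimit(&heap, CU_LIMIT_MALLOC_HEAP_SIZE);
        printf("priority range [%d, %d], heap %zu bytes\n", greatest, least, heap);

        cuStreamDestroy(hi);
        cuCtxDestroy(ctx);
        return 0;
    }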
[Diff continues, collapsed:

+ The device-runtime sync depth limit controls the maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize(). It must be set before launching any kernel that synchronizes above the default depth of two grid levels; violations fail with cudaErrorSyncDepthExceeded. It can be raised up to the maximum launch depth of 24, but additional sync-depth levels make the driver reserve device memory that is then unavailable for user allocations; if that reservation fails, cuCtxSetLimit returns an out-of-memory error and the limit can be reset to a lower value. Compute capability 3.5 and higher only.
+ The device-runtime pending launch count controls the maximum number of outstanding device-runtime launches in the current context; a grid is outstanding from launch until it is known to have completed (default 2048 launches). Violations fail with cudaErrorLaunchPendingCountExceeded on the next cudaGetLastError(); raising the limit reserves more device memory upfront, with the same out-of-memory fallback. Compute capability 3.5 and higher only.
- Enumerator class for CudaManagedMemory_ushort1 (and, further down, the CudaManagedMemory_ushort2 docs: "A variable located in managed memory. Type: ushort2", constructors, Dispose/IDisposable, accessors, sizes, indexer, ToHost helper, pointer attributes) are removed in the same pattern as above.
+ Registers a callback function to receive async notifications; userData and the returned callback handle are passed into the callback at notification time to distinguish multiple registrations. Callbacks should return quickly (~10 ms) and queue long-running tasks onto an application thread; they may not call the register/unregister APIs themselves (CUDA_ERROR_NOT_PERMITTED), and they execute in an undefined order and may be serialized.
+ Unregisters an async notification callback so the corresponding function stops receiving notifications.
+ Records an event: captures in hEvent all the activities of the context hCtx at the time of the call; hEvent and hCtx must be from the same CUDA context, otherwise CUDA_ERROR_INVALID_HANDLE is returned. Calls such as cuEventQuery() or cuCtxWaitEvent() can then examine or wait for completion of the captured work; uses of hCtx after the call do not modify hEvent. For a primary context, the event captures the primary context and its green contexts; for a context converted from a green context via cuCtxFromGreenCtx(), only the green context's activities. Returns CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED (and invalidates conflicting captures) if hCtx has a stream in capture mode.
+ Make a context wait on an event: all future work submitted to hCtx waits for the work captured in hEvent; the synchronization is performed on the device and does not block the calling CPU thread. hEvent may be from a different context or device than hCtx, and the same primary/green-context rules apply; the same stream-capture error applies as for recording.
+ OpenGL interop constructor: because the normal context constructor has the same arguments, the OpenGL constructor is private with the argument order inverted and is invoked from a static method. It creates a managed CUDA context for an OpenGL device; OpenGL resources from this device may be registered and mapped through the lifetime of the context. Parameters: CUdevice to map this context to; context creation flags.
+ Gets the CUDA devices associated with the current OpenGL context (SLI parameter).
+ Returns a list of possible CUDA devices to use for a given DirectX device (DirectX device, SLI parameter, DirectX version of the DirectX device).
+ Returns the Direct3D device against which the CUDA context bound to the calling thread was created.
+ Static device queries: the device name and compute capability of the device with ID deviceID, the version number of the installed CUDA driver, device properties by device ID, and the number of CUDA-capable devices.]
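RecordEvent/WaitEvent extend a long-standing stream-level pattern to whole contexts. The context-level entry points are recent enough that sketching them here would be guesswork, so what follows is the classic stream-level analogue — cuEventRecord plus cuStreamWaitEvent — as a minimal C sketch of the same capture-then-wait idea (error handling elided):

    #include <cuda.h>

    /* Producer records an event after its work; the consumer stream waits on it.
       cuCtxRecordEvent/cuCtxWaitEvent apply the same idea to everything in a
       context rather than a single stream. */
    int main(void) {
        CUdevice dev; CUcontext ctx;
        CUstream producer, consumer;
        CUevent done;

        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuStreamCreate(&producer, CU_STREAM_NON_BLOCKING);
        cuStreamCreate(&consumer, CU_STREAM_NON_BLOCKING);
        cuEventCreate(&done, CU_EVENT_DISABLE_TIMING);

        /* ... enqueue producer work here (kernels, async copies) ... */

        cuEventRecord(done, producer);        /* capture producer's work so far    */
        cuStreamWaitEvent(consumer, done, 0); /* device-side wait, CPU not blocked */

        /* ... work enqueued on `consumer` now runs after the captured work ... */

        cuStreamSynchronize(consumer);
        cuEventDestroy(done);
        cuStreamDestroy(producer);
        cuStreamDestroy(consumer);
        cuCtxDestroy(ctx);
        return 0;
    }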
See \ref CUDA_UNIFIED for additional + details. + Note that access granted by this call is unidirectional and that in order to access + memory from the current context in peerContext, a separate symmetric call + to ::cuCtxEnablePeerAccess() is required. + Returns false if the CUdevice of the current context cannot directly access memory + from the CUdevice of peerContext. + Throws an exception if direct access of + peerContext from the current context has already been enabled. + Throws an exception if there is no current context, if peerContext + is not a valid context, or if the current context is peerContext. + Peer context to enable direct access to from the current context + - + - Size in bytes + Disables direct access to memory allocations in a peer context and unregisters any registered allocations. + Peer context to disable direct access to + - + - Size in elements + Fills the CudaDeviceProperties structure - + - Access array per element. + Gets the CUdevice for a given device ordinal number - index in elements + - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Initialize the profiling. + Using this API the user can initialize the CUDA profiler by specifying + the configuration file, output file and output file format. This + API is generally used to profile different sets of counters by + looping the kernel launch. The configFile parameter can be used + to select profiling options including profiler counters. Refer to + the "Compute Command Line Profiler User Guide" for supported + profiler options and counters. + Limitation: The CUDA profiler cannot be initialized with this API + if another profiling tool is already active, as indicated by an + exception. + Name of the config file that lists the counters/options for profiling. + Name of the outputFile where the profiling results will be stored. + outputMode - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Enable profiling. + Enables profile collection by the active profiling tool for the + current context. If profiling is already enabled, then + cuProfilerStart() has no effect. + cuProfilerStart and cuProfilerStop APIs are used to + programmatically control the profiling granularity by allowing + profiling to be done only on selective pieces of code. - managed variable - newly allocated host variable with value from managed memory - + - The CUcontext on which a pointer was allocated or registered + Disables profile collection by the active profiling tool for the + current context. If profiling is already disabled, then + cuProfilerStop() has no effect. + cuProfilerStart and cuProfilerStop APIs are used to + programmatically control the profiling granularity by allowing + profiling to be done only on selective pieces of code. - + - The CUmemorytype describing the physical location of a pointer + Resets all persisting lines in cache to normal status. + CtxResetPersistingL2Cache resets all persisting lines in cache to normal + status. Takes effect on function return. - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Returns the execution affinity setting for the current context. - + - The address at which a pointer's memory may be accessed on the host + Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
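A short sketch of the unidirectional peer-access setup described above. EnablePeerAccess and DisablePeerAccess are assumed to be the ManagedCuda wrappers over ::cuCtxEnablePeerAccess/::cuCtxDisablePeerAccess, with the boolean return implied by the "Returns false" summary; the exact signatures are not confirmed by this diff.

using System;
using ManagedCuda;

class PeerAccessSetup
{
    static void Main()
    {
        var ctx0 = new CudaContext(0);
        var ctx1 = new CudaContext(1);

        // One-way grant: kernels running in ctx0 may access ctx1's allocations.
        // A symmetric call with ctx1 current is needed for the other direction.
        ctx0.SetCurrent();
        if (!ctx0.EnablePeerAccess(ctx1))
            Console.WriteLine("Direct peer access is not supported between these devices.");

        // ... launch kernels in ctx0 that read ctx1's memory ...

        ctx0.DisablePeerAccess(ctx1);
    }
}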
+ Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture + for given \p format and \p numChannels. + Texture format. + Number of channels per texture element. + Returned maximum number of texture elements allocatable for given \p format and \p numChannels. - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Returns the flags for the current context. - + - Synchronize every synchronous memory operation initiated on this region + Sets the flags for the current context. + Flags to set on the current context - + - A process-wide unique ID for an allocated memory region + Gets the Cuda context bound to this managed Cuda object - + - Indicates if the pointer points to managed memory + Gets the Cuda device allocated to the Cuda Context - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Gets the Id of the Cuda device. 
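The device-query helpers documented above compose naturally into a small survey loop. This sketch follows the summaries (driver version, device count, device name, compute capability); the static method names on CudaContext are assumptions inferred from those summaries.

using System;
using ManagedCuda;

class DeviceSurvey
{
    static void Main()
    {
        Console.WriteLine("Driver version: " + CudaContext.GetDriverVersion());

        int count = CudaContext.GetDeviceCount();
        for (int id = 0; id < count; id++)
        {
            // Name and compute capability per device ordinal, as documented above.
            Console.WriteLine("Device {0}: {1}, compute capability {2}",
                id,
                CudaContext.GetDeviceName(id),
                CudaContext.GetDeviceComputeCapability(id));
        }
    }
}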
- Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Indicates if the CudaContext instance created the wrapped cuda context (return = true) or if the CudaContext instance was bound to an existing cuda context. - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. 
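The attach and prefetch summaries above (the memory-advice discussion continues below) suggest the following flow for a managed allocation: restrict it to one stream, then migrate its pages ahead of the kernels queued there. AttachMemAsync, PrefetchAsync, the CudaManagedMemory_float constructor signature, and the CUmemAttach_flags member names are all assumptions patterned on the documented parameters, not confirmed APIs.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class ManagedAttachPrefetch
{
    static void Main()
    {
        var ctx = new CudaContext(0);
        var stream = new CudaStream();
        var data = new CudaManagedMemory_float(1024, CUmemAttach_flags.Global);

        // Associate the whole allocation with this stream only;
        // length must be zero, per the summary above.
        data.AttachMemAsync(stream.Stream, 0, CUmemAttach_flags.Single);

        // Migrate the pages to this context's device ahead of the
        // kernels enqueued in 'stream'.
        data.PrefetchAsync(ctx.Device, stream.Stream);

        stream.Synchronize();
    }
}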
Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Returns the unique Id associated with the context supplied + The Id is unique for the life of the program for this instance of CUDA. - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. 
- The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. 
- - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Number of channels in array - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for - + - Enumerator class for CudaManagedMemory_ushort2 + One channel, e.g. float1, int1, float, int - + - + Two channels, e.g. float2, int2 - - + - + Four channels, e.g. float4, int4 - + - + An one dimensional CUDA array - + - + Creates a new CUDA array. + + + - + - + Creates a new CUDA array from an existing CUarray. + The CUarray won't be destroyed when disposing. + Array properties are obtained by cuArrayGetDescriptor - + - + - A variable located in managed memory. - Type: ushort3 + Creates a new CUDA array from an existing CUarray. + Array properties are obtained by cuArrayGetDescriptor + + The cuArray will be destroyed while disposing, if the CudaArray is the owner - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + For dispose - In elements - - + - Creates a new CudaManagedMemory from definition in cu-file. + Dispose - The module where the variable is defined in. - The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. + For IDisposable - The kernel which module defines the variable. - The variable name as defined in the cu-file. + - + - For dispose + Copy data from host to array memory + T must be of value type, i.e. a struct + source pointer to host memory + Offset in bytes of destination array - + - Dispose + Copy data from host to array memory + T must be of value type, i.e. a struct + source pointer to host memory + Offset in bytes of destination array - + - For IDisposable + Copy data from host to array memory - + Pointer to source data + Number of bytes to copy + Offset in bytes of destination array - + - UIntPtr to managed memory. + Copy data from host to array memory + Offset in bytes of destination array + source array - + - CUdeviceptr to managed memory. + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Size in bytes + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Size in elements + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Access array per element. + Copy data from host to array memory - index in elements - + Offset in bytes of destination array + source array - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Copy data from host to array memory - managed variable - newly allocated host variable with value from managed memory + Offset in bytes of destination array + source array - + - The on which a pointer was allocated or registered + Copy data from host to array memory + Offset in bytes of destination array + source array - + - The describing the physical location of a pointer + Copy data from host to array memory + Offset in bytes of destination array + source array - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. 
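The one-dimensional array type introduced above is used mostly through the host-to-array and array-to-host copy overloads that follow. A hedged round-trip sketch, assuming the constructor takes a format, an element count, and a channel count, and assuming CopyFromHostToThis/CopyFromThisToHost as the overload names implied by the summaries:

using ManagedCuda;
using ManagedCuda.BasicTypes;

class Array1DRoundTrip
{
    static void Main()
    {
        var ctx = new CudaContext(0);

        // 256 elements, one float channel per element.
        var arr = new CudaArray1D(CUArrayFormat.Float, 256, CudaArray1DNumChannels.One);

        var host = new float[256];
        for (int i = 0; i < host.Length; i++) host[i] = i;

        arr.CopyFromHostToThis(host);     // host -> array
        var back = new float[256];
        arr.CopyFromThisToHost(back);     // array -> host

        arr.Dispose();
        ctx.Dispose();
    }
}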
+ Copy data from host to array memory + Offset in bytes of destination array + source array - + - The address at which a pointer's memory may be accessed on the host + Copy data from host to array memory + Offset in bytes of destination array + source array - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Synchronize every synchronous memory operation initiated on this region + Copy data from host to array memory + Offset in bytes of destination array + source array - + - A process-wide unique ID for an allocated memory region + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Indicates if the pointer points to managed memory + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. 
- + Copy data from host to array memory - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - + Offset in bytes of destination array + source array - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Copy data from host to array memory - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + Offset in bytes of destination array + source array - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. 
- Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from host to array memory - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + Offset in bytes of destination array + source array - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. 
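The three advice flags discussed above combine into a common pattern for mostly-read unified memory. MemAdvise is assumed to be the wrapper over ::cuMemAdvise taking the advice and target device, per the parameter summaries, and the CUmemAdvise member names are assumed to mirror the CU_MEM_ADVISE_* driver flags.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class UnifiedMemoryAdvice
{
    static void Main()
    {
        var ctx = new CudaContext(0);
        var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

        // Mostly-read data: allow read-only copies on each accessing processor.
        data.MemAdvise(CUmemAdvise.SetReadMostly, ctx.Device);

        // Prefer keeping the pages on this GPU when a fault forces migration.
        data.MemAdvise(CUmemAdvise.SetPreferredLocation, ctx.Device);

        // Keep this GPU's mapping up to date even if the pages migrate elsewhere.
        data.MemAdvise(CUmemAdvise.SetAccessedBy, ctx.Device);
    }
}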
When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. 
- - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from host to array memory - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Offset in bytes of destination array + source array - + - Enumerator class for CudaManagedMemory_ushort3 + Copy data from host to array memory + Offset in bytes of destination array + source array - + - + Copy data from host to array memory - + Offset in bytes of destination array + source array - + - + Copy data from host to array memory + Offset in bytes of destination array + source array - + - + Copy data from host to array memory + Offset in bytes of destination array + source array - + - + Copy data from host to array memory + Offset in bytes of destination array + source array - + - + Copy data from host to array memory - + Offset in bytes of destination array + source array - + - A variable located in managed memory. - Type: ushort4 + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Copy data from host to array memory - In elements - + Offset in bytes of destination array + source array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to array memory - The module where the variable is defined in. - The variable name as defined in the cu-file. + Offset in bytes of destination array + source array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from host to array memory - The kernel which module defines the variable. - The variable name as defined in the cu-file. + Offset in bytes of destination array + source array - + - For dispose + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Dispose + Copy data from host to array memory + Offset in bytes of destination array + source array - + - For IDisposable + Copy data from host to array memory - + Offset in bytes of destination array + source array - + - UIntPtr to managed memory. + Copy data from host to array memory + Offset in bytes of destination array + source array - + - CUdeviceptr to managed memory. + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Size in bytes + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Size in elements + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Access array per element. + Copy data from host to array memory - index in elements - + Offset in bytes of destination array + source array - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. 
+ Copy data from host to array memory - managed variable - newly allocated host variable with value from managed memory + Offset in bytes of destination array + source array - + - The on which a pointer was allocated or registered + Copy data from host to array memory + Offset in bytes of destination array + source array - + - The describing the physical location of a pointer + Copy data from host to array memory + Offset in bytes of destination array + source array - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Copy data from host to array memory + Offset in bytes of destination array + source array - + - The address at which a pointer's memory may be accessed on the host + Copy data from host to array memory + Offset in bytes of destination array + source array - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Synchronize every synchronous memory operation initiated on this region + Copy data from host to array memory + Offset in bytes of destination array + source array - + - A process-wide unique ID for an allocated memory region + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Indicates if the pointer points to managed memory + Copy data from host to array memory + Offset in bytes of destination array + source array - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. 
- - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Copy data from host to array memory - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - + Offset in bytes of destination array + source array - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Copy data from host to array memory - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + Offset in bytes of destination array + source array - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. 
Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. 
+ Copy data from host to array memory - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + Offset in bytes of destination array + source array - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. 
- This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Copy data from host to array memory - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Offset in bytes of destination array + source array - + - Enumerator class for CudaManagedMemory_ushort4 + Copy data from host to array memory + Offset in bytes of destination array + source array - + - + Copy data from host to array memory - + Offset in bytes of destination array + source array - + - + Copy data from array to host memory + T must be of value type, i.e. a struct + Destination pointer to host memory + Offset in bytes of destination array - + - + Copy data from array to host memory + T must be of value type, i.e. a struct + Destination pointer to host memory + Offset in bytes of destination array - + - + Copy data from array to host memory + Pointer to Destination data + Number of bytes to copy + Offset in bytes of destination array - + - + Copy data from array to host memory - + Offset in bytes of destination array + Destination array - + - A variable located in managed memory. - Type: int + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Copy data from array to host memory - In elements - + Offset in bytes of destination array + Destination array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from array to host memory - The module where the variable is defined in. - The variable name as defined in the cu-file. + Offset in bytes of destination array + Destination array - + - Creates a new CudaManagedMemory from definition in cu-file. + Copy data from array to host memory - The kernel which module defines the variable. - The variable name as defined in the cu-file. + Offset in bytes of destination array + Destination array - + - For dispose + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Dispose + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - For IDisposable + Copy data from array to host memory - + Offset in bytes of destination array + Destination array - + - UIntPtr to managed memory. 
+ Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - CUdeviceptr to managed memory. + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Size in bytes + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Size in elements + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Access array per element. + Copy data from array to host memory - index in elements - + Offset in bytes of destination array + Destination array - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Copy data from array to host memory - managed variable - newly allocated host variable with value from managed memory + Offset in bytes of destination array + Destination array - + - The on which a pointer was allocated or registered + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - The describing the physical location of a pointer + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - The address at which a pointer's memory may be accessed on the host + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Synchronize every synchronous memory operation initiated on this region + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - A process-wide unique ID for an allocated memory region + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Indicates if the pointer points to managed memory + Copy data from array to host memory + Offset in bytes of destination array + Destination array - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. 
- Prefetches memory to the specified destination device
-   Prefetches memory to the specified destination device. devPtr is the base device pointer of the
-   memory to be prefetched and dstDevice is the destination device. count specifies the number of
-   bytes to copy. hStream is the stream in which the operation is enqueued.
-   Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
-   If no physical memory has been allocated for this region, then this memory region will be
-   populated and mapped on the destination device. If there is insufficient memory to prefetch the
-   desired region, the Unified Memory driver may evict pages belonging to other memory regions to
-   make room. If there is no memory that can be evicted, then the Unified Memory driver will
-   prefetch less than what was requested.
-   In the normal case, any mappings to the previous location of the migrated pages are removed and
-   mappings for the new location are only set up on the dstDevice. The application can exercise
-   finer control on these mappings using ::cudaMemAdvise.
-   Destination device to prefetch to
-   Stream to enqueue prefetch operation
-   Note that this function is asynchronous with respect to the host and all work on other devices.
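Continuing the sketch above, a prefetch round trip might look like this; the PrefetchAsync wrapper name and the CPU-device constant are assumptions:

    data.PrefetchAsync(ctx.Device, stream.Stream);   // hypothetical wrapper around cuMemPrefetchAsync
    // ... enqueue kernels on 'stream' that touch 'data' ...
    // Prefetch back to the CPU; 'CUdevice.CPU' stands in for CU_DEVICE_CPU and is assumed.
    data.PrefetchAsync(CUdevice.CPU, stream.Stream);
    stream.Synchronize();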
- Advise about the usage of a given memory range
-   Advise the Unified Memory subsystem about the usage pattern for the memory range starting at
-   devPtr with a size of count bytes.
-   The \p advice parameter can take the following values:
-   ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read from and
-   only occasionally written to. This allows the driver to create read-only copies of the data in
-   a processor's memory when that processor accesses it. Similarly, if cuMemPrefetchAsync is
-   called on this region, it will create a read-only copy of the data on the destination
-   processor. When a processor writes to this data, all copies of the corresponding page are
-   invalidated except for the one where the write occurred. The \p device argument is ignored for
-   this advice.
-   ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any
-   read-duplicated copies of the data will be freed no later than the next write access to that
-   data.
-   ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the data to
-   be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the preferred
-   location as CPU memory. Setting the preferred location does not cause data to migrate to that
-   location immediately. Instead, it guides the migration policy when a fault occurs on that
-   memory region. If the data is already in its preferred location and the faulting processor can
-   establish a mapping without requiring the data to be migrated, then the migration will be
-   avoided. On the other hand, if the data is not in its preferred location or if a direct mapping
-   cannot be established, then it will be migrated to the processor accessing it. It is important
-   to note that setting the preferred location does not prevent data prefetching done using
-   ::cuMemPrefetchAsync.
-   Having a preferred location can override the thrash detection and resolution logic in the
-   Unified Memory driver. Normally, if a page is detected to be constantly thrashing between, say,
-   CPU and GPU memory, the page will eventually be pinned to CPU memory by the Unified Memory
-   driver. But if the preferred location is set as GPU memory, then the page will continue to
-   thrash indefinitely. When the Unified Memory driver has to evict pages from a certain location
-   on account of that memory being oversubscribed, the preferred location will be used to decide
-   the destination to which a page should be evicted.
-   If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the
-   preferred location will be ignored for that subset.
-   ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of
-   ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION and changes the preferred location to none.
-   ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by
-   \p device. This does not cause data migration and has no impact on the location of the data per
-   se. Instead, it causes the data to always be mapped in the specified processor's page tables,
-   as long as the location of the data permits a mapping to be established. If the data gets
-   migrated for any reason, the mappings are updated accordingly. This advice is useful in
-   scenarios where data locality is not important but avoiding faults is, as described above for
-   the managed-memory overload.
-   ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The
-   current set of mappings may be removed at any time, causing accesses to result in page faults.
-   Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
-   Note that this function is asynchronous with respect to the host and all work on other devices.
-   Pointer to memory to set the advice for
-   Size in bytes of the memory range
-   Advice to be applied for the specified memory range
-   Device to apply the advice for
- Advise about the usage of a given memory range (same description as above, for the overload
- taking a managed memory variable)
-   managed memory variable
-   Advice to be applied for the specified memory range
-   Device to apply the advice for
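A short sketch of how these advice values might be applied through a wrapper like the one documented above; the MemAdvise method, the CUmemAdvise enum members, and the CUdevice construction are assumptions inferred from the parameter lists:

    // Read-mostly input that several GPUs consult: allow read-only replication,
    // and prefer to keep the master copy on this device.
    data.MemAdvise(CUmemAdvise.SetReadMostly, ctx.Device);        // device ignored for this advice
    data.MemAdvise(CUmemAdvise.SetPreferredLocation, ctx.Device);

    // A peer GPU that reads occasionally: map the range there to avoid
    // faults without forcing migration (device handle construction assumed).
    data.MemAdvise(CUmemAdvise.SetAccessedBy, new CUdevice(1));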
- Enumerator class for CudaManagedMemory_int
- A variable located in managed memory. Type: int1
- Creates a new CudaManagedMemory and allocates the memory on host/device.
-   In elements
- Creates a new CudaManagedMemory from definition in cu-file.
-   The module in which the variable is defined.
-   The variable name as defined in the cu-file.
- Creates a new CudaManagedMemory from definition in cu-file.
-   The kernel whose module defines the variable.
-   The variable name as defined in the cu-file.
- For dispose
- Dispose
- For IDisposable
- UIntPtr to managed memory.
- CUdeviceptr to managed memory.
- Size in bytes
- Size in elements
- Access array per element.
-   index in elements
- If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
- Converts a managed variable to a host value. In case of multiple managed values (array), only the
- first value is converted.
-   managed variable
-   newly allocated host variable with value from managed memory
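Taken together, the members listed above suggest a lifecycle like the following sketch; the constructor, indexer, DevicePointer property, and conversion semantics follow the removed comments, but the exact signatures are assumed:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    using (var v = new CudaManagedMemory_int(16))  // allocates 16 elements on host/device
    {
        for (int i = 0; i < 16; i++)
            v[i] = i;                              // per-element access via the indexer

        CUdeviceptr devPtr = v.DevicePointer;      // pass to kernels as a device pointer
        int first = v;                             // conversion yields only the first element (operator assumed)
    }                                              // IDisposable: frees the CUDA handle if owned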
- (pointer-attribute members documented as for CudaManagedMemory_int above)
+ Copy data from array to array
+   Destination array
+   source array
+   Size of memory copy in bytes
+   Offset in bytes of destination array
+   Offset in bytes of source array
+ Copy data from array to array
+   Destination array
+   Size of memory copy in bytes
+   Offset in bytes of destination array
+   Offset in bytes of source array
+ Copy data from array to device
+   DevicePointer to copy data to
+   number of bytes to copy
+   Offset in bytes of source array
+ Copy data from device to array
+   DevicePointer to copy data from
+   number of bytes to copy
+   Offset in bytes of source array
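For the copy entries just added, a sketch of moving data between linear device memory and a CUDA array could look like this; the CudaArray1D constructor arguments and the copy-method names are assumptions based on the parameter docs:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    var devBuf = new CudaDeviceVariable<float>(512);
    var arr = new CudaArray1D(CUArrayFormat.Float, 512, CudaArray1DNumChannels.One);

    // device -> array, then array -> device; offsets are in bytes (names assumed).
    arr.CopyFromDeviceToThis(devBuf.DevicePointer, devBuf.SizeInBytes, 0);
    arr.CopyFromThisToDevice(devBuf.DevicePointer, devBuf.SizeInBytes, 0);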
- Attach memory to a stream asynchronously (same description and parameters as above)
- Prefetches memory to the specified destination device (same description and parameters as above)
- Advise about the usage of a given memory range (same description and parameters as above, for
- both overloads)
- Enumerator class for CudaManagedMemory_int1
+ Returns the memory requirements of a CUDA array
+ Returns the array width in elements
+ Returns the array width in bytes
+ Returns the wrapped CUarray
+ Returns the wrapped CUDAArrayDescriptor
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
+ Number of channels in array
+   One channel, e.g. float1, int1, float, int
+   Two channels, e.g. float2, int2
+   Four channels, e.g. float4, int4
- A variable located in managed memory. Type: int2
- (constructor, Dispose, pointer, size, indexer, conversion and pointer-attribute members
- documented as for CudaManagedMemory_int above)
+ A two dimensional CUDA array
+ Creates a new CUDA array.
+   In elements
+   In elements
+ Creates a new CUDA array from an existing CUarray.
+   The CUarray won't be destroyed when disposing.
+   Array properties are obtained by cuArrayGetDescriptor
+ Creates a new CUDA array from an existing CUarray.
+   Array properties are obtained by cuArrayGetDescriptor
+   The cuArray will be destroyed while disposing if the CudaArray is the owner
+ For dispose
+ Dispose
+ For IDisposable
+ A raw data copy method
+   2D copy parameters
+ A raw unaligned copy method
+ Copy from Host to this array
+   Source
+ Copy data from this array to host
+   IntPtr to destination in host memory
+ Copy from Host to this array
+   Host array base type
+   Source
+ Copy data from this array to host
+   Host array base type
+   Destination
+ Copy from a pitched device variable to this array
+   device variable base type
+   Source
+ Copy from this array to a pitched device variable
+   device variable base type
+   Destination
+ Copy array to array
+ Copy array to array
+ Returns the memory requirements of a CUDA array
+ Returns the wrapped CUarray
+ Returns the wrapped CUDAArrayDescriptor
+ Returns the Height of the array
+ Returns the array width in elements
+ Returns the array width in bytes
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
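The new CudaArray2D surface reads naturally as a round trip. A minimal sketch, with the constructor enum values and copy-method names assumed from the summaries above:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    // 256 x 256 single-channel float array; dimensions are in elements.
    using (var arr2d = new CudaArray2D(CUArrayFormat.Float, 256, 256, CudaArray2DNumChannels.One))
    {
        var host = new float[256 * 256];
        arr2d.CopyFromHostToThis(host);  // host -> array
        arr2d.CopyFromThisToHost(host);  // array -> host
    }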
- Attach memory to a stream asynchronously (same description and parameters as above)
- Prefetches memory to the specified destination device (same description and parameters as above)
- Advise about the usage of a given memory range (same description and parameters as above, for
- both overloads)
- Enumerator class for CudaManagedMemory_int2
- A variable located in managed memory. Type: int3
- (constructor, Dispose, pointer, size, indexer, conversion and pointer-attribute members
- documented as for CudaManagedMemory_int above)
+ Number of channels in array
+   One channel, e.g. float1, int1, float, int
+   Two channels, e.g. float2, int2
+   Four channels, e.g. float4, int4
+ A three dimensional CUDA array
+ Creates a new CUDA array.
+   In elements
+   In elements
+   In elements
+ Creates a new CUDA array from an existing CUarray.
+   The CUarray won't be destroyed when disposing.
+   Array properties are obtained by cuArrayGetDescriptor
+ Creates a new CUDA array from an existing CUarray.
+   Array properties are obtained by cuArrayGetDescriptor
+   The cuArray will be destroyed while disposing, if the CudaArray is the owner
+ For dispose
+ Dispose
+ For IDisposable
+ A raw data copy method
+   3D copy parameters
+ Copy from Host to this array
+   Source
+ Copy data from this array to host
+   IntPtr to destination in host memory
+ Copy from Host to this array
+   Host array base type
+   Source
+ Copy data from this array to host
+   Host array base type
+   Destination
+ Copy from a pitched device variable to this array
+   Source
+ Copy from a pitched device variable to this array
+   Source
+   Pitch in bytes
+ Copy from this array to a pitched device variable
+   Destination
+ Copy from this array to a pitched device variable
+   Destination
+   Pitch in bytes
+ Copy array to array
+ Copy array to array
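The three-dimensional variant mirrors the 2D sketch above; constructor arguments and method names are again assumptions:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    // 64^3 single-channel volume; all three dimensions are in elements.
    using (var arr3d = new CudaArray3D(CUArrayFormat.Float, 64, 64, 64,
                                       CudaArray3DNumChannels.One, CUDAArray3DFlags.None))
    {
        var volume = new float[64 * 64 * 64];
        arr3d.CopyFromHostToThis(volume);  // host -> array
        arr3d.CopyFromThisToHost(volume);  // array -> host
    }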
+ Returns the layout properties of a sparse CUDA array
+   Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+   If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE, ::CUDA_ERROR_INVALID_VALUE
+   will be returned.
+   If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains
+   ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize
+   represents the total size of the array. Otherwise, it will be zero. Also, the returned value in
+   ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+   Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For
+   CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be
+   returned. Instead, ::cuMipmappedArrayGetSparseProperties must be used to obtain the sparse
+   properties of the entire CUDA mipmapped array to which \p array belongs.
+ Gets a CUDA array plane from a CUDA array
+   Returns a CUDA array that represents a single format plane of the CUDA array \p hArray.
+   If planeIdx is greater than the maximum number of planes in this array or if the array does not
+   have a multi-planar format, e.g. ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is
+   returned.
+   Note that if \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx
+   returns a CUDA array of the same size as \p hArray but with one channel and
+   ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. If 1 is passed for \p planeIdx, then the returned
+   CUDA array has half the height and width of \p hArray with two channels and
+   ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ Returns the memory requirements of a CUDA array
+ Returns the wrapped CUarray
+ Returns the wrapped CUDAArray3DDescriptor
+ Returns the Depth of the array
+ Returns the Height of the array
+ Returns the array width in elements
+ Returns the array width in bytes
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
+ Wraps a CUevent handle.
+ Creates a new Event using
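Since the diff also introduces the CUevent wrapper, here is the usual event-timing pattern as a sketch; the Record signature and the static ElapsedTime helper are assumptions:

    using ManagedCuda;

    var ctx = new CudaContext(0);
    var stream = new CudaStream();
    var start = new CudaEvent();
    var stop = new CudaEvent();

    start.Record(stream.Stream);
    // ... enqueue kernels or copies on 'stream' ...
    stop.Record(stream.Stream);
    stop.Synchronize();

    float ms = CudaEvent.ElapsedTime(start, stop);  // helper name assumed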
+ Creates a new Event using
- Attach memory to a stream asynchronously (same description and parameters as above)
- Prefetches memory to the specified destination device (same description and parameters as above)
- Advise about the usage of a given memory range (same description as above)
-   Pointer to memory to set the advice for
-   Size in bytes of the memory range
-   Advice to be applied for the specified memory range
-   Device to apply the advice for
- Advise about the usage of a given memory range (same description as above)
-   ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by
-   \p device. This does not cause data migration and has no impact on the location of the data per
-   se.
Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Creates a new Event - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Parameters for event creation - + - Enumerator class for CudaManagedMemory_int3 - + - + - - + - + - + - + - + - + - + Dispose - - + - + For IDisposable + - + - + Returns the wrapped CUevent handle - + - + Records an event. If stream is non-zero, the event is recorded after all preceding operations in the stream have been + completed; otherwise, it is recorded after all preceding operations in the CUDA context have been completed. Since + operation is asynchronous, and/or must be used to determine when the event + has actually been recorded. + If has previously been called and the event has not been recorded yet, this function throws + . - + - + Records an event. If stream is non-zero, the event is recorded after all preceding operations in the stream have been + completed; otherwise, it is recorded after all preceding operations in the CUDA context have been completed. Since + operation is asynchronous, and/or must be used to determine when the event + has actually been recorded. + If has previously been called and the event has not been recorded yet, this function throws + . + + + + + + Records an event + Captures in \p hEvent the contents of \p hStream at the time of this call. + \p hEvent and \p hStream must be from the same context. + Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + examine or wait for completion of the work that was captured. Uses of + \p hStream after this call do not modify \p hEvent. See note on default + stream behavior for what is captured in the default case. + ::cuEventRecordWithFlags() can be called multiple times on the same event and + will overwrite the previously captured state. Other APIs such as + ::cuStreamWaitEvent() use the most recently captured state at the time + of the API call, and are not affected by later calls to + ::cuEventRecordWithFlags().
Before the first call to ::cuEventRecordWithFlags(), an + event represents an empty set of work, so for example ::cuEventQuery() + would return ::CUDA_SUCCESS. + + + + + Waits until the event has actually been recorded. If has been called on this event, the function returns + . Waiting for an event that was created with the + flag will cause the calling CPU thread to block until the event has actually been recorded. + If has previously been called and the event has not been recorded yet, this function throws . + + + + + Returns true if the event has actually been recorded, or false if not. If + has not been called on this event, the function throws . - + - A variable located in managed memory. - Type: int4 + Computes the elapsed time between two events (in milliseconds with a resolution of around 0.5 microseconds). If + either event has not been recorded yet, this function throws . If either event has been + recorded with a non-zero stream, the result is undefined. + + + - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Represents a CUDA graph. On dispose() all graph nodes will be destroyed, too! - In elements - + - Creates a new CudaManagedMemory from definition in cu-file. + Creates a new CudaGraph - The module where the variable is defined in. - The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. + For clone graph method - The kernel which module defines the variable. - The variable name as defined in the cu-file. - + For dispose - + Dispose - + For IDisposable - + - UIntPtr to managed memory. + Creates an empty node and adds it to a graph + Creates a new node which performs no operation, and adds it to the graph with + dependencies specified via dependencies. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + + An empty node performs no operation during execution, but can be used for + transitive ordering. For example, a phased execution graph with 2 groups of n + nodes with a barrier between them can be represented using an empty node and + 2*n dependency edges, rather than no empty node and n^2 dependency edges. + can be null + A handle to the new node will be returned. - + - CUdeviceptr to managed memory. + Creates a memset node and adds it to a graph + Creates a new memset node and adds it to graph with + dependencies specified via dependencies. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + The element size must be 1, 2, or 4 bytes. + When the graph is launched, the node will perform the memset described by memsetParams. + can be null + When the graph is launched, the node will perform the memset described by memsetParams. + Cuda context used for the operation + A handle to the new node will be returned. - + - Size in bytes + Creates a memset node and adds it to a graph + Creates a new memset node and adds it to graph with + dependencies specified via dependencies. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + The element size must be 1, 2, or 4 bytes. + When the graph is launched, the node will perform the memset described by memsetParams. + can be null + When the graph is launched, the node will perform the memset on deviceVariable.
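Since the Record/Synchronize/ElapsedTime members above are the usual GPU timing idiom, a minimal sketch may help. It assumes ManagedCuda's CudaEvent exposes Record(), Synchronize() and a static ElapsedTime(start, stop), as the summaries above suggest; verify the exact signatures against the release you build against.

using System;
using ManagedCuda;

class EventTimingSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        {
            var start = new CudaEvent();
            var stop = new CudaEvent();

            start.Record();     // recorded after all preceding work in the context
            // ... enqueue the kernels / memcpys to be timed here ...
            stop.Record();

            stop.Synchronize(); // block until 'stop' has actually been recorded
            float ms = CudaEvent.ElapsedTime(start, stop); // resolution ~0.5 µs per the docs above
            Console.WriteLine($"GPU section took {ms} ms");

            start.Dispose();
            stop.Dispose();
        }
    }
}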
+ Value to set + Cuda context used for the operation + A handle to the new node will be returned. - + - Size in elements + Creates a memset node and adds it to a graph + Creates a new memset node and adds it to graph with + dependencies specified via dependencies. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + The element size must be 1, 2, or 4 bytes. + When the graph is launched, the node will perform the memset described by memsetParams. + can be null + When the graph is launched, the node will perform the memset on deviceVariable. + Value to set + Cuda context used for the operation + A handle to the new node will be returned. - + - Access array per element. + Creates a memcpy node and adds it to a graph + Creates a new memcpy node and adds it to graph with + dependencies specified via dependencies. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + A handle to the new node will be returned. + When the graph is launched, the node will perform the memcpy described by copyParams. + See ::cuMemcpy3D() for a description of the structure and its restrictions. + Memcpy nodes have some additional restrictions with regards to managed memory, if the + system contains at least one device which has a zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer + to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed + for those operand(s). The managed memory will be treated as residing on either the + host or the device, depending on which memory type is specified. - index in elements - + can be null + Parameters for the memory copy + Cuda context used for the operation + A handle to the new node will be returned. - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Creates a kernel execution node and adds it to a graph + Creates a new kernel execution node and adds it to the graph with + dependencies specified via dependencies and arguments specified in nodeParams. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + A handle to the new node will be returned. + can be null + Parameters for the GPU execution node + A handle to the new node will be returned. - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Creates a kernel execution node and adds it to a graph + Creates a new kernel execution node and adds it to the graph with + dependencies specified via dependencies and arguments specified in nodeParams. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + A handle to the new node will be returned. - managed variable - newly allocated host variable with value from managed memory + can be null + Kernel to execute + Kernel parameters to pass. An array of IntPtr, each of them pointing to a parameter. Note that the parameters must be pinned by GC! + Extra data + A handle to the new node will be returned.
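To make the node-creation members above concrete, here is a hedged sketch of building and running a one-node graph plus an ordering barrier. The member names (AddMemsetNode, AddEmptyNode, Instantiate, Launch) are inferred from the XML summaries in this diff and may differ slightly in the actual ManagedCuda API; treat them as assumptions.

using ManagedCuda;

class GraphBuildSketch
{
    static void Main()
    {
        var ctx = new CudaContext(0);
        var stream = new CudaStream();
        var data = new CudaDeviceVariable<uint>(1024); // 4-byte elements satisfy the 1/2/4-byte rule above

        var graph = new CudaGraph();

        // null dependencies -> node is placed at the root of the graph (see docs above).
        var memsetNode = graph.AddMemsetNode(null, data, 0u, ctx);

        // An empty node used purely for transitive ordering, as described above.
        var barrier = graph.AddEmptyNode(new[] { memsetNode });

        var exec = graph.Instantiate(); // validates structural/intra-node constraints
        exec.Launch(stream);            // ordered behind prior work in 'stream'

        stream.Synchronize();
    }
}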
- + - The on which a pointer was allocated or registered + Creates a child graph node and adds it to a graph + Creates a new node which executes an embedded graph, and adds it to this Graph with + dependencies specified via dependencies. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + A handle to the new node will be returned. + The node executes an embedded child graph. The child graph is cloned in this call. + can be null + + A handle to the new node will be returned. - + - The describing the physical location of a pointer + Creates a host execution node and adds it to a graph + Creates a new CPU execution node and adds it to the graph with + dependencies specified via dependencies. + It is possible for dependencies to be null, in which case the node will be placed + at the root of the graph. Dependencies may not have any duplicate entries. + A handle to the new node will be returned. + When the graph is launched, the node will invoke the specified CPU function. + can be null + Host function to execute + User data for host function. Note that the data object must be pinned by GC! + A handle to the new node will be returned. - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Creates an event record node and adds it to a graph + Creates a new event record node and adds it to \p hGraph with \p numDependencies + dependencies specified via \p dependencies and arguments specified in \p params. + It is possible for \p numDependencies to be 0, in which case the node will be placed + at the root of the graph. \p dependencies may not have any duplicate entries. + A handle to the new node will be returned in \p phGraphNode. + Each launch of the graph will record \p event to capture execution of the + node's dependencies. + Dependencies of the node + Event for the node + Returns newly created node - + - The address at which a pointer's memory may be accessed on the host + Creates an event wait node and adds it to a graph + Creates a new event wait node and adds it to \p hGraph with \p numDependencies + dependencies specified via \p dependencies and arguments specified in \p params. + It is possible for \p numDependencies to be 0, in which case the node will be placed + at the root of the graph. \p dependencies may not have any duplicate entries. + A handle to the new node will be returned in \p phGraphNode. + The graph node will wait for all work captured in \p event. See ::cuEventRecord() + for details on what is captured by an event. \p event may be from a different context + or device than the launch stream. + Dependencies of the node + Event for the node + Returns newly created node - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Creates an external semaphore signal node and adds it to a graph + Creates a new external semaphore signal node and adds it to \p hGraph with \p + numDependencies dependencies specified via \p dependencies and arguments specified + in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the + node will be placed at the root of the graph. \p dependencies may not have any + duplicate entries. A handle to the new node will be returned in \p phGraphNode.
+ Dependencies of the node + Parameters for the node + Returns newly created node - + - Synchronize every synchronous memory operation initiated on this region + Creates an external semaphore wait node and adds it to a graph + Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies + dependencies specified via \p dependencies and arguments specified in \p nodeParams. + It is possible for \p numDependencies to be 0, in which case the node will be placed + at the root of the graph. \p dependencies may not have any duplicate entries. A handle + to the new node will be returned in \p phGraphNode. + Dependencies of the node + Parameters for the node + Returns newly created node - + - A process-wide unique ID for an allocated memory region + Creates an allocation node and adds it to a graph + Creates a new allocation node and adds it to \p hGraph with \p numDependencies + dependencies specified via \p dependencies and arguments specified in \p nodeParams. + It is possible for \p numDependencies to be 0, in which case the node will be placed + at the root of the graph. \p dependencies may not have any duplicate entries. A handle + to the new node will be returned in \p phGraphNode. + Dependencies of the node + Parameters for the node + Returns newly created node - + - Indicates if the pointer points to managed memory + Creates a memory free node and adds it to a graph + Creates a new memory free node and adds it to \p hGraph with \p numDependencies + dependencies specified via \p dependencies and arguments specified in \p nodeParams. + It is possible for \p numDependencies to be 0, in which case the node will be placed + at the root of the graph. \p dependencies may not have any duplicate entries. A handle + to the new node will be returned in \p phGraphNode. + Dependencies of the node + Parameters for the node + Returns newly created node - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. 
In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Adds a node of arbitrary type to a graph + + Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies + dependencies specified via \p dependencies. \p numDependencies may be 0. + \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have + any duplicate entries. + + \p nodeParams is a tagged union. The node type should be specified in the \p type field, + and type-specific parameters in the corresponding union member. All unused bytes - that + is, \p reserved0 and all bytes past the utilized union member - must be set to zero. + It is recommended to use brace initialization or memset to ensure all bytes are + initialized. + + Note that for some node types, \p nodeParams may contain "out parameters" which are + modified during the call, such as \p nodeParams->alloc.dptr. - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - + Dependencies of the node + Specification of the node + Returns newly created node - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. + Adds a node of arbitrary type to a graph - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies + dependencies specified via \p dependencies. \p numDependencies may be 0. + \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have + any duplicate entries. - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. + \p nodeParams is a tagged union. The node type should be specified in the \p type field, + and type-specific parameters in the corresponding union member. All unused bytes - that + is, \p reserved0 and all bytes past the utilized union member - must be set to zero. 
+ It is recommended to use brace initialization or memset to ensure all bytes are + initialized. - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + Note that for some node types, \p nodeParams may contain "out parameters" which are + modified during the call, such as \p nodeParams->alloc.dptr. - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + Dependencies of the node + Optional edge data for the dependencies. If NULL, the data is assumed to be default (zeroed) for all dependencies. + Specification of the node + Returns newly created node - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. 
- - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Creates a memory free node and adds it to a graph + Creates a new memory free node and adds it to \p hGraph with \p numDependencies + dependencies specified via \p dependencies and arguments specified in \p nodeParams. + It is possible for \p numDependencies to be 0, in which case the node will be placed + at the root of the graph. \p dependencies may not have any duplicate entries. A handle + to the new node will be returned in \p phGraphNode. - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + Dependencies of the node + Parameters for the node + Returns newly created node - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. 
- - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Clones a graph + This function creates a copy of the original Graph. 
+ All parameters are copied into the cloned graph. The original graph may be modified + after this call without affecting the clone. + Child graph nodes in the original graph are recursively copied into the clone. - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for - + - Enumerator class for CudaManagedMemory_int4 - + - + - - - - - - - - - - - - - - - - + - A variable located in managed memory. - Type: uint - - + - Creates a new CudaManagedMemory and allocates the memory on host/device. - In elements - + - Creates a new CudaManagedMemory from definition in cu-file. - The module where the variable is defined in. - The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. - The kernel which module defines the variable. - The variable name as defined in the cu-file. - + - For dispose - + - Dispose - + - For IDisposable - - + - UIntPtr to managed memory. - + - CUdeviceptr to managed memory. - + - Size in bytes - + - Size in elements - + - Access array per element. - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + + Finds a cloned version of a node + This function returns the node corresponding to originalNode + in the original graph. + This cloned graph must have been cloned from the original Graph via its Clone() method. + OriginalNode must have been in that graph at the time of the call to + Clone(), and the corresponding cloned node in this graph must not have + been removed. The cloned node is then returned. + - + - + Returns a graph's nodes - + - + Returns a graph's root nodes - + - + Returns a graph's dependency edges + + - + - + Returns a graph's dependency edges + + + - + - + Adds dependency edges to a graph + Elements in from and to at corresponding indices define a dependency. + Each node in from and to must belong to this Graph. + Specifying an existing dependency will return an error. + + - + - + Adds dependency edges to a graph + Elements in from and to at corresponding indices define a dependency. + Each node in from and to must belong to this Graph. + Specifying an existing dependency will return an error. + + + - + - + Removes dependency edges from a graph + Elements in from and to at corresponding indices define a dependency. + Each node in from and to must belong to this Graph. + Specifying a non-existing dependency will return an error. + + - + - + Removes dependency edges from a graph + Elements in from and to at corresponding indices define a dependency. + Each node in from and to must belong to this Graph. + Specifying a non-existing dependency will return an error. + + + - + - + Creates an executable graph from a graph + Instantiates this Graph as an executable graph. The graph is validated for any + structural constraints or intra-node constraints which were not previously + validated. If instantiation is successful, a handle to the instantiated graph + is returned. - + - + Creates an executable graph from a graph + Instantiates \p hGraph as an executable graph. The graph is validated for any + structural constraints or intra-node constraints which were not previously + validated. If instantiation is successful, a handle to the instantiated graph + is returned in \p phGraphExec. - + - + Write a DOT file describing graph structure + Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. + By default this includes the graph topology, node types, node id, kernel names and memcpy direction. + \p flags can be specified to write more detailed information about each node type such as + parameter values, kernel attributes, node and function handles.
The path to write the DOT file to + Flags from CUgraphDebugDot_flags for specifying which additional node information to write + + + + Create a conditional handle + Creates a conditional handle associated with \p hGraph. + The conditional handle must be associated with a conditional node in this graph or one of its children. + Handles not associated with a conditional node may cause graph instantiation to fail. + Handles can only be set from the context with which they are associated. + Context for the handle and associated conditional node. + Optional initial value for the conditional variable. + Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0. + + + + Returns the inner graph handle + + + + Represents an executable CUDA graph. + + + + For clone graph method + + + + For dispose + + + + Dispose + + + + For IDisposable + + + + Launches an executable graph in a stream. + Only one instance of GraphExec may be executing + at a time. Each launch is ordered behind both any previous work in Stream + and any previous launches of GraphExec. To execute a graph concurrently, it must be + instantiated multiple times into multiple executable graphs. + + + + Uploads an executable graph in a stream + Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of + the same \p hGraphExec will be serialized. Each upload is ordered behind both any + previous work in \p hStream and any previous launches of \p hGraphExec. + Stream in which to upload the graph + + + + Sets the parameters for a kernel node in the given graphExec + Sets the parameters of a kernel node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated. + \p hNode must not have been removed from the original graph. The \p func field + of \p nodeParams cannot be modified and must match the original value. + All other values can be modified. + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + \p hNode is also not modified by this call. + + + + Sets the parameters for a memcpy node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + contained \p copyParams at instantiation. hNode must remain in the graph which was + used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + The source and destination memory in \p copyParams must be allocated from the same + contexts as the original source and destination memory.
Both the instantiation-time + memory operands and the memory operands in \p copyParams must be 1-dimensional. + Zero-length operations are not supported. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. hNode is also + not modified by this call. + Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or + either the original or new memory operands are multidimensional. + + + + + Sets the parameters for a memset node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + contained \p memsetParams at instantiation. hNode must remain in the graph which was + used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + The destination memory in \p memsetParams must be allocated from the same + contexts as the original destination memory. Both the instantiation-time + memory operand and the memory operand in \p memsetParams must be 1-dimensional. + Zero-length operations are not supported. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. hNode is also + not modified by this call. + Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or + either the original or new memory operand is multidimensional. + + + + + Sets the parameters for a host node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + contained \p nodeParams at instantiation. hNode must remain in the graph which was + used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. hNode is also + not modified by this call. + + + + + Updates node parameters in the child graph node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained + in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. + \p hNode must remain in the graph which was used to instantiate \p hGraphExec. + Changed edges to and from \p hNode are ignored. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. \p hNode is also + not modified by this call. + The topology of \p childGraph, as well as the node insertion order, must match that + of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions + on what can be updated in an instantiated graph. The update is recursive, so child graph + nodes contained within the top level child graph will also be updated. + + + + + Sets the parameters for an external semaphore signal node in the given graphExec + Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated. + hNode must not have been removed from the original graph. + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + hNode is also not modified by this call. + Changing \p nodeParams->numExtSems is not supported.
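The exec-level setters above all follow the same pattern: take the node handle from the original (non-executable) graph, then patch the instantiated graph so that only future launches observe the change. A sketch under stated assumptions: SetKernelNodeParams stands in here for cuGraphExecKernelNodeSetParams, and CudaGraphExec/CudaKernelNodeParams mirror the class names suggested by the summaries in this diff; the actual ManagedCuda member and type names may differ.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class ExecPatchSketch
{
    // 'kernelNode' must still exist in the graph 'exec' was instantiated from.
    static void Patch(CudaGraphExec exec, CUgraphNode kernelNode, CudaKernelNodeParams newParams)
    {
        // Per the docs above: 'func' must match the original value;
        // grid/block dimensions and arguments may change.
        exec.SetKernelNodeParams(kernelNode, newParams);

        // Already enqueued or running launches of 'exec' keep the old
        // parameters; only future launches observe the update.
    }
}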
+ + + + Sets the parameters for an external semaphore wait node in the given graphExec + Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated. + hNode must not have been removed from the original graph. + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + hNode is also not modified by this call. + Changing \p nodeParams->numExtSems is not supported. + + + + + Sets the parameters for a batch mem op node in the given graphExec + Sets the parameters of a batch mem op node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated. + The following fields on operations may be modified on an executable graph: + op.waitValue.address + op.waitValue.value[64] + op.waitValue.flags bits corresponding to wait type (i.e. the CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified) + op.writeValue.address + op.writeValue.value[64] + Other fields, such as the context, count or type of operations, and other types of operations such as membars, may not be modified. + \p hNode must not have been removed from the original graph. + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + \p hNode is also not modified by this call. + The paramArray inside \p nodeParams is copied and therefore it can be + freed after the call returns. + + + + + Updates a graph node's parameters in an instantiated graph + Sets the parameters of a node in an executable graph \p hGraphExec. The node is identified + by the corresponding node \p hNode in the non-executable graph from which the executable + graph was instantiated. \p hNode must not have been removed from the original graph. + + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + hNode is also not modified by this call. + + Allowed changes to parameters on executable graphs are as follows: + Node type | Allowed changes + kernel | See ::cuGraphExecKernelNodeSetParams + memcpy | Addresses for 1-dimensional copies if allocated in same context; see ::cuGraphExecMemcpyNodeSetParams + memset | Addresses for 1-dimensional memsets if allocated in same context; see ::cuGraphExecMemsetNodeSetParams + host | Unrestricted + child graph | Topology must match and restrictions apply recursively; see ::cuGraphExecUpdate + event wait | Unrestricted + event record | Unrestricted + external semaphore signal | Number of semaphore operations cannot change + external semaphore wait | Number of semaphore operations cannot change + memory allocation | API unsupported + memory free | API unsupported + batch memops | Addresses, values, and operation type for wait operations; see ::cuGraphExecBatchMemOpNodeSetParams + + Corresponding node from the graph from which graphExec was instantiated + Updated Parameters to set + + + + Sets the event for an event record node in the given graphExec + Sets the event of an event record node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated.
+ The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + \p hNode is also not modified by this call. + + + + + Sets the event for an event record node in the given graphExec + Sets the event of an event record node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated. + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + \p hNode is also not modified by this call. + + + + + Check whether an executable graph can be updated with a graph and perform the update if possible + Updates the node parameters in the instantiated graph specified by \p hGraphExec with the node parameters in a topologically identical graph specified by \p hGraph. + Limitations: + - Kernel nodes: + - The owning context of the function cannot change. + - A node whose function originally did not use CUDA dynamic parallelism cannot be updated + to a function which uses CDP. + - A cooperative node cannot be updated to a non-cooperative node, and vice-versa. + - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the + priority attribute cannot change. Equality is checked on the originally requested + priority values, before they are clamped to the device's supported range. + - If \p hGraphExec was not instantiated for device launch, a node whose function originally did not use device-side cudaGraphLaunch() cannot be updated to a function which uses + device-side cudaGraphLaunch() unless the node resides on the same context as nodes which contained such calls at instantiate-time. If no such calls were present at instantiation, + these updates cannot be performed at all. + - Memset and memcpy nodes: + - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. + - The source/destination memory must be allocated from the same contexts as the original source/destination memory. + - Only 1D memsets can be changed. + - Additional memcpy node restrictions: + - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_ARRAY, etc.) is not supported. + - External semaphore wait nodes and record nodes: + - Changing the number of semaphores is not supported. + Note: The API may add further restrictions in future releases. The return code should always be checked. + cuGraphExecUpdate sets the result member of \p resultInfo to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under the following conditions: + - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode + is set to NULL. + - \p hGraph has more exit nodes than \p hGraphExec, in which case resultInfo->errorNode is set to one of the exit nodes in hGraph. + - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with, + in which case resultInfo->errorNode is set to the node from \p hGraph. + - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node + from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode + will be set to the mismatched dependency.
The dependencies are paired based on edge order and a dependency + does not match when the nodes are already paired based on other edges examined in the graph. + cuGraphExecUpdate sets the result member of \p resultInfo to: + - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value. + - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed + - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case + \p hErrorNode_out is set to the node from \p hGraph. + - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported + way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph + - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. + - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph. + - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph + If the update fails for a reason not listed above, the result member of \p resultInfo will be set + to CU_GRAPH_EXEC_UPDATE_ERROR. If the update succeeds, the result member will be set to CU_GRAPH_EXEC_UPDATE_SUCCESS. + cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns + CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included changes which violated constraints specific to instantiated graph update. + + The graph containing the updated parameters + the error info structure + + + + Enables or disables the specified node in the given graphExec + Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent + to empty nodes until they are reenabled. Existing node parameters are not affected by + disabling/enabling the node. + The node is identified by the corresponding node \p hNode in the non-executable + graph, from which the executable graph was instantiated. + \p hNode must not have been removed from the original graph. + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + \p hNode is also not modified by this call. + \note Currently only kernel, memset and memcpy nodes are supported. + + Node from the graph from which graphExec was instantiated + Node is enabled if != 0, otherwise the node is disabled + + + + Query whether a node in the given graphExec is enabled + Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled. + The node is identified by the corresponding node \p hNode in the non-executable + graph, from which the executable graph was instantiated. + \p hNode must not have been removed from the original graph. + \note Currently only kernel, memset and memcpy nodes are supported. + + Node from the graph from which graphExec was instantiated + the enabled status of the node
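A short sketch of the enable/disable round trip documented above. SetNodeEnabled and Launch are assumed member names mirroring the summaries in this diff; per those docs, only kernel, memset and memcpy nodes support toggling, and a disabled node acts like an empty node while keeping its parameters.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class NodeToggleSketch
{
    static void RunWithAndWithoutNode(CudaGraphExec exec, CUgraphNode node, CudaStream stream)
    {
        exec.SetNodeEnabled(node, false); // behaves like an empty node while disabled
        exec.Launch(stream);              // this launch skips 'node'

        exec.SetNodeEnabled(node, true);  // parameters were preserved while disabled
        exec.Launch(stream);              // this launch executes 'node' again
    }
}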
This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + Query the instantiation flags of an executable graph + Returns the flags that were passed to instantiation for the given executable graph. + ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD will not be returned by this API as it does + not affect the resulting executable graph. - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. 
If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. + A list of JIT compiler / linker options passed to Cuda. + If buffer options are used (i.e. InfoLogBuffer and ErrorLogBuffer), this + collection should only be used once as buffer size is overwritten by Cuda. + To copy data from unmanaged to managed memory, call after + the API call that produced output data. + Maximum number of options is limited to 30. - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. - - - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to.
- If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. - - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + + - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. 
If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. - - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + + - + - Enumerator class for CudaManagedMemory_uint - + - + + - + - + Add a single option to the collection. + Option to add + + + Add multiple options to the collection. + Options to add + + + Copy data from unmanaged to managed memory + + + Reset values returned from Cuda API for info and error buffers.
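A minimal usage sketch of the option collection documented above. The type names follow this file (CudaJitOptionCollection, CudaJOInfoLogBuffer, CudaJOErrorLogBuffer), while UpdateValues(), Value and FreeHandle() stand in for the members whose crefs were stripped here; all member names are assumptions, not confirmed API.

    // Sketch only: member names marked above as assumptions.
    using System;
    using System.IO;
    using ManagedCuda;

    byte[] ptxImage = File.ReadAllBytes("kernel.ptx"); // any PTX image
    var ctx = new CudaContext();

    var options = new CudaJitOptionCollection();
    var infoLog = new CudaJOInfoLogBuffer(1024);   // receives informational PTXAS messages
    var errorLog = new CudaJOErrorLogBuffer(1024); // receives PTXAS error messages
    options.Add(infoLog);
    options.Add(errorLog);

    var module = ctx.LoadModulePTX(ptxImage, options); // JIT-compile with these options

    options.UpdateValues();           // assumed name: copy results back to managed memory
    Console.WriteLine(infoLog.Value); // assumed name: buffer contents as string
    infoLog.FreeHandle();             // assumed name: the pinned buffers must be freed manually
    errorLog.FreeHandle();

As documented above, the collection should not be reused after this, since Cuda overwrites the buffer sizes.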
- + - + For dispose - + - + Dispose - + - + For IDisposable - + - + - + - + - A variable located in managed memory. - Type: uint1 - + - Creates a new CudaManagedMemory and allocates the memory on host/device. - In elements - - + - Creates a new CudaManagedMemory from definition in cu-file. - The module where the variable is defined in. - The variable name as defined in the cu-file. - - Creates a new CudaManagedMemory from definition in cu-file. - The kernel which module defines the variable. - The variable name as defined in the cu-file. + Online compiler options - + - + - Option value converted to (void *) - + - + - Option - + - + - + - + For dispose - + Dispose - + For IDisposable - + - UIntPtr to managed memory. + Max number of registers that a thread may use. + Option type: unsigned int + Applies to: compiler only - + - CUdeviceptr to managed memory. + Max number of registers that a thread may use. + Option type: unsigned int + Applies to: compiler only + - + - Size in bytes + IN: Specifies minimum number of threads per block to target compilation + for + OUT: Returns the number of threads the compiler actually targeted. + This restricts the resource utilization of the compiler (e.g. max + registers) such that a block with the given number of threads should be + able to launch based on register limitations. Note, this option does not + currently take into account any other resource limitations, such as + shared memory utilization. + Option type: unsigned int + Applies to: compiler only - + - Size in elements + IN: Specifies minimum number of threads per block to target compilation + for + OUT: Returns the number of threads the compiler actually targeted. + This restricts the resource utilization of the compiler (e.g. max + registers) such that a block with the given number of threads should be + able to launch based on register limitations. Note, this option does not + currently take into account any other resource limitations, such as + shared memory utilization. + Option type: unsigned int + Applies to: compiler only + - + - Access array per element. - index in elements - + Returns the number of threads the compiler actually targeted. + This restricts the resource utilization of the compiler (e.g. max + registers) such that a block with the given number of threads should be + able to launch based on register limitations. Note, this option does not + currently take into account any other resource limitations, such as + shared memory utilization. + The value is only valid after a successful call to
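The IN/OUT options above round-trip through a single object: the requested value goes in, and after a successful load the value the compiler actually chose can be read back. A sketch under the same naming assumptions as the previous example (CudaJOMaxRegisters, CudaJOThreadsPerBlock and UpdateValues() are assumed names):

    // Sketch only: option class names are assumptions.
    using System;
    using System.IO;
    using ManagedCuda;

    var options = new CudaJitOptionCollection();
    var maxRegs = new CudaJOMaxRegisters(32);     // cap register usage per thread
    var threads = new CudaJOThreadsPerBlock(256); // IN: target at least 256 threads per block
    options.Add(maxRegs);
    options.Add(threads);

    var ctx = new CudaContext();
    var module = ctx.LoadModulePTX(File.ReadAllBytes("kernel.ptx"), options);

    options.UpdateValues();       // assumed name: fetch the OUT values
    int targeted = threads.Value; // only valid after a successful call, per the docs above
    Console.WriteLine($"Compiler targeted {targeted} threads per block.");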
+ Returns a float value in the option of the wall clock time, in + milliseconds, spent creating the cubin + Option type: float + Applies to: compiler and linker - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Returns a float value in the option of the wall clock time, in + milliseconds, spent creating the cubin + Option type: float + Applies to: compiler and linker - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + Returns a float value in the option of the wall clock time, in + milliseconds, spent creating the cubin + Option type: float + Applies to: compiler and linker + The value is only valid after a successful call to - managed variable - newly allocated host variable with value from managed memory - + - The on which a pointer was allocated or registered + Pointer to a buffer in which to print any log messages from PTXAS + that are informational in nature (the buffer size is specified via + option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) + Option type: char* + Applies to: compiler and linker + You must free the internal buffer array manually after use by calling ! - + - The describing the physical location of a pointer + Pointer to a buffer in which to print any log messages from PTXAS + that are informational in nature + Option type: char* + Applies to: compiler and linker + You must free the internal buffer array manually after use by calling ! + Size of the internal buffer array - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + ManagedCuda allocates a byte array as buffer and pins it in order to pass it to Cuda. + You must free the buffer manually if the buffer is not needed anymore. - + - The address at which a pointer's memory may be accessed on the host + Returns the buffer converted to string. + The value is only valid after a successful call to - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + + - + - Synchronize every synchronous memory operation initiated on this region + Pointer to a buffer in which to print any log messages from PTXAS that + reflect errors + Option type: char* + Applies to: compiler and linker + You must free the internal buffer array manually after use by calling ! - + - A process-wide unique ID for an allocated memory region + Pointer to a buffer in which to print any log messages from PTXAS that + reflect errors + Option type: char* + Applies to: compiler and linker + - + - Indicates if the pointer points to managed memory + ManagedCuda allocates a byte array as buffer and pins it in order to pass it to Cuda. + You must free the buffer manually if the buffer is not needed anymore. - + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream.
It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + ManagedCuda allocates an byte array as buffer and pins it in order to pass it to Cuda. + You must free the buffer manually if the buffer is not needed anymore. - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - - + + + Returns the buffer converted to string. + The value is only valid after a succesful call to + + + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. - Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. 
This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. 
Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Level of optimizations to apply to generated code (0 - 4), with 4 + being the default and highest level of optimizations. + Option type: unsigned int + Applies to: compiler only - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. 
- If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Level of optimizations to apply to generated code (0 - 4), with 4 + being the default and highest level of optimizations. + Option type: unsigned int + Applies to: compiler only - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + Level of optimizations to apply to generated code (0 - 4), with 4 + being the default and highest level of optimizations. - + - Enumerator class for CudaManagedMemory_uint1 + No option value required. Determines the target based on the current + attached context (default) + Option type: No option value needed + Applies to: compiler and linker - + - + Determines the target based on the current attached context (default) + Option type: No option value needed + Applies to: compiler and linker - - + - + Target is chosen based on supplied . + Option type: unsigned int for enumerated type + Applies to: compiler and linker - + - + Target is chosen based on supplied ::CUjit_target_enum. + Option type: unsigned int for enumerated type ::CUjit_target_enum + Applies to: compiler and linker + - + - + Specifies choice of fallback strategy if matching cubin is not found. + Choice is based on supplied . + Option type: unsigned int for enumerated type + Applies to: compiler only - + - + Specifies choice of fallback strategy if matching cubin is not found. + Choice is based on supplied . + Option type: unsigned int for enumerated type + Applies to: compiler only - + - + - A variable located in managed memory. 
- Type: uint2 + Specifies whether to create debug information in output (-g) (0: false, default) + Option type: int + Applies to: compiler and linker - + - Creates a new CudaManagedMemory and allocates the memory on host/device. + Specifies whether to create debug information in output (-g) (0: false, default) + Option type: int + Applies to: compiler and linker - In elements - + - + - Creates a new CudaManagedMemory from definition in cu-file. + Generate verbose log messages (0: false, default) + Option type: int + Applies to: compiler and linker - The module where the variable is defined in. - The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. + Generate verbose log messages (0: false, default) + Option type: int + Applies to: compiler and linker - The kernel which module defines the variable. - The variable name as defined in the cu-file. + - + - For dispose + Generate line number information (-lineinfo) (0: false, default) + Option type: int + Applies to: compiler only - + - Dispose + Generate line number information (-lineinfo) (0: false, default) + Option type: int + Applies to: compiler only + - + - For IDisposable + Specifies whether to enable caching explicitly (-dlcm) + Choice is based on supplied . + Option type: unsigned int for enumerated type + Applies to: compiler only - - + - UIntPtr to managed memory. + Specifies whether to enable caching explicitly (-dlcm) + Choice is based on supplied . + Option type: unsigned int for enumerated type + Applies to: compiler only + - + - CUdeviceptr to managed memory. + Array of device symbol names that will be relocated to the corresponding + host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES. + Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries. + When loading a device module, driver will relocate all encountered + unresolved symbols to the host addresses. + It is only allowed to register symbols that correspond to unresolved + global variables. + It is illegal to register the same device symbol at multiple addresses. + Option type: const char ** + Applies to: dynamic linker only - + - Size in bytes + Array of device symbol names that will be relocated to the corresponding + host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES. + Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries. + When loading a device module, driver will relocate all encountered + unresolved symbols to the host addresses. + It is only allowed to register symbols that correspond to unresolved + global variables. + It is illegal to register the same device symbol at multiple addresses. + Option type: const char ** + Applies to: dynamic linker only + + - + - Size in elements + ManagedCuda allocates an array as buffer and pins it in order to pass it to Cuda. + You must free the buffer manually if the buffer is not needed anymore. - + - Access array per element. + - index in elements - + - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Enable link-time optimization (-dlto) for device code (0: false, default) + Option type: int + Applies to: compiler and linker - + - Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. 
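Pulling the debug-oriented options above together, a module load configured for profiling might look like the following sketch; the option class names (CudaJOGenerateDebugInfo, CudaJOGenerateLineInfo, CudaJOLogVerbose) follow this file's naming pattern but are assumptions:

    // Sketch only: option class names are assumptions.
    using System.IO;
    using ManagedCuda;

    var options = new CudaJitOptionCollection();
    options.Add(new CudaJOGenerateDebugInfo(true)); // -g: embed debug information
    options.Add(new CudaJOGenerateLineInfo(true));  // -lineinfo: source-line mapping for profilers
    options.Add(new CudaJOLogVerbose(true));        // verbose compile/link log messages

    var ctx = new CudaContext();
    var module = ctx.LoadModulePTX(File.ReadAllBytes("kernel.ptx"), options);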
+ Enable link-time optimization (-dlto) for device code (0: false, default) + Option type: int + Applies to: compiler and linker - managed variable - newly allocated host variable with value from managed memory + - + - The on which a pointer was allocated or registered + Control single-precision denormals (-ftz) support (0: false, default). + 1 : flushes denormal values to zero + 0 : preserves denormal values + Option type: int + Applies to: link-time optimization specified with CU_JIT_LTO - + - The describing the physical location of a pointer + Control single-precision denormals (-ftz) support (0: false, default). + 1 : flushes denormal values to zero + 0 : preserves denormal values + Option type: int + Applies to: link-time optimization specified with CU_JIT_LTO + - + - The address at which a pointer's memory may be accessed on the device - Except in the exceptional disjoint addressing cases, the value returned will equal the input value. + Control single-precision floating-point division and reciprocals + (-prec-div) support (1: true, default). + 1 : Enables the IEEE round-to-nearest mode + 0 : Enables the fast approximation mode + Option type: int + Applies to: link-time optimization specified with CU_JIT_LTO - + - The address at which a pointer's memory may be accessed on the host + Control single-precision floating-point division and reciprocals + (-prec-div) support (1: true, default). + 1 : Enables the IEEE round-to-nearest mode + 0 : Enables the fast approximation mode + Option type: int + Applies to: link-time optimization specified with CU_JIT_LTO + - + - A pair of tokens for use with the nv-p2p.h Linux kernel interface + Control single-precision floating-point square root + (-prec-sqrt) support (1: true, default). + 1 : Enables the IEEE round-to-nearest mode + 0 : Enables the fast approximation mode + Option type: int\n + Applies to: link-time optimization specified with CU_JIT_LTO - + - Synchronize every synchronous memory operation initiated on this region + Control single-precision floating-point square root + (-prec-sqrt) support (1: true, default). + 1 : Enables the IEEE round-to-nearest mode + 0 : Enables the fast approximation mode + Option type: int\n + Applies to: link-time optimization specified with CU_JIT_LTO + - + - A process-wide unique ID for an allocated memory region + Enable/Disable the contraction of floating-point multiplies + and adds/subtracts into floating-point multiply-add (-fma) + operations (1: Enable, default; 0: Disable). + Option type: int\n + Applies to: link-time optimization specified with CU_JIT_LTO - + - Indicates if the pointer points to managed memory + Enable/Disable the contraction of floating-point multiplies + and adds/subtracts into floating-point multiply-add (-fma) + operations (1: Enable, default; 0: Disable). + Option type: int\n + Applies to: link-time optimization specified with CU_JIT_LTO + - + + + Array of kernel names that should be preserved at link time while others + can be removed.\n + Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n + Note that kernel names can be mangled by the compiler in which case the + mangled name needs to be specified.\n + Wildcard "*" can be used to represent zero or more characters instead of + specifying the full or mangled name.\n + It is important to note that the wildcard "*" is also added implicitly. + For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + thus preserve all kernels with those names. 
This can be avoided by providing + a more specific name like "barfoobaz".\n + Option type: const char **\n + Applies to: dynamic linker only + + Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + + + + + Array of kernel names that should be preserved at link time while others + can be removed.\n + Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n + Note that kernel names can be mangled by the compiler in which case the + mangled name needs to be specified.\n + Wildcard "*" can be used to represent zero or more characters instead of + specifying the full or mangled name.\n + It is important to note that the wildcard "*" is also added implicitly. + For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + thus preserve all kernels with those names. This can be avoided by providing + a more specific name like "barfoobaz".\n + Option type: const char **\n + Applies to: dynamic linker only + + Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + + + + - Attach memory to a stream asynchronously - - Enqueues an operation in hStream to specify stream association of - length bytes of memory starting from dptr. This function is a - stream-ordered operation, meaning that it is dependent on, and will - only take effect when, previous work in stream has completed. Any - previous association is automatically replaced. - - dptr must point to an address within managed memory space declared - using the __managed__ keyword or allocated with cuMemAllocManaged. - - length must be zero, to indicate that the entire allocation's - stream association is being changed. Currently, it's not possible - to change stream association for a portion of an allocation. - - The stream association is specified using flags which must be - one of . - If the flag is specified, the memory can be accessed - by any stream on any device. - If the flag is specified, the program makes a guarantee - that it won't access the memory on the device from any stream. - If the flag is specified, the program makes a guarantee - that it will only access the memory on the device from hStream. It is illegal - to attach singly to the NULL stream, because the NULL stream is a virtual global - stream and not a specific stream. An error will be returned in this case. - - When memory is associated with a single stream, the Unified Memory system will - allow CPU access to this memory region so long as all operations in hStream - have completed, regardless of whether other streams are active. In effect, - this constrains exclusive ownership of the managed memory region by - an active GPU to per-stream activity instead of whole-GPU activity. - - Accessing memory on the device from streams that are not associated with - it will produce undefined results. No error checking is performed by the - Unified Memory system to ensure that kernels launched into other streams - do not access this region. - - It is a program's responsibility to order calls to - via events, synchronization or other means to ensure legal access to memory - at all times. Data visibility and coherency will be changed appropriately - for all kernels which follow a stream-association change. - - If hStream is destroyed while data is associated with it, the association is - removed and the association reverts to the default visibility of the allocation - as specified at cuMemAllocManaged. For __managed__ variables, the default - association is always . 
Note that destroying a stream is an - asynchronous operation, and as a result, the change to default association won't - happen until all work in the stream has completed. - + ManagedCuda allocates an array as buffer and pins it in order to pass it to Cuda. + You must free the buffer manually if the buffer is not needed anymore. - Stream in which to enqueue the attach operation - Length of memory (must be zero) - Must be one of - - + - Prefetches memory to the specified destination device - Prefetches memory to the specified destination device. devPtr is the - base device pointer of the memory to be prefetched and dstDevice is the - destination device. count specifies the number of bytes to copy. hStream - is the stream in which the operation is enqueued. - - Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - If no physical memory has been allocated for this region, then this memory region - will be populated and mapped on the destination device. If there's insufficient - memory to prefetch the desired region, the Unified Memory driver may evict pages - belonging to other memory regions to make room. If there's no memory that can be - evicted, then the Unified Memory driver will prefetch less than what was requested. + + + + + + Array of variable names (__device__ and/or __constant__) that should be + preserved at link time while others can be removed.\n + Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n + Note that variable names can be mangled by the compiler in which case the + mangled name needs to be specified.\n + Wildcard "*" can be used to represent zero or more characters instead of + specifying the full or mangled name.\n + It is important to note that the wildcard "*" is also added implicitly. + For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + thus preserve all variables with those names. This can be avoided by providing + a more specific name like "barfoobaz".\n + Option type: const char **\n + Applies to: link-time optimization specified with CU_JIT_LTO + + Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + + + + + Array of variable names (__device__ and/or __constant__) that should be + preserved at link time while others can be removed.\n + Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n + Note that variable names can be mangled by the compiler in which case the + mangled name needs to be specified.\n + Wildcard "*" can be used to represent zero or more characters instead of + specifying the full or mangled name.\n + It is important to note that the wildcard "*" is also added implicitly. + For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + thus preserve all variables with those names. This can be avoided by providing + a more specific name like "barfoobaz".\n + Option type: const char **\n + Applies to: link-time optimization specified with CU_JIT_LTO + + Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + + + + + + ManagedCuda allocates an array as buffer and pins it in order to pass it to Cuda. + You must free the buffer manually if the buffer is not needed anymore. + + + + - In the normal case, any mappings to the previous location of the migrated pages are - removed and mappings for the new location are only setup on the dstDevice. - The application can exercise finer control on these mappings using ::cudaMemAdvise. 
- Destination device to prefetch to - Stream to enqueue prefetch operation - Note that this function is asynchronous with respect to the host and all work on other devices. + - + + + This option serves as a hint to enable the JIT compiler/linker + to remove constant (__constant__) and device (__device__) variables + unreferenced in device code (Disabled by default).\n + Note that host references to constant and device variables using APIs like + ::cuModuleGetGlobal() with this option specified may result in undefined behavior unless + the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n + Option type: int\n + Applies to: link-time optimization specified with CU_JIT_LTO + + Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + + + + + This option serves as a hint to enable the JIT compiler/linker + to remove constant (__constant__) and device (__device__) variables + unreferenced in device code (Disabled by default).\n + Note that host references to constant and device variables using APIs like + ::cuModuleGetGlobal() with this option specified may result in undefined behavior unless + the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n + Option type: int\n + Applies to: link-time optimization specified with CU_JIT_LTO + + Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + + + + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. 
Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. - - Note that this function is asynchronous with respect to the host and all work - on other devices. + Generate position independent code (0: false)\n + Option type: int\n + Applies to: compiler only - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for - + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. 
- - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided. On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. - - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. 
- - Note that this function is asynchronous with respect to the host and all work - on other devices. + Generate position independent code (0: false)\n + Option type: int\n + Applies to: compiler only - managed memory variable - Advice to be applied for the specified memory range - Device to apply the advice for + - + - Enumerator class for CudaManagedMemory_uint2 + Represents an executable Cuda graph. - + - + Load library - - + - + Load library - + - + Load library - + - + Load library - + - + Get library from CUkernel - - + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Make sure the library image arrays are zero terminated by appending a zero + + + + + Returns a module handle + Returns in \p pMod the module handle associated with the current context located in library \p library. + If module handle is not found, the call returns::CUDA_ERROR_NOT_FOUND. + + + + + Returns a kernel handle + Returns in \p pKernel the handle of the kernel with name \p name located in library \p library. + If kernel handle is not found, the call returns::CUDA_ERROR_NOT_FOUND. + + + + + Returns a global device pointer + Returns in \p *dptr and \p *bytes the base pointer and size of the global with + name \p name for the requested library \p library and the current context. + If no global for the requested name \p name exists, the call returns::CUDA_ERROR_NOT_FOUND. + One of the parameters \p dptr or \p bytes (not both) can be NULL in which case it is ignored. + + Name of global to retrieve + CudaDeviceVariable + + + + Returns a CudaKernel (the managedCuda wrapper for CUfunction, not to be confused with CUkernel) + + + + + Retrieve the kernel handles within a library. + Returns in \p kernels a maximum number of \p numKernels kernel handles within \p lib. + The returned kernel handle becomes invalid when the library is unloaded. + + Buffer where the kernel handles are returned to + Maximum number of kernel handles that may be returned to the buffer + + + + Retrieve all the kernel handles within a library. + The returned kernel handle becomes invalid when the library is unloaded. + + + + + Retrieve all the kernel handles within a library. + (the managedCuda wrapper for CUfunction, not to be confused with CUkernel). + + + + + Returns the inner library handle + + + + + Returns the number of kernels within the library + + + + + A list of library load options passed to Cuda. + Maximum number of options is limited to 30. + + + + + + + + Add a single option to the collection. + + Option to add + + + + Add multiple options to the collection. + + Options to add + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Online compiler options + + + + + Option value converted to (void *) + + + + + Option + + + + + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Unknown + + + + + Unknown + + + + + + Specifies that the argument \p code passed to ::cuLibraryLoadData() will be preserved. + Specifying this option will let the driver know that \p code can be accessed at any point + until ::cuLibraryUnload(). The default behavior is for the driver to allocate and + maintain its own copy of \p code. Note that this is only a memory usage optimization + hint and the driver can choose to ignore it if required. + Specifying this option with ::cuLibraryLoadFromFile() is invalid and + will return ::CUDA_ERROR_INVALID_VALUE. + + + + + Specifies that the argument \p code passed to ::cuLibraryLoadData() will be preserved.
+ Specifying this option will let the driver know that \p code can be accessed at any point + until ::cuLibraryUnload(). The default behavior is for the driver to allocate and + maintain its own copy of \p code. Note that this is only a memory usage optimization + hint and the driver can choose to ignore it if required. + Specifying this option with ::cuLibraryLoadFromFile() is invalid and + will return ::CUDA_ERROR_INVALID_VALUE. + + + + + + A pending JIT linker invocation. + + + + + Creates a pending JIT linker invocation. + + + + + Creates a pending JIT linker invocation. + + Collection of linker and compiler options + + + + For dispose + + + + + Dispose + Destroys state for a JIT linker invocation. + + + + + For IDisposable. + Destroys state for a JIT linker invocation. + + + + + + Add an input to a pending linker invocation. + + The input data. PTX must be NULL-terminated. + The type of the input data. + An optional name for this input in log messages. + Collection of linker and compiler options + + + + Add an input to a pending linker invocation. + + The input data. PTX must be NULL-terminated. + The type of the input data. + An optional name for this input in log messages. + Collection of linker and compiler options + + + + Add an input to a pending linker invocation. + + Path to the input file. + The type of the input data. + Collection of linker and compiler options + + + + Complete a pending linker invocation. + Completes the pending linker action and returns the cubin image for the linked + device code, which can be used with ::cuModuleLoadData. + + + A variable located in managed memory. - Type: uint3 + Type: byte - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. 
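To make the new cuLibrary/JIT-linker surface documented above concrete, here is a minimal C# sketch of linking a PTX image and fetching a kernel. It is a sketch only: the wrapper names (CudaLinker, CudaLibrary, AddInput, Complete, GetCudaKernel) are taken from the doc comments in this hunk, and the exact constructor and method signatures are assumptions rather than verbatim managedCuda API.

using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.VectorTypes;

// Link a PTX image into a cubin, then load it as a library and fetch a kernel.
byte[] ptx = System.IO.File.ReadAllBytes("kernels.ptx");
using (var linker = new CudaLinker())
{
    // Method name assumed from the "Add an input to a pending linker invocation" docs above;
    // PTX must be NULL-terminated, but per the helper documented in this hunk the wrapper
    // appends the terminating zero to the image arrays itself.
    linker.AddInput(ptx, CUJITInputType.PTX, "kernels.ptx", null);
    byte[] cubin = linker.Complete(); // linked cubin image, usable with cuModuleLoadData/cuLibraryLoadData

    // Hypothetical overload taking an in-memory image. GetKernel/GetCudaKernel surface
    // CUDA_ERROR_NOT_FOUND if the requested name does not exist in the library.
    using (var library = new CudaLibrary(cubin))
    {
        CudaKernel kernel = library.GetCudaKernel("vectorAdd"); // CUfunction wrapper, not a CUkernel
        kernel.GridDimensions = new dim3(1, 1, 1);
        kernel.BlockDimensions = new dim3(256, 1, 1);
        // kernel.Run(...);
    }
}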
- + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -19946,7 +16506,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -19970,7 +16530,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination.
+ + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -20033,7 +16654,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -20095,162 +16716,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_uint3 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. 
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. 
Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
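Since the two cuMemAdvise_v2-style overloads documented above replace the CUdevice argument with a CUmemLocation, here is a short sketch of how a caller might drive them from the managed-memory wrapper. The wrapper method name and the CUmemLocation/CUmemLocationType/CUmemAdvise member spellings below follow the driver-API names in these docs and are assumptions, not confirmed managedCuda signatures:

// Prefer device 0 for this managed allocation; this guides (but does not force)
// migration when a fault occurs, per the SET_PREFERRED_LOCATION semantics above.
var preferred = new CUmemLocation
{
    type = CUmemLocationType.Device, // Host / HostNuma / HostNumaCurrent are the other documented options
    id = 0                           // device ordinal; ignored for Host and HostNumaCurrent
};
managedArray.MemAdvise(CUmemAdvise.SetPreferredLocation, preferred);

// Additionally keep CPU mappings established so host accesses avoid faults:
var host = new CUmemLocation { type = CUmemLocationType.Host, id = 0 };
managedArray.MemAdvise(CUmemAdvise.SetAccessedBy, host);

Here managedArray stands for any of the CudaManagedMemory_<type> instances this file documents; per the remarks above, both calls are asynchronous with respect to the host and all work on other devices.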
+ + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_byte + + + - + - + - + - + - + A variable located in managed memory. - Type: uint4 + Type: uchar1 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -20307,7 +17153,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -20331,19 +17177,80 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - - - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. - - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. 
\p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated.
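The prefetch-to-location overload documented above takes the destination as a CUmemLocation plus a flags word that must currently be zero. A minimal sketch under the same naming assumptions as before (the wrapper method name and enum spellings are inferred from these docs, not confirmed managedCuda API):

// Move the managed range to device 0 ahead of kernel launches on `stream`,
// then bring it back to the NUMA node nearest the calling thread.
var stream = new CudaStream();
var toDevice = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
managedArray.PrefetchAsync(toDevice, 0, stream);   // flags must be zero for now
// ... launch kernels on `stream` that read or write managedArray ...
var toHost = new CUmemLocation { type = CUmemLocationType.HostNumaCurrent, id = 0 }; // id is ignored here
managedArray.PrefetchAsync(toHost, 0, stream);
stream.Synchronize();

As the remarks stress, the call is a performance hint only: accesses to the range stay coherent even while pages are in flight.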
+ + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read duplicated copies of the data will be freed no later than the next write access to that data. - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the @@ -20394,7 +17301,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -20456,162 +17363,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_uint4 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. 
If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. 
+ Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
+ Enumerator class for CudaManagedMemory_uchar1
+
 A variable located in managed memory.
- Type: long
+ Type: uchar2
 Creates a new CudaManagedMemory and allocates the memory on host/device.
 In elements
 Creates a new CudaManagedMemory from definition in cu-file.
 The module in which the variable is defined.
 The variable name as defined in the cu-file.
 Creates a new CudaManagedMemory from definition in cu-file.
 The kernel whose module defines the variable.
 The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
 For dispose
 Dispose
 For IDisposable
 UIntPtr to managed memory.
 CUdeviceptr to managed memory.
 Size in bytes
 Size in elements
 Access array per element.
 index in elements
 If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
 Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
 managed variable
 newly allocated host variable with value from managed memory
 The context on which a pointer was allocated or registered
 The memory type describing the physical location of a pointer
 The address at which a pointer's memory may be accessed on the device
 Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
 The address at which a pointer's memory may be accessed on the host
 A pair of tokens for use with the nv-p2p.h Linux kernel interface
 Synchronize every synchronous memory operation initiated on this region
 A process-wide unique ID for an allocated memory region
 Indicates if the pointer points to managed memory
 Attach memory to a stream asynchronously
@@ -20668,7 +17800,7 @@
 Must be one of
 Prefetches memory to the specified destination device
 Prefetches memory to the specified destination device. devPtr is the
@@ -20692,7 +17824,68 @@
 Stream to enqueue prefetch operation
 Note that this function is asynchronous with respect to the host and all work on other devices.
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location.
+ \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by the device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there is insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise, as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If, however, the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Destination location to prefetch to
+ Flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+
 Advise about the usage of a given memory range
 Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -20755,7 +17948,7 @@
 Advice to be applied for the specified memory range
 Device to apply the advice for
 Advise about the usage of a given memory range
 Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -20817,162 +18010,387 @@
 Advice to be applied for the specified memory range
 Device to apply the advice for
- Enumerator class for CudaManagedMemory_long
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note, however, that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
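The read-mostly advice above pairs naturally with the location-based prefetch: duplicate first, then materialize a read-only copy on each device that will read the data. A hypothetical C# sketch; the `Advise`/`Prefetch` wrapper spellings and enum names are assumptions based on the documentation in this diff:

    // Hypothetical sketch: read-duplicate a lookup table on GPUs 0 and 1.
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    var ctx = new CudaContext(0);
    var table = new CudaManagedMemory_float(4096, CUmemAttach_flags.Global);
    table.Advise(CUmemAdvise.SetReadMostly, new CUmemLocation()); // location ignored here

    using var stream = new CudaStream();
    foreach (var device in new[] { 0, 1 })
    {
        var loc = new CUmemLocation { type = CUmemLocationType.Device, id = device };
        table.Prefetch(loc, 0, stream); // flags must be zero; creates a read-only copy
    }
    stream.Synchronize(); // prefetch is asynchronous with respect to the host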
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
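Because this overload takes an explicit byte count (see the parameter list that follows), several pieces of advice can be layered on the same range. A sketch of combining a host preferred location with a device mapping; the parameter order is assumed from the documented parameter list, and all names are hypothetical:

    // Hypothetical sketch: prefer host residency for a buffer (so thrashing
    // pages settle in host memory) while keeping GPU 0's mapping alive.
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    var ctx = new CudaContext(0);
    var buf = new CudaManagedMemory_float(1 << 22, CUmemAttach_flags.Global);

    var host = new CUmemLocation { type = CUmemLocationType.Host, id = 0 };   // id ignored for HOST
    var gpu0 = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };

    buf.Advise(buf.SizeInBytes, CUmemAdvise.SetPreferredLocation, host); // guides fault migration
    buf.Advise(buf.SizeInBytes, CUmemAdvise.SetAccessedBy, gpu0);        // mapping follows the data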
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note, however, that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
+ Enumerator class for CudaManagedMemory_uchar2
+
 A variable located in managed memory.
- Type: long1
+ Type: uchar3
 Creates a new CudaManagedMemory and allocates the memory on host/device.
 In elements
 Creates a new CudaManagedMemory from definition in cu-file.
 The module in which the variable is defined.
 The variable name as defined in the cu-file.
 Creates a new CudaManagedMemory from definition in cu-file.
 The kernel whose module defines the variable.
 The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined.
+ The variable name as defined in the cu-file.
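The constructor family above binds a wrapper to a __managed__ symbol by name. A short hypothetical example; the module path, symbol name, and `LoadModule` loader are illustrative assumptions, not confirmed API:

    // Hypothetical sketch: wrap a __managed__ variable declared in a .cu file,
    // e.g.  __device__ __managed__ uchar3 borderColor;
    using ManagedCuda;
    using ManagedCuda.BasicTypes;
    using ManagedCuda.VectorTypes;

    var ctx = new CudaContext(0);
    CUmodule module = ctx.LoadModule("kernels.ptx"); // compiled from the .cu file
    var borderColor = new CudaManagedMemory_uchar3(module, "borderColor");
    borderColor[0] = new uchar3(255, 0, 255);        // host-side write through the indexer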
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
 For dispose
 Dispose
 For IDisposable
 UIntPtr to managed memory.
 CUdeviceptr to managed memory.
 Size in bytes
 Size in elements
 Access array per element.
 index in elements
 If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
 Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
 managed variable
 newly allocated host variable with value from managed memory
 The context on which a pointer was allocated or registered
 The memory type describing the physical location of a pointer
 The address at which a pointer's memory may be accessed on the device
 Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
 The address at which a pointer's memory may be accessed on the host
 A pair of tokens for use with the nv-p2p.h Linux kernel interface
 Synchronize every synchronous memory operation initiated on this region
 A process-wide unique ID for an allocated memory region
 Indicates if the pointer points to managed memory
 Attach memory to a stream asynchronously
@@ -21029,7 +18447,7 @@
 Must be one of
 Prefetches memory to the specified destination device
 Prefetches memory to the specified destination device. devPtr is the
@@ -21053,7 +18471,68 @@
 Stream to enqueue prefetch operation
 Note that this function is asynchronous with respect to the host and all work on other devices.
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by the device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there is insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise, as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If, however, the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Destination location to prefetch to
+ Flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+
 Advise about the usage of a given memory range
 Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -21116,7 +18595,7 @@
 Advice to be applied for the specified memory range
 Device to apply the advice for
 Advise about the usage of a given memory range
 Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -21178,162 +18657,387 @@
 Advice to be applied for the specified memory range
 Device to apply the advice for
- Enumerator class for CudaManagedMemory_long1
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note, however, that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note, however, that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
+ Enumerator class for CudaManagedMemory_uchar3
+
 A variable located in managed memory.
- Type: long2
+ Type: uchar4
 Creates a new CudaManagedMemory and allocates the memory on host/device.
 In elements
 Creates a new CudaManagedMemory from definition in cu-file.
 The module in which the variable is defined.
 The variable name as defined in the cu-file.
 Creates a new CudaManagedMemory from definition in cu-file.
 The kernel whose module defines the variable.
 The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
 For dispose
 Dispose
 For IDisposable
 UIntPtr to managed memory.
 CUdeviceptr to managed memory.
 Size in bytes
 Size in elements
 Access array per element.
 index in elements
 If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
 Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
 managed variable
 newly allocated host variable with value from managed memory
 The context on which a pointer was allocated or registered
 The memory type describing the physical location of a pointer
 The address at which a pointer's memory may be accessed on the device
 Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
 The address at which a pointer's memory may be accessed on the host
 A pair of tokens for use with the nv-p2p.h Linux kernel interface
 Synchronize every synchronous memory operation initiated on this region
 A process-wide unique ID for an allocated memory region
 Indicates if the pointer points to managed memory
 Attach memory to a stream asynchronously
@@ -21390,7 +19094,7 @@
 Must be one of
 Prefetches memory to the specified destination device
 Prefetches memory to the specified destination device. devPtr is the
@@ -21414,7 +19118,68 @@
 Stream to enqueue prefetch operation
 Note that this function is asynchronous with respect to the host and all work on other devices.
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by the device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there is insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise, as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If, however, the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Destination location to prefetch to
+ Flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+
 Advise about the usage of a given memory range
 Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -21477,7 +19242,7 @@
 Advice to be applied for the specified memory range
 Device to apply the advice for
 Advise about the usage of a given memory range
 Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -21539,162 +19304,387 @@
 Advice to be applied for the specified memory range
 Device to apply the advice for
- Enumerator class for CudaManagedMemory_long2
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. Setting the preferred location does not cause data to migrate
+ to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided.
On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that procesor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero alue for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. 
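The peer-mapping scenario just described boils down to one driver call. What follows is a hedged sketch only, not part of the library documentation: it uses the raw CUDA driver API that these XML comments mirror, assumes CUDA 12.2+ (where ::cuMemAdvise_v2 takes a ::CUmemLocation), and the names managedPtr, bytes and peerOrdinal are placeholders; error handling is elided.

```c
#include <cuda.h>
#include <stddef.h>

/* Keep a peer GPU's page-table mapping of a managed range up to date so its
 * infrequent accesses do not fault, without migrating the data to that GPU. */
static void adviseAccessedBy(CUdeviceptr managedPtr, size_t bytes, int peerOrdinal)
{
    CUmemLocation loc;
    loc.type = CU_MEM_LOCATION_TYPE_DEVICE;  /* the accessing processor is a GPU */
    loc.id   = peerOrdinal;                  /* device ordinal of that GPU */

    cuMemAdvise_v2(managedPtr, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, loc);

    /* CU_MEM_ADVISE_UNSET_ACCESSED_BY with the same location undoes this. */
}
```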
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
+
-
+
+
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory, provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync (see the sketch just below).
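As a concrete illustration of the preferred-location variants described above, here is a hedged driver-API sketch that pins a managed range's preferred location first to a specific host NUMA node and then to a device (assumes CUDA 12.2+; ptr, bytes, numaNode and devOrdinal are placeholder names; error handling elided):

```c
#include <cuda.h>
#include <stddef.h>

static void setPreferredLocations(CUdeviceptr ptr, size_t bytes,
                                  int numaNode, int devOrdinal)
{
    /* Prefer a specific host NUMA node; 'id' carries the NUMA node id. */
    CUmemLocation host = { CU_MEM_LOCATION_TYPE_HOST_NUMA, numaNode };
    cuMemAdvise_v2(ptr, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, host);

    /* ...or prefer a GPU instead; 'id' is then a device ordinal. Faults on
     * this range will now try to map rather than migrate where possible. */
    CUmemLocation gpu = { CU_MEM_LOCATION_TYPE_DEVICE, devOrdinal };
    cuMemAdvise_v2(ptr, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, gpu);
}
```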
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
+
+
+
+
+ Enumerator class for CudaManagedMemory_uchar4
+
+
-
+
-
+
-
+
-
+
-
+
A variable located in managed memory.
- Type: ulong
+ Type: sbyte
-
+
Creates a new CudaManagedMemory and allocates the memory on host/device.
In elements
-
+
Creates a new CudaManagedMemory from definition in cu-file.
The module where the variable is defined.
The variable name as defined in the cu-file.
-
+
Creates a new CudaManagedMemory from definition in cu-file.
The kernel whose module defines the variable.
The variable name as defined in the cu-file.
-
+
+
+ Creates a new CudaManagedMemory from definition in cu-file.
+
+ The library where the variable is defined.
+ The variable name as defined in the cu-file.
+
+
+
+ Creates a new CudaManagedMemory from definition in cu-file.
+
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
+
+ For dispose
-
+
Dispose
-
+
For IDisposable
-
+
UIntPtr to managed memory.
-
+
CUdeviceptr to managed memory.
-
+
Size in bytes
-
+
Size in elements
-
+
Access array per element.
index in elements
-
+
If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
-
+
Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
managed variable
newly allocated host variable with value from managed memory
-
+
The on which a pointer was allocated or registered
-
+
The describing the physical location of a pointer
-
+
The address at which a pointer's memory may be accessed on the device
Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
-
+
The address at which a pointer's memory may be accessed on the host
-
+
A pair of tokens for use with the nv-p2p.h Linux kernel interface
-
+
Synchronize every synchronous memory operation initiated on this region
-
+
A process-wide unique ID for an allocated memory region
-
+
Indicates if the pointer points to managed memory
-
+
Attach memory to a stream asynchronously
@@ -21751,7 +19741,7 @@ Must be one of
-
+
Prefetches memory to the specified destination device
Prefetches memory to the specified destination device. devPtr is the
@@ -21775,7 +19765,68 @@ Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work on other devices.
-
+
+
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are set up only on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise, as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If, however, the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared (see the sketch below).
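The mapping rules just listed apply to whatever destination the prefetch names. Below is a hedged, minimal sketch of the underlying ::cuMemPrefetchAsync_v2 call (assumes CUDA 12.2+; managedPtr, bytes and stream are placeholder names; error handling elided):

```c
#include <cuda.h>
#include <stddef.h>

static void prefetchToGpu(CUdeviceptr managedPtr, size_t bytes, CUstream stream)
{
    CUmemLocation dst;
    dst.type = CU_MEM_LOCATION_TYPE_DEVICE;  /* prefetch target is a GPU */
    dst.id   = 0;                            /* destination device ordinal */

    /* 'flags' must currently be zero; the migration is enqueued on 'stream'. */
    cuMemPrefetchAsync_v2(managedPtr, bytes, dst, 0, stream);
}
```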
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+
+ Destination location to prefetch to
+ Flags for future use; must be zero for now.
+ Stream to enqueue the prefetch operation on
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+
+ Advise about the usage of a given memory range
Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -21838,7 +19889,7 @@ Advice to be applied for the specified memory range
Device to apply the advice for
-
+
Advise about the usage of a given memory range
Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -21900,162 +19951,387 @@ Advice to be applied for the specified memory range
Device to apply the advice for
-
+
- Enumerator class for CudaManagedMemory_ulong
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory, provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
+
-
+
+
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory, provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
+
+
+
+
+ Enumerator class for CudaManagedMemory_sbyte
+
+
-
+
-
+
-
+
-
+
-
+
A variable located in managed memory.
- Type: ulong1
+ Type: char1
-
+
Creates a new CudaManagedMemory and allocates the memory on host/device.
In elements
-
+
Creates a new CudaManagedMemory from definition in cu-file.
The module where the variable is defined.
The variable name as defined in the cu-file.
-
+
Creates a new CudaManagedMemory from definition in cu-file.
The kernel whose module defines the variable.
The variable name as defined in the cu-file.
-
+
+
+ Creates a new CudaManagedMemory from definition in cu-file.
+
+ The library where the variable is defined.
+ The variable name as defined in the cu-file.
+
+
+
+ Creates a new CudaManagedMemory from definition in cu-file.
+
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
+
+ For dispose
-
+
Dispose
-
+
For IDisposable
-
+
UIntPtr to managed memory.
-
+
CUdeviceptr to managed memory.
-
+
Size in bytes
-
+
Size in elements
-
+
Access array per element.
index in elements
-
+
If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
-
+
Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
managed variable
newly allocated host variable with value from managed memory
-
+
The on which a pointer was allocated or registered
-
+
The describing the physical location of a pointer
-
+
The address at which a pointer's memory may be accessed on the device
Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
-
+
The address at which a pointer's memory may be accessed on the host
-
+
A pair of tokens for use with the nv-p2p.h Linux kernel interface
-
+
Synchronize every synchronous memory operation initiated on this region
-
+
A process-wide unique ID for an allocated memory region
-
+
Indicates if the pointer points to managed memory
-
+
Attach memory to a stream asynchronously
@@ -22112,7 +20388,7 @@ Must be one of
-
+
Prefetches memory to the specified destination device
Prefetches memory to the specified destination device. devPtr is the
@@ -22136,7 +20412,68 @@ Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work on other devices.
-
+
+
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored (see the sketch below).
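For the host-side location types just described, the id field is either a NUMA node id or ignored entirely. A hedged sketch, prefetching to the NUMA node closest to the calling thread (CUDA 12.2+; placeholder names and elided error handling as before):

```c
#include <cuda.h>
#include <stddef.h>

static void prefetchToNearbyHostNode(CUdeviceptr managedPtr, size_t bytes,
                                     CUstream stream)
{
    CUmemLocation here;
    here.type = CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT;  /* nearest host NUMA node */
    here.id   = 0;                                       /* ignored for this type */

    cuMemPrefetchAsync_v2(managedPtr, bytes, here, 0, stream);
}
```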
+ The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -22199,7 +20536,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. 
@@ -22261,162 +20598,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_ulong1 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices.
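The advise pattern above pairs naturally with prefetching. A minimal sketch, reusing the names from the prefetch sketch earlier and assuming the wrapper exposes a MemAdvise overload taking a CUmemAdvise value and a CUmemLocation (the enum member names below mirror the CU_MEM_ADVISE_* values and are assumptions):

    // Mark the buffer read-mostly, then request a persistent mapping on device 0.
    var device0 = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
    data.MemAdvise(CUmemAdvise.SetReadMostly, device0);  // location ignored for this advice
    data.MemAdvise(CUmemAdvise.SetAccessedBy, device0);  // keep device 0 mapped to the pages
    // Later, collapse any read-duplicated copies back into a single copy.
    data.MemAdvise(CUmemAdvise.UnsetReadMostly, device0);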
+ Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_char1 + + + - + - + - + - + - + A variable located in managed memory. - Type: ulong2 + Type: char2 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file.
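Because managed memory is host-accessible, the generated wrapper types can be used directly from CPU code. A brief sketch for the char2 variant documented here; the constructor and indexer follow the pattern above, and the char2 constructor signature is an assumption:

    // Allocate four char2 elements of managed memory and touch them from the host.
    using (var vec = new CudaManagedMemory_char2(4, CUmemAttach_flags.Global))
    {
        // The indexer reads and writes elements in place; no explicit copies needed.
        vec[0] = new char2((sbyte)1, (sbyte)2);
        char2 first = vec[0];
    }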
+ + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -22473,7 +21035,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -22497,7 +21059,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream.
+ + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior, however, + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination location to prefetch to + Flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -22560,7 +21183,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -22622,162 +21245,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_ulong2 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_char2 + + + - + - + - + - + - + A variable located in managed memory. - Type: float + Type: char3 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element.
index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -22834,7 +21682,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -22858,7 +21706,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
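Prefetching back to the host uses the same overload with a host location; the host NUMA variants target a specific node. Continuing the earlier sketch, the enum member names below are assumptions mirrored from the CU_MEM_LOCATION_TYPE_* values documented above:

    // Return the data to host memory ahead of CPU access.
    var host = new CUmemLocation { type = CUmemLocationType.Host, id = 0 };
    data.PrefetchAsync(host, 0, stream.Stream);
    // Or direct the prefetch at a particular host NUMA node:
    var node1 = new CUmemLocation { type = CUmemLocationType.HostNuma, id = 1 };
    data.PrefetchAsync(node1, 0, stream.Stream);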
+ + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior, however, + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination location to prefetch to + Flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -22921,7 +21830,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -22983,162 +21892,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_float + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory.
Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided.
On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred.
+ + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + + + + Enumerator class for CudaManagedMemory_char3 + + + - + - + - + - + - + A variable located in managed memory. - Type: float1 + Type: char4 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module in which the variable is defined. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel whose module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library in which the variable is defined. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The context on which a pointer was allocated or registered - + The memory type describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -23195,7 +22329,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -23219,7 +22353,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there is insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior, however, + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination location to prefetch to + Flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices.
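The paragraphs above describe ::cuMemPrefetchAsync_v2. A short, hedged sketch of how the call is typically issued follows (CUDA 12.2+, error checking omitted; `p` and `bytes` are assumed to come from cuMemAllocManaged as in the earlier sketch):

    #include <cuda.h>

    /* Prefetch a managed range to GPU 0, queue work, then pull it back to the
     * host, all ordered on one stream. Assumes GPU 0 reports
     * CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS != 0. */
    void stage_and_return(CUdeviceptr p, size_t bytes)
    {
        CUstream s;
        cuStreamCreate(&s, CU_STREAM_NON_BLOCKING);

        CUmemLocation gpu  = { CU_MEM_LOCATION_TYPE_DEVICE, 0 };
        CUmemLocation host = { CU_MEM_LOCATION_TYPE_HOST, 0 }; /* id ignored */

        cuMemPrefetchAsync_v2(p, bytes, gpu, 0 /* flags, must be zero */, s);
        /* ... launch kernels that read/write p on stream s here ... */
        cuMemPrefetchAsync_v2(p, bytes, host, 0, s);

        cuStreamSynchronize(s);
        cuStreamDestroy(s);
    }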
+ + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -23282,7 +22477,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -23344,162 +22539,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_float1 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + -
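Since nearly every advice value above is gated on one or more device attributes, a small helper like the following is a common companion to these calls. This is an illustrative sketch only, not part of this patch:

    #include <cuda.h>
    #include <stdio.h>

    /* Query the three attributes the documentation above keeps referring to,
     * once per device, before applying managed-memory advice. */
    int device_supports_advice(CUdevice dev)
    {
        int concurrent = 0, pageable = 0, host_pt = 0;

        /* Gates most advice values on managed allocations: */
        cuDeviceGetAttribute(&concurrent,
            CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, dev);
        /* Gates advice on system-allocated pageable memory: */
        cuDeviceGetAttribute(&pageable,
            CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
        /* If non-zero, several advice calls degrade to no-ops: */
        cuDeviceGetAttribute(&host_pt,
            CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, dev);

        printf("concurrent=%d pageable=%d hostPageTables=%d\n",
               concurrent, pageable, host_pt);
        return concurrent != 0;
    }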
+ + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + +
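The multi-GPU scenario sketched in the ::CU_MEM_ADVISE_SET_ACCESSED_BY bullet above translates into roughly the following. This is a hedged sketch, not part of this patch; `ngpus`, peer-access setup, and error handling are assumed to be handled elsewhere:

    #include <cuda.h>

    /* Keep the data resident on GPU 0 but pre-map it on every GPU, so
     * occasional peer reads do not fault. Assumes peer access is enabled
     * and each GPU reports CONCURRENT_MANAGED_ACCESS != 0. */
    void map_on_all_gpus(CUdeviceptr p, size_t bytes, int ngpus)
    {
        CUmemLocation loc = { CU_MEM_LOCATION_TYPE_DEVICE, 0 };
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, loc);

        for (int d = 0; d < ngpus; ++d) {
            loc.id = d; /* mapping only; no migration is triggered */
            cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, loc);
        }
    }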
+ + Enumerator class for CudaManagedMemory_char4 + + + - + - + - + - + - + A variable located in managed memory. - Type: float2 + Type: short - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module in which the variable is defined. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel whose module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library in which the variable is defined. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The context on which a pointer was allocated or registered - + The memory type describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -23556,7 +22976,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -23580,7 +23000,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there is insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior, however, + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination location to prefetch to + Flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices.
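Combining ::CU_MEM_ADVISE_SET_READ_MOSTLY with the prefetch described above is the usual way to replicate a read-only table across GPUs. A hedged sketch under the same assumptions as the earlier examples (managed `table`/`bytes`, a stream `s` on a device with CONCURRENT_MANAGED_ACCESS, no error handling):

    #include <cuda.h>

    /* Mark a lookup table read-mostly, then prefetch it to each GPU so every
     * device gets its own read-only copy instead of bouncing pages around. */
    void replicate_read_only(CUdeviceptr table, size_t bytes, int ngpus, CUstream s)
    {
        CUmemLocation loc = { CU_MEM_LOCATION_TYPE_DEVICE, 0 };
        /* The location argument is ignored for (UN)SET_READ_MOSTLY. */
        cuMemAdvise_v2(table, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, loc);

        for (int d = 0; d < ngpus; ++d) {
            loc.id = d; /* each prefetch creates a read-only copy on GPU d */
            cuMemPrefetchAsync_v2(table, bytes, loc, 0, s);
        }
        cuStreamSynchronize(s);
    }

Note that any write from any processor invalidates all copies except the writer's, per the SET_READ_MOSTLY semantics above, so this pattern only pays off for genuinely read-dominated data.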
+ + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -23643,7 +23124,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -23705,162 +23186,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_float2 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + -
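When the access pattern changes between phases of an application, the UNSET_* advice values documented above tear the hints back down. A hedged sketch, not part of this patch:

    #include <cuda.h>

    /* Clear previously applied advice before a phase with a different access
     * pattern. The location argument is ignored for UNSET_READ_MOSTLY and
     * UNSET_PREFERRED_LOCATION; UNSET_ACCESSED_BY is per processor. */
    void clear_advice(CUdeviceptr p, size_t bytes, int gpu_id)
    {
        CUmemLocation loc = { CU_MEM_LOCATION_TYPE_DEVICE, gpu_id };
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_UNSET_READ_MOSTLY, loc);
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, loc);
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_UNSET_ACCESSED_BY, loc);
    }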
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that procesor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero alue for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. 
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_short + + + - + - + - + - + - + A variable located in managed memory. - Type: float3 + Type: short1 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. 
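As a rough illustration of the advise overloads documented above, the following C# sketch applies a read-mostly hint followed by a preferred-location hint through a ManagedCuda-style wrapper. The MemAdvise method name, the CUmemAdvise enum members, and the CUmemLocation field names are assumed here to mirror the driver API parameters described in this documentation; they are illustrative, not verified signatures.

```csharp
// Hedged sketch only: wrapper names (MemAdvise, CUmemAdvise.*, CUmemLocationType.*)
// are assumed to mirror the cuMemAdvise_v2 parameters documented above.
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class MemAdviseSketch
{
    static void Run()
    {
        using var ctx = new CudaContext(0);                // device 0
        var data = new CudaManagedMemory_short1(1 << 20);  // size in elements, per the constructor doc

        // CU_MEM_ADVISE_SET_READ_MOSTLY: the location argument is ignored for this
        // advice, so a default CUmemLocation is enough.
        data.MemAdvise(CUmemAdvise.SetReadMostly, new CUmemLocation());

        // CU_MEM_ADVISE_SET_PREFERRED_LOCATION on device 0; the device must report
        // a non-zero CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
        var gpu0 = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
        data.MemAdvise(CUmemAdvise.SetPreferredLocation, gpu0);

        data.Dispose();
    }
}
```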
+ + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -23917,7 +23623,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -23941,7 +23647,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream.
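The paragraphs that follow describe how the prefetch populates memory, what may be evicted, and how it interacts with ::cuMemAdvise. As a quick illustration of the call shape first: the PrefetchAsync overload name and the enum members in this C# sketch are assumed to mirror the cuMemPrefetchAsync_v2 parameters documented here, not verified against the wrapper.

```csharp
// Hedged sketch of a location-based prefetch (the flags argument must currently be zero).
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class PrefetchSketch
{
    static void Run()
    {
        using var ctx = new CudaContext(0);
        using var stream = new CudaStream();
        var data = new CudaManagedMemory_short1(1 << 20);

        // Prefetch to device 0; both the device and the stream's device must report
        // a non-zero CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
        var toGpu = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
        data.PrefetchAsync(toGpu, 0, stream);

        // Prefetch back to the host NUMA node closest to the calling thread;
        // the id field is ignored for HostNumaCurrent.
        var toNearHost = new CUmemLocation { type = CUmemLocationType.HostNumaCurrent, id = 0 };
        data.PrefetchAsync(toNearHost, 0, stream);

        stream.Synchronize(); // the prefetch is asynchronous with respect to the host
        data.Dispose();
    }
}
```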
+ + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination location to prefetch to + Flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -24004,7 +23771,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -24066,162 +23833,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_float3 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
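Before the advice values are enumerated once more below, here is a hedged C# sketch of the accessed-by pattern this documentation recommends for occasional peer-GPU access: keep the data resident on one GPU but pre-mapped on another, so infrequent peer reads do not fault. As above, the wrapper names are assumed to mirror the driver API, not verified.

```csharp
// Hedged sketch: pin data to GPU 0 but keep it mapped on GPU 1 (no migration implied).
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class AccessedBySketch
{
    static void Run()
    {
        using var ctx = new CudaContext(0);
        var data = new CudaManagedMemory_short1(1 << 20);

        // Preferred location: device 0 holds the resident copy.
        data.MemAdvise(CUmemAdvise.SetPreferredLocation,
                       new CUmemLocation { type = CUmemLocationType.Device, id = 0 });

        // Accessed-by: device 1 keeps a standing mapping, avoiding fault overhead
        // on its occasional peer accesses.
        data.MemAdvise(CUmemAdvise.SetAccessedBy,
                       new CUmemLocation { type = CUmemLocationType.Device, id = 1 });

        data.Dispose();
    }
}
```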
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_short1 + + + - + - + - + - + - + A variable located in managed memory. - Type: float4 + Type: short2 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel whose module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element.
index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -24278,7 +24270,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -24302,7 +24294,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination location to prefetch to + Flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -24365,7 +24418,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -24427,162 +24480,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_float4 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory.
Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided.
On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred.
If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_short2 + + + - + - + - + - + - + A variable located in managed memory. - Type: double + Type: short3 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel whose module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -24639,7 +24917,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device.
devPtr is the @@ -24663,7 +24941,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is etiher + ::CU_MEM_LOCATION_TYPE_HOST OR ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. 
+ + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -24726,7 +25065,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -24788,162 +25127,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_double + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. 
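As a concrete, hedged reading of the read-duplication rules above: mark a rarely-written table read-mostly, then prefetch it once per device so every GPU starts with its own read-only copy. Advise/PrefetchAsync and the enum member names are again assumptions about the wrapper surface; the underlying driver calls are ::cuMemAdvise and ::cuMemPrefetchAsync.

    // Sketch: replicate a read-mostly lookup table to every GPU up front.
    public static void ReplicateReadMostly(CudaManagedMemory_float table, int deviceCount, CudaStream stream)
    {
        // The location is ignored for SET_READ_MOSTLY (see above); any valid value works.
        var ignored = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
        table.Advise(CUmemAdvise.SetReadMostly, ignored);

        for (int dev = 0; dev < deviceCount; dev++)
        {
            var loc = new CUmemLocation { type = CUmemLocationType.Device, id = dev };
            // With read-mostly set, each prefetch materializes a read-only copy on 'dev'
            // instead of moving the single canonical copy around.
            table.PrefetchAsync(loc, 0 /* flags: reserved, must be zero */, stream.Stream);
        }
        stream.Synchronize(); // ensure the copies exist before kernels start reading
    }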
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver (see the sketch below). But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
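The thrashing caveat above suggests its own counter-pattern: if a range is known to ping-pong between CPU and GPU phases, prefer host memory for it so both sides map the same host pages. A minimal sketch under the same naming assumptions as before:

    // Sketch: pin a known CPU/GPU ping-pong range to host memory.
    public static void PreferHost(CudaManagedMemory_double buffer)
    {
        var host = new CUmemLocation
        {
            type = CUmemLocationType.Host, // CU_MEM_LOCATION_TYPE_HOST; id is ignored
            id = 0
        };
        buffer.Advise(CUmemAdvise.SetPreferredLocation, host);
        // GPUs with CONCURRENT_MANAGED_ACCESS can map the host pages directly,
        // so faults resolve without the migration ping-pong described above.
    }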
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. 
The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_short3 + + + - + - + - + - + - + A variable located in managed memory. - Type: double1 + Type: short4 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. 
The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -25000,7 +25564,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -25024,7 +25588,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is etiher + ::CU_MEM_LOCATION_TYPE_HOST OR ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. 
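The host-NUMA targeting just described can be exercised as below. The PrefetchAsync overload taking a CUmemLocation (mirroring ::cuMemPrefetchAsync_v2) and the HostNuma enum member are assumptions about this wrapper version, not confirmed API.

    // Sketch: stage a managed buffer on a host NUMA node before a CPU pass.
    public static void PrefetchToNumaNode(CudaManagedMemory_float data, CudaStream stream, int numaNode)
    {
        var loc = new CUmemLocation
        {
            type = CUmemLocationType.HostNuma, // CU_MEM_LOCATION_TYPE_HOST_NUMA (assumed name)
            id = numaNode                      // host NUMA node id
        };
        data.PrefetchAsync(loc, 0 /* flags: reserved, must be zero */, stream.Stream);
        stream.Synchronize(); // the prefetch is asynchronous; wait before touching it on the CPU
    }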
+ The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -25087,7 +25712,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. 
@@ -25149,162 +25774,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_double1 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. 
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se.
Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ 
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
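Because the call is asynchronous with respect to the host, ordering against dependent work comes from the stream, not from the API returning. A hedged sketch of the usual pipeline (advise once at setup, then prefetch and launch on the same stream); PrefetchAsync and the DevicePointer accessor name are assumptions about this wrapper:

    // Sketch: stream-ordered prefetch ahead of a kernel launch.
    public static void StagedLaunch(CudaManagedMemory_float data, int n,
                                    CudaKernel kernel, CudaStream stream, int device)
    {
        var loc = new CUmemLocation { type = CUmemLocationType.Device, id = device };
        data.PrefetchAsync(loc, 0, stream.Stream);             // enqueued first on the stream
        kernel.RunAsync(stream.Stream, data.DevicePointer, n); // runs after the migration completes
    }

The point of the design is that no host synchronization is needed between the two calls: the stream serializes them, and the kernel sees resident pages instead of faulting them in one by one.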
+ Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. 
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_short4 + + + - + - + - + - + - + A variable located in managed memory. - Type: double2 + Type: ushort - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. 
+ + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -25361,7 +26211,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -25385,7 +26235,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is etiher + ::CU_MEM_LOCATION_TYPE_HOST OR ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. 
+ + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -25448,7 +26359,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -25510,162 +26421,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_double2 + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
-
+
+
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
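Before the per-advice details are repeated for this overload below, here is how the advice values might be exercised, continuing the hedged sketch above; the CUmemAdvise enum member names are assumed to mirror the ::CU_MEM_ADVISE_* driver values:

    // Mark the range read-mostly: each reading processor materializes a
    // local read-only copy; a write invalidates all copies but the writer's.
    var device = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
    mem.MemAdvise(CUmemAdvise.SetReadMostly, device); // location ignored for this advice

    // Prefer host memory: guides migration on faults, moves nothing immediately.
    var hostLoc = new CUmemLocation { type = CUmemLocationType.Host, id = 0 };
    mem.MemAdvise(CUmemAdvise.SetPreferredLocation, hostLoc);

    // Keep GPU 0 mapped so its accesses avoid faults wherever the data lives.
    mem.MemAdvise(CUmemAdvise.SetAccessedBy, device);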
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_ushort + + + - + - + - + - + - + A variable located in managed memory. - Type: cuDoubleComplex + Type: ushort1 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. 
index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -25722,7 +26858,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -25746,7 +26882,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is etiher + ::CU_MEM_LOCATION_TYPE_HOST OR ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. 
+ + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -25809,7 +27006,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -25871,162 +27068,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_cuDoubleComplex + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. 
Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. 
On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that procesor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero alue for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. 
+ If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. 
If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. 
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that procesor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero alue for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. 
+ If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_ushort1 + + + - + - + - + - + - + A variable located in managed memory. - Type: cuDoubleReal + Type: ushort2 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -26083,7 +27505,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. 
devPtr is the
@@ -26107,7 +27529,68 @@
Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work
on other devices.
-
+
+
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
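Because the prefetch is enqueued like any other stream operation, ordering and coherence follow the usual stream semantics noted just below. A brief sketch under the same assumptions as earlier; the kernel object is hypothetical:

    // The prefetch executes after previously enqueued work in `stream`;
    // the host returns immediately.
    mem.PrefetchAsync(gpu, 0, stream);
    kernel.RunAsync(stream.Stream, mem.DevicePointer, mem.Size); // hypothetical CudaKernel
    stream.Synchronize();  // order host reads after the kernel
    float first = mem[0];  // coherent access; pages may fault back on demand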
+ + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -26170,7 +27653,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -26232,162 +27715,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_cuDoubleReal + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. 
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that procesor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. 
+ If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. 
The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_ushort2 + + + - + - + - + - + - + A variable located in managed memory. - Type: cuFloatComplex + Type: ushort3 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. 
The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. + The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -26444,7 +28152,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -26468,7 +28176,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -26531,7 +28300,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. 
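Taken together, the prefetch and advise entries above describe a two-step pattern: advise the Unified Memory driver about the expected access pattern, then prefetch on the stream that will consume the data. A minimal C# sketch of that pattern follows; it is illustrative only, and the wrapper names and signatures used here (CudaManagedMemory_float, MemAdvise, PrefetchAsync, CUmemAdvise, CUmemLocation and its fields) are assumptions inferred from the documentation entries in this diff, not verified API.

// Minimal usage sketch (assumed names; check them against the shipped ManagedCuda assembly).
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class UnifiedMemorySketch
{
    public static void Run()
    {
        using (var ctx = new CudaContext(0))              // context on device 0
        using (var stream = new CudaStream())
        using (var buffer = new CudaManagedMemory_float(1 << 20, CUmemAttachFlags.Global))
        {
            long n = buffer.Size;                         // size in elements, per the docs above
            for (long i = 0; i < n; i++)                  // managed memory is host-accessible
                buffer[i] = 1.0f;

            // Advise first: the range is mostly read on the GPU...
            buffer.MemAdvise(CUmemAdvise.SetReadMostly, ctx.Device);  // ctx.Device: assumed CUdevice accessor
            // ...then migrate it up front so the first kernel access does not fault.
            buffer.PrefetchAsync(ctx.Device, stream.Stream);          // assumed overload

            // The _v2 variants documented here take a CUmemLocation instead of a device,
            // e.g. prefetching back to host memory (field names assumed):
            var host = new CUmemLocation { type = CUmemLocationType.Host, id = 0 };
            buffer.PrefetchAsync(host, 0 /* flags: must be zero */, stream.Stream);

            stream.Synchronize();
        }
    }
}

As the remarks stress, none of this is required for correctness: accesses stay coherent while data migrates, so these calls only affect performance.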
@@ -26593,162 +28362,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_cuFloatComplex + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. 
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
+ Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. 
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + + + + + Enumerator class for CudaManagedMemory_ushort3 + + + - + - + - + - + - + A variable located in managed memory. - Type: cuFloatReal + Type: ushort4 - + Creates a new CudaManagedMemory and allocates the memory on host/device. In elements - + Creates a new CudaManagedMemory from definition in cu-file. The module where the variable is defined in. The variable name as defined in the cu-file. - + Creates a new CudaManagedMemory from definition in cu-file. The kernel which module defines the variable. The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library where the variable is defined in. 
+ The variable name as defined in the cu-file. + + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + UIntPtr to managed memory. - + CUdeviceptr to managed memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. managed variable newly allocated host variable with value from managed memory - + The on which a pointer was allocated or registered - + The describing the physical location of a pointer - + The address at which a pointer's memory may be accessed on the device Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + The address at which a pointer's memory may be accessed on the host - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + Synchronize every synchronous memory operation initiated on this region - + A process-wide unique ID for an allocated memory region - + Indicates if the pointer points to managed memory - + Attach memory to a stream asynchronously @@ -26805,7 +28799,7 @@ Must be one of - + Prefetches memory to the specified destination device Prefetches memory to the specified destination device. devPtr is the @@ -26829,7 +28823,68 @@ Stream to enqueue prefetch operation Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream.
+ + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -26892,7 +28947,7 @@ Advice to be applied for the specified memory range Device to apply the advice for - + Advise about the usage of a given memory range Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. @@ -26954,162 +29009,387 @@ Advice to be applied for the specified memory range Device to apply the advice for - + - Enumerator class for CudaManagedMemory_cuFloatReal + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + + + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
+ Enumerator class for CudaManagedMemory_ushort4
+
- + A variable located in managed memory.
- Type: dim3
+ Type: int
- + Creates a new CudaManagedMemory and allocates the memory on host/device.
In elements
- + Creates a new CudaManagedMemory from definition in cu-file.
The module in which the variable is defined.
The variable name as defined in the cu-file.
- + Creates a new CudaManagedMemory from definition in cu-file.
The kernel whose module defines the variable.
The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library in which the variable is defined.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
For dispose
- + Dispose
- + For IDisposable
- + UIntPtr to managed memory.
- + CUdeviceptr to managed memory.
- + Size in bytes
- + Size in elements
- + Access array per element.
index in elements
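The allocation, per-element indexer, and host-value conversion members summarized above combine as in this minimal sketch. It assumes ManagedCuda's CudaManagedMemory_int class with a (size-in-elements, attach-flags) constructor; treat the exact signatures as illustrative.

```csharp
using System;
using ManagedCuda;
using ManagedCuda.BasicTypes;

class ManagedMemorySketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        // 256 elements of managed (unified) memory, visible to host and device.
        using (var counts = new CudaManagedMemory_int(256, CUmemAttach_flags.Global))
        {
            for (int i = 0; i < 256; i++)
                counts[i] = i;              // per-element access from the host

            ctx.Synchronize();              // ensure no device work touches the range
            Console.WriteLine(counts[42]);  // prints 42
        }                                   // Dispose frees the managed allocation
    }
}
```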
- + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
- + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
managed variable
newly allocated host variable with value from managed memory
- + The CUcontext on which a pointer was allocated or registered
- + The CUmemorytype describing the physical location of a pointer
- + The address at which a pointer's memory may be accessed on the device
Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
- + The address at which a pointer's memory may be accessed on the host
- + A pair of tokens for use with the nv-p2p.h Linux kernel interface
- + Synchronize every synchronous memory operation initiated on this region
- + A process-wide unique ID for an allocated memory region
- + Indicates if the pointer points to managed memory
- + Attach memory to a stream asynchronously
@@ -27166,7 +29446,7 @@
Must be one of ::CUmemAttach_flags
- + Prefetches memory to the specified destination device
Prefetches memory to the specified destination device. devPtr is the
@@ -27190,7 +29470,68 @@
Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work on other devices.
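A hedged sketch of the device-targeted prefetch summarized above. It assumes the wrapper exposes it as PrefetchAsync on the managed-memory class (the driver call is ::cuMemPrefetchAsync); the method name and the CUdevice construction are assumptions.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class PrefetchSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global))
        using (var stream = new CudaStream())
        {
            // Populate on the host; the pages are resident in host memory afterwards.
            for (int i = 0; i < (1 << 20); i++) data[i] = 1.0f;

            // Migrate the whole range to device 0 before kernels run, so they
            // do not pay for on-demand page faults.
            CUdevice device0 = default;                     // ordinal 0; exact CUdevice
                                                            // construction depends on the wrapper
            data.PrefetchAsync(device0, stream.Stream);     // assumed wrapper name

            // ... launch kernels on 'stream' that read 'data' ...
            stream.Synchronize();
        }
    }
}
```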
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Destination location to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.

Advise about the usage of a given memory range
Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -27253,7 +29594,7 @@
Advice to be applied for the specified memory range
Device to apply the advice for
- + Advise about the usage of a given memory range
Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
@@ -27315,8489 +29656,19355 @@
Advice to be applied for the specified memory range
Device to apply the advice for
- Enumerator class for CudaManagedMemory_dim3
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory.
Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided.
On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
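The read-mostly pattern described in the advice list above (duplicate on read, invalidate on write) is typically combined with a prefetch per reading processor, as in this hedged sketch. The wrapper method names and the CUmemLocation field names are assumptions; the driver entry points are ::cuMemAdvise_v2 and ::cuMemPrefetchAsync_v2.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class ReadMostlySketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var lut = new CudaManagedMemory_float(4096, CUmemAttach_flags.Global))
        using (var stream = new CudaStream())
        {
            for (int i = 0; i < 4096; i++) lut[i] = i * 0.5f;   // written once on the host

            // Mark the lookup table read-mostly: each reading processor gets its
            // own read-only copy instead of bouncing the pages back and forth.
            var ignored = new CUmemLocation();                   // location is ignored here
            lut.MemAdvise(CUmemAdvise.SetReadMostly, ignored);   // assumed wrapper signature

            // Prefetching now materializes a read-only copy on device 0.
            var dev0 = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
            lut.PrefetchAsync(dev0, 0, stream.Stream);           // assumed: (location, flags, stream)
            stream.Synchronize();
        }
    }
}
```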
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
+ Enumerator class for CudaManagedMemory_int
+
- Number of channels in array
- One channel, e.g. float1, int1, float, int
- Two channels, e.g. float2, int2
- Four channels, e.g. float4, int4
- A mipmapped Cuda array
- + Creates a CUDA mipmapped array according to descriptor.
- Width, Height, and Depth are the width, height, and depth of the CUDA array (in elements); the following
- types of CUDA arrays can be allocated:
- – A 1D mipmapped array is allocated if Height and Depth extents are both zero.
- – A 2D mipmapped array is allocated if only Depth extent is zero.
- – A 3D mipmapped array is allocated if all three extents are non-zero.
- – A 1D layered CUDA mipmapped array is allocated if only Height is zero and the
- CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number of layers is determined by the depth extent.
- – A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and the
- CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number of layers is determined by the depth extent.
- – A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
- CUDA_ARRAY3D_CUBEMAP flag is set. Width must be equal to Height, and Depth must be six. A
- cubemap is a special type of 2D layered CUDA array, where the six layers represent the six faces of a
- cube. The order of the six layers in memory is the same as that listed in CUarray_cubemap_face.
- – A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both the
- CUDA_ARRAY3D_LAYERED and CUDA_ARRAY3D_CUBEMAP flags are set. Width must be equal
- to Height, and Depth must be a multiple of six. A cubemap layered CUDA array is a special type of
- 2D layered CUDA array that consists of a collection of cubemaps.
- The first six layers represent the first
- cubemap, the next six layers form the second cubemap, and so on.
- Flags may be set to:
- – CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
- Depth specifies the number of layers, not the depth of a 3D array.
- – CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, Width
- must be equal to Height, and Depth must be six. If the CUDA_ARRAY3D_LAYERED flag is also set,
- then Depth must be a multiple of six.
- – CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for
- texture gather. Texture gather can only be performed on 2D CUDA mipmapped arrays.
- mipmapped array descriptor
- Number of mipmap levels. This value is clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]
- + Creates a CUDA mipmapped array according to descriptor.
- Width, Height, and Depth are the width, height, and depth of the CUDA array (in elements); the following
- types of CUDA arrays can be allocated:
- – A 1D mipmapped array is allocated if Height and Depth extents are both zero.
- – A 2D mipmapped array is allocated if only Depth extent is zero.
- – A 3D mipmapped array is allocated if all three extents are non-zero.
- – A 1D layered CUDA mipmapped array is allocated if only Height is zero and the
- CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number of layers is determined by the depth extent.
- – A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and the
- CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number of layers is determined by the depth extent.
- – A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
- CUDA_ARRAY3D_CUBEMAP flag is set. Width must be equal to Height, and Depth must be six. A
- cubemap is a special type of 2D layered CUDA array, where the six layers represent the six faces of a
- cube. The order of the six layers in memory is the same as that listed in CUarray_cubemap_face.
- – A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both the
- CUDA_ARRAY3D_LAYERED and CUDA_ARRAY3D_CUBEMAP flags are set. Width must be equal
- to Height, and Depth must be a multiple of six. A cubemap layered CUDA array is a special type of
- 2D layered CUDA array that consists of a collection of cubemaps. The first six layers represent the first
- cubemap, the next six layers form the second cubemap, and so on.
- Array format
- Array width. See general description.
- Array height. See general description.
- Array depth or layer count. See general description.
- number of channels
- Flags may be set to:
- – CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
- Depth specifies the number of layers, not the depth of a 3D array.
- – CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, Width
- must be equal to Height, and Depth must be six. If the CUDA_ARRAY3D_LAYERED flag is also set,
- then Depth must be a multiple of six.
- – CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for
- texture gather. Texture gather can only be performed on 2D CUDA mipmapped arrays.
- Number of mipmap levels. This value is clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]
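A hedged sketch of the descriptor-based constructor documented above. The CUDAArray3DDescriptor field names and enum members are taken from ManagedCuda's BasicTypes and should be treated as assumptions for this sketch.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class MipmapSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        {
            // A plain 2D mipmapped array: Depth = 0 selects the 2D case above.
            var desc = new CUDAArray3DDescriptor
            {
                Width = 512,
                Height = 512,
                Depth = 0,
                Format = CUArrayFormat.Float,      // assumed enum member
                NumChannels = 4,
                Flags = CUDAArray3DFlags.None      // assumed enum member
            };

            // 1 + floor(log2(max(512, 512, 0))) = 10, the maximum useful level count.
            using (var mipmap = new CudaMipmappedArray(desc, 10))
            {
                // ... retrieve individual levels or bind 'mipmap' to a texture ...
            }
        }
    }
}
```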
+ For dispose - handle to wrap - Array format of the wrapped array. Cannot be gathered through CUDA API. - Number of channels of wrapped array. - + Dispose - + For IDisposable - + - Returns a CUDA array that represents a single mipmap level - of the CUDA mipmapped array. + UIntPtr to managed memory. - Mipmap level - + - Returns a CUDA array that represents a single mipmap level - of the CUDA mipmapped array. + CUdeviceptr to managed memory. - Mipmap level - + - Returns a CUDA array that represents a single mipmap level - of the CUDA mipmapped array. + Size in bytes - Mipmap level - + - Returns a CUDA array that represents a single mipmap level - of the CUDA mipmapped array. + Size in elements - Mipmap level - + - Returns the wrapped CUmipmappedArray + Access array per element. + index in elements + - + - Returns the wrapped CUDAArray3DDescriptor + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + - Returns the Depth of the array + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + managed variable + newly allocated host variable with value from managed memory - + - Returns the Height of the array + The on which a pointer was allocated or registered - + - Returns the array width in elements + The describing the physical location of a pointer - + - Returns the array creation flags + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + - Returns the array format + The address at which a pointer's memory may be accessed on the host - + - Returns number of channels + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Synchronize every synchronous memory operation initiated on this region - + - Cuda occupancy from CudaOccupancy.h + A process-wide unique ID for an allocated memory region - + - mirror the type and spelling of cudaDeviceProp's members keep these alphabetized + Indicates if the pointer points to managed memory - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - define our own cudaOccFuncAttributes to stay consistent with the original header file + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. 
It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of ::CUmemAttach_flags
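A hedged sketch of single-stream association as described above. The AttachMemAsync method name is an assumption (the driver call is ::cuStreamAttachMemAsync); the flag names follow ManagedCuda's CUmemAttach_flags enum.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class AttachSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var data = new CudaManagedMemory_float(1 << 16, CUmemAttach_flags.Global))
        using (var worker = new CudaStream())
        {
            // Restrict the allocation to 'worker': the device may only touch it
            // from this stream, so the CPU may access it whenever 'worker' is idle.
            data.AttachMemAsync(worker.Stream, 0, CUmemAttach_flags.Single); // assumed name; length must be 0

            // ... launch kernels on 'worker' that use 'data' ...

            worker.Synchronize();   // once the stream drains, CPU access is legal again
            data[0] = 42.0f;
        }
    }
}
```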
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cuMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Destination location to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
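As a sketch of the location-based prefetch just described (driver call ::cuMemPrefetchAsync_v2), assuming a wrapper overload that takes a ::CUmemLocation and a flags word; the method name, field names, and enum members are illustrative.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

class PrefetchV2Sketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var data = new CudaManagedMemory_double(1 << 20, CUmemAttach_flags.Global))
        using (var stream = new CudaStream())
        {
            // Stage the range on the host NUMA node closest to this thread,
            // e.g. ahead of a CPU-side post-processing phase.
            var nearHost = new CUmemLocation
            {
                type = CUmemLocationType.HostNumaCurrent, // assumed enum member
                id = 0                                    // ignored for this type
            };
            data.PrefetchAsync(nearHost, 0, stream.Stream); // assumed: flags must be zero
            stream.Synchronize();
        }
    }
}
```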
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read
+ duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, say, CPU and GPU
+ memory, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time, causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
+
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read
+ duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, say, CPU and GPU
+ memory, the page will eventually be pinned to CPU memory by the Unified Memory driver.
But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - no preference for shared memory or L1 (default) + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
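+ To make the advice semantics above concrete, here is a minimal C# sketch of applying them from managed code before the advice values are enumerated below. It assumes ManagedCuda-style wrapper names (CudaManagedMemory_float, a MemAdvise overload matching the parameters documented here, and the CUmemAdvise enum); treat these names as illustrative assumptions, not a verified API surface.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class MemAdviseSketch
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            {
                // One MiB of floats in unified (managed) memory, visible to host and device.
                var buffer = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

                // Assumption: the context exposes the CUdevice it was created on.
                var device = ctx.Device;

                // Mostly-read data: let the driver keep per-processor read-only copies.
                buffer.MemAdvise(CUmemAdvise.SetReadMostly, device); // hypothetical overload

                // Keep the pages resident on, and mapped for, this device where possible.
                buffer.MemAdvise(CUmemAdvise.SetPreferredLocation, device);
                buffer.MemAdvise(CUmemAdvise.SetAccessedBy, device);

                buffer.Dispose();
            }
        }
    }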
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - prefer larger shared memory and smaller L1 cache + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
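+ The same advice can also be expressed through the ::CUmemLocation-based overload documented here. The sketch below mirrors that shape; the CUmemLocation field and enum names follow the driver structures referenced in the text, while the wrapper overload itself is an assumption, not a verified signature.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class MemAdviseLocationSketch
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            {
                var buffer = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

                // Prefer device 0 as the resident location for these pages. For host NUMA
                // advice, type would be HostNuma and id the NUMA node id instead.
                var location = new CUmemLocation
                {
                    type = CUmemLocationType.Device,
                    id = 0 // device ordinal
                };
                buffer.MemAdvise(CUmemAdvise.SetPreferredLocation, location); // hypothetical overload

                buffer.Dispose();
            }
        }
    }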
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - prefer larger L1 cache and smaller shared memory + Enumerator class for CudaManagedMemory_int1 - + - prefer equal sized L1 cache and shared memory + - + - Occupancy Limiting Factors + - + - occupancy limited due to warps available + - + - occupancy limited due to registers available + - + - occupancy limited due to shared memory available + - + - occupancy limited due to blocks available + A variable located in managed memory. + Type: int2 - + - Partitioned global caching support + Creates a new CudaManagedMemory and allocates the memory on host/device. + In elements + - + - Partitioned global caching is not supported + Creates a new CudaManagedMemory from definition in cu-file. + The module in which the variable is defined. + The variable name as defined in the cu-file. - + - Partitioned global caching is supported + Creates a new CudaManagedMemory from definition in cu-file. + The kernel whose module defines the variable. + The variable name as defined in the cu-file. - + - This is only needed for Pascal. This, and - all references / explanations for this, - should be removed from the header before - exporting to toolkit. + Creates a new CudaManagedMemory from definition in cu-file. + The library in which the variable is defined. + The variable name as defined in the cu-file. - + - Partitioned global caching option + Creates a new CudaManagedMemory from definition in cu-file. + The library that defines the variable. + The variable name as defined in the cu-file. - + - Disable partitioned global caching + For dispose - + - Prefer partitioned global caching + Dispose - + - Force partitioned global caching + For IDisposable + - + - Per function opt in maximum dynamic shared memory limit + UIntPtr to managed memory. - + - Default shmem limit + CUdeviceptr to managed memory. - + - Use the optin shmem limit + Size in bytes - + - Shared memory carveout configurations + Size in elements - + - no preference for shared memory or L1 (default) + Access array per element. + index in elements + - + - prefer maximum available shared memory, minimum L1 cache + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + - prefer maximum available L1 cache, minimum shared memory + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + managed variable + newly allocated host variable with value from managed memory - + - prefer half of maximum available shared memory, with the rest as L1 cache + The on which a pointer was allocated or registered - + - + The describing the physical location of a pointer - + - Active Thread Blocks per Multiprocessor + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + - define cudaOccDeviceState to include any device property needed to be passed - in future GPUs so that user interfaces don't change; hence users are encouraged - to declare the struct zero in order to handle the assignments of any field - that might be added for later GPUs. - + - Align up shared memory based on compute major configurations - + - Shared memory based on the new carveoutConfig API introduced with Volta - + - Shared memory based on config requested by User - + - Return the per block shared memory limit based on function config - + - Partitioned global caching mode support - + - Determine the maximum number of CTAs that can be run simultaneously per SM. - This is equivalent to the calculation done in the CUDA Occupancy Calculator - spreadsheet - + - A function to convert from block size to dynamic shared memory size. - e.g.: - If no dynamic shared memory is used: x => 0 - If 4 bytes shared memory per thread is used: x = 4 * x - block size - size of dynamic shared memory - + - A CudaOccupancy exception is thrown if a CudaOccupancy API method call does not return 0 + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + - + Synchronize every synchronous memory operation initiated on this region - + - + A process-wide unique ID for an allocated memory region - + - + Indicates if the pointer points to managed memory - + - + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr.
This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of + - + + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested.
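+ A short C# sketch of the attach-then-prefetch pattern described above. PrefetchAsync and StreamAttachMemAsync mirror the members documented here; their exact wrapper signatures, and the ctx.Device property, are assumptions rather than a verified API.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class PrefetchSketch
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            using (var stream = new CudaStream())
            {
                var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

                // Associate the whole allocation with this stream (length must be zero),
                // so the CPU may touch it while unrelated streams are still running.
                data.StreamAttachMemAsync(stream.Stream, 0, CUmemAttach_flags.Single);

                // Migrate the pages to the device ahead of the kernels queued on the same
                // stream, so their first access does not page-fault.
                data.PrefetchAsync(ctx.Device, stream.Stream); // assumption: CUdevice overload

                stream.Synchronize();
                data.Dispose();
            }
        }
    }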
+ In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only set up on the dstDevice. + The application can exercise finer control on these mappings using ::cuMemAdvise. + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. - Checks if value is zero.
If value is zero, CudaOccupancyException is thrown. - - - - - + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: byte - - - - - Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc - - In elements - Width including alignment in bytes - In elements - - - - - Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(byte). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(byte). Using cuMemHostAlloc. - - In elements - In elements - - - + - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. 
On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. 
Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. 
Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. 
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that procesor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero alue for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. 
+ The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pointer to pinned host memory. 
+ Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. 
+ When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established.
+ If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
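As a quick illustration of the advice flow just described, a minimal C# sketch against the ManagedCuda-style wrappers this diff documents. The exact MemAdvise overload, the CUmemAdvise and CUmemLocationType member spellings, and the CUmemLocation field names are assumptions, not confirmed by this diff:

using ManagedCuda;
using ManagedCuda.BasicTypes;

class MemAdviseSketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);   // primary context on device 0
        var buf = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

        // Keep pages resident on device 0 once they fault there
        // (CU_MEM_ADVISE_SET_PREFERRED_LOCATION with a device location).
        var loc = new CUmemLocation
        {
            type = CUmemLocationType.Device,  // assumed enum spelling
            id = 0                            // device ordinal
        };
        buf.MemAdvise(CUmemAdvise.SetPreferredLocation, loc); // assumed wrapper overload

        // ... launch kernels that fault the data onto device 0 ...

        buf.Dispose();
    }
}

Note that, per the text above, the advice only steers migration policy on faults; it moves no data by itself.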
- Width in elements
- Height in elements
- Pitch in bytes
- Size in bytes
- Type size in bytes
- Access array per element.
- X-index in elements
- Y-index in elements
- Synchron copy host to 2D Array
- Synchron copy 2D Array to host
- Synchron copy host to device
- Synchron copy device to host
- Synchron Copy host to pitched device
- Asynchron copy host to 2D Array
- Asynchron copy 2D Array to host
- Asynchron Copy host to device
- Asynchron copy device to host
- Asynchron Copy host to pitched device
- Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag
- Device Pointer
- Passes back the flags that were specified when allocating the pinned host buffer
+ Enumerator class for CudaManagedMemory_int2
+ A variable located in managed memory.
+ Type: int3
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel which module defines the variable.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined in.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
+ For dispose
+ Dispose
+ For IDisposable
+ UIntPtr to managed memory.
+ CUdeviceptr to managed memory.
+ Size in bytes
+ Size in elements
+ Access array per element.
+ index in elements
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
+ Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
+ managed variable
+ newly allocated host variable with value from managed memory
+ The context on which a pointer was allocated or registered
+ The memory type describing the physical location of a pointer
+ The address at which a pointer's memory may be accessed on the device
+ Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
+ The address at which a pointer's memory may be accessed on the host
+ A pair of tokens for use with the nv-p2p.h Linux kernel interface
+ Synchronize every synchronous memory operation initiated on this region
+ A process-wide unique ID for an allocated memory region
+ Indicates if the pointer points to managed memory
+ Attach memory to a stream asynchronously
+
+ Enqueues an operation in hStream to specify stream association of
+ length bytes of memory starting from dptr. This function is a
+ stream-ordered operation, meaning that it is dependent on, and will
+ only take effect when, previous work in stream has completed. Any
+ previous association is automatically replaced.
+
+ dptr must point to an address within managed memory space declared
+ using the __managed__ keyword or allocated with cuMemAllocManaged.
+
+ length must be zero, to indicate that the entire allocation's
+ stream association is being changed. Currently, it's not possible
+ to change stream association for a portion of an allocation.
+
+ The stream association is specified using flags which must be
+ one of the ::CUmemAttach_flags values.
+ If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
+ by any stream on any device.
+ If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
+ that it won't access the memory on the device from any stream.
+ If the ::CU_MEM_ATTACH_SINGLE flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of the ::CUmemAttach_flags values
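A hedged sketch of the stream-attach pattern just described, using the wrapper member this diff documents; the AttachMemAsync parameter types and the CudaStream.Stream property are assumptions:

using ManagedCuda;
using ManagedCuda.BasicTypes;

class AttachSketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        using var stream = new CudaStream();
        var buf = new CudaManagedMemory_int3(256, CUmemAttach_flags.Global);

        // Associate the whole allocation with one stream; length must be 0.
        // Attaching Single to the NULL stream is illegal, so a real stream is used.
        buf.AttachMemAsync(stream.Stream, 0, CUmemAttach_flags.Single); // assumed signature

        // Work queued on 'stream' may use 'buf'; once the stream drains,
        // the CPU may touch 'buf' even while other streams stay busy.
        stream.Synchronize();
        buf.Dispose();
    }
}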
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
+
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cuMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
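A minimal prefetch round-trip in the same hedged style; the PrefetchAsync wrapper name, the CUdevice field layout, and the CU_DEVICE_CPU sentinel value of -1 (which is what the CUDA driver defines) are assumptions as far as this diff is concerned:

using ManagedCuda;
using ManagedCuda.BasicTypes;

class PrefetchSketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        using var stream = new CudaStream();
        var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

        var gpu0 = new CUdevice { Pointer = 0 };  // assumption: struct field name
        var cpu = new CUdevice { Pointer = -1 };  // CU_DEVICE_CPU

        // Migrate to the GPU before the kernels that read it are launched.
        data.PrefetchAsync(gpu0, stream.Stream);  // assumed wrapper name/signature

        // ... enqueue kernels on 'stream' ...

        // Prefetch back to host memory before CPU-side reads.
        data.PrefetchAsync(cpu, stream.Stream);
        stream.Synchronize();
        data.Dispose();
    }
}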
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
- A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.
- Type: uchar1
- Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc.
- In elements
- Width including alignment in bytes
- In elements
- Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags.
- In elements
- Width including alignment in bytes
- In elements
- Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host.
- Pitch is assumed to be width * sizeof(uchar1). Using cuMemHostAlloc without flags.
- In elements
- In elements
- Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host.
- Pitch is assumed to be width * sizeof(uchar1). Using cuMemHostAlloc.
- In elements
- In elements
+ Destination location to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
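The _v2 prefetch takes a CUmemLocation rather than a device, so host NUMA nodes become addressable targets. A hedged sketch; the overload shape, enum spellings, and field names are assumptions:

using ManagedCuda;
using ManagedCuda.BasicTypes;

class PrefetchV2Sketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        using var stream = new CudaStream();
        var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

        // Prefetch toward the NUMA node nearest the calling thread's CPU.
        var nearCpu = new CUmemLocation
        {
            type = CUmemLocationType.HostNumaCurrent, // assumed enum spelling
            id = 0                                    // ignored for this type
        };
        data.PrefetchAsync(nearCpu, 0 /* flags: must be zero */, stream.Stream); // assumed overload

        stream.Synchronize();
        data.Dispose();
    }
}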
- For dispose
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read-
+ duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, say, CPU and GPU
+ memory, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time, causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
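The pre-v2 advise overload above takes a plain device rather than a CUmemLocation. A hedged sketch of the read-mostly case; the MemAdvise overload, enum spelling, and CUdevice field name are assumptions:

using ManagedCuda;
using ManagedCuda.BasicTypes;

class MemAdviseDeviceSketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        var lut = new CudaManagedMemory_float(4096, CUmemAttach_flags.Global);

        // Mostly-read lookup table: let every accessing processor keep a
        // read-only copy. The device argument is ignored for SetReadMostly.
        var gpu0 = new CUdevice { Pointer = 0 };            // assumption: field name
        lut.MemAdvise(CUmemAdvise.SetReadMostly, gpu0);     // assumed overload

        lut.Dispose();
    }
}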
- Dispose
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+ The advice values and their semantics are the same as for ::cuMemAdvise as described above.
+ managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- For IDisposable
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The advice values and the \p location semantics are the same as for ::cuMemAdvise_v2 as described above.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Pointer to pinned host memory.
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr. The advice values and the \p location semantics are the same as for
+ ::cuMemAdvise_v2 as described above.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Width in elements
- Height in elements
- Pitch in bytes
- Size in bytes
- Type size in bytes
- Access array per element.
- X-index in elements
- Y-index in elements
- Synchron copy host to 2D Array
- Synchron copy 2D Array to host
+ Enumerator class for CudaManagedMemory_int3
+ A variable located in managed memory.
+ Type: int4
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel which module defines the variable.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined in.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
- Synchron copy host to device
- Synchron copy device to host
- Synchron Copy host to pitched device
- Asynchron copy host to 2D Array
- Asynchron copy 2D Array to host
- Asynchron Copy host to device
- Asynchron copy device to host
- Asynchron Copy host to pitched device
- Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag
- Device Pointer
- Passes back the flags that were specified when allocating the pinned host buffer
+ For dispose
+ Dispose
+ For IDisposable
+ UIntPtr to managed memory.
+ CUdeviceptr to managed memory.
+ Size in bytes
+ Size in elements
+ Access array per element.
+ index in elements
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
+ Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
+ managed variable
+ newly allocated host variable with value from managed memory
+ The context on which a pointer was allocated or registered
+ The memory type describing the physical location of a pointer
+ The address at which a pointer's memory may be accessed on the device
+ Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
+ The address at which a pointer's memory may be accessed on the host
+ A pair of tokens for use with the nv-p2p.h Linux kernel interface
+ Synchronize every synchronous memory operation initiated on this region
+ A process-wide unique ID for an allocated memory region
+ Indicates if the pointer points to managed memory
+ Attach memory to a stream asynchronously
+ The semantics and constraints are the same as for ::cuStreamAttachMemAsync as described above.
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of the ::CUmemAttach_flags values
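To ground the member list above, a small usage sketch of the managed-memory wrapper (constructor, indexer, size property). The int4 vector type and the exact member names are assumptions read off the summaries, not confirmed signatures:

using System;
using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.VectorTypes;

class ManagedVarSketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        var vec = new CudaManagedMemory_int4(16, CUmemAttach_flags.Global);

        // "Access array per element": the indexer reads/writes by element index.
        vec[0] = new int4(1, 2, 3, 4);   // assumed int4 constructor
        int4 first = vec[0];

        // Size bookkeeping exposed by the wrapper (names per the summaries above).
        Console.WriteLine($"{vec.Size} bytes, first.x = {first.x}");

        vec.Dispose();
    }
}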
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device; the semantics are the same as for ::cuMemPrefetchAsync as described above.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location; the semantics and constraints are the same as for ::cuMemPrefetchAsync_v2 as described above.
- A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.
- Type: uchar2
- Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc.
- In elements
- Width including alignment in bytes
- In elements
Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar2). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar2). Using cuMemHostAlloc. - - In elements - In elements - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read- + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices.
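Read-mostly is the advice that pays off most often in practice: a lookup table read by several processors gets duplicated per processor instead of ping-ponging between them. A sketch under the same naming assumptions as the earlier prefetch example (the Advise overload and the CUmemAdvise enum spelling follow this file's parameter lists and are not verified against a specific release):

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class AdviseSketch
    {
        // Mark a managed lookup table read-mostly so each accessing processor
        // keeps a read-only copy; a write invalidates all copies but one.
        public static void MarkReadMostly(CudaManagedMemory_float table, CUdevice device)
        {
            // 'device' is ignored for SetReadMostly (see the description above),
            // but the overload documented here still takes one.
            table.Advise(table.DevicePointer, table.SizeInBytes,
                         CUmemAdvise.SetReadMostly, device);
        }
    }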
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pointer to pinned host memory.
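What the _v2 overload buys over the older one is the CUmemLocation target, which can name the host or a specific host NUMA node rather than only a device ordinal. A sketch of setting a preferred host location (the CUmemLocation field and enum member spellings mirror the driver names quoted above and are assumptions, not a verified ManagedCuda surface):

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class AdviseV2Sketch
    {
        // Prefer host memory for a managed region: on a fault, pages migrate
        // back toward the host instead of pinning on the last GPU that faulted.
        public static void PreferHost(CudaManagedMemory_float buffer)
        {
            var location = new CUmemLocation
            {
                type = CUmemLocationType.Host, // CU_MEM_LOCATION_TYPE_HOST: id is ignored
                id = 0
            };

            buffer.Advise(buffer.DevicePointer, buffer.SizeInBytes,
                          CUmemAdvise.SetPreferredLocation, location);
        }
    }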
+ Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location.
When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established.
If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Width in elements + Enumerator class for CudaManagedMemory_int4 - + - Height in elements + + - + - Pitch in bytes + - + - Size in bytes + - + - Type size in bytes + - + - Access array per element. + - X-index in elements - Y-index in elements - - + - Synchron copy host to 2D Array + A variable located in managed memory. + Type: uint + - + - Synchron copy host to 2D Array + Creates a new CudaManagedMemory and allocates the memory on host/device. + In elements + - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. + The module in which the variable is defined. + The variable name as defined in the cu-file. - - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel whose module defines the variable.
+ The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. - + The library in which the variable is defined. + The variable name as defined in the cu-file. - + - Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy host to device + For dispose - + - Synchron copy host to device + Dispose - + - Synchron copy device to host + For IDisposable - + - Synchron copy device to host + UIntPtr to managed memory. - + - Synchron Copy host to pitched device + CUdeviceptr to managed memory. - - + - Synchron Copy host to pitched device + Size in bytes - + - Synchron copy device to host + Size in elements - - + - Synchron copy device to host + Access array per element. - + index in elements + - + - Asynchron copy host to 2D Array + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - + - Asynchron copy host to 2D Array + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy 2D Array to host + The on which a pointer was allocated or registered - - + - Asynchron copy 2D Array to host + The describing the physical location of a pointer - - + - Asynchron Copy host to device + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the host - - + - Asynchron Copy host to device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - + - Asynchron Copy host to pitched device + A process-wide unique ID for an allocated memory region - - - + - Asynchron Copy host to pitched device + Indicates if the pointer points to managed memory - - + - Asynchron copy device to host + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream.
+ If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of - - - Enumerator class for CudaPageLockedHostMemory2D_uchar1 - - - + + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. - - - - - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only set up on the dstDevice. + The application can exercise finer control on these mappings using ::cuMemAdvise. + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+ + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior, however, + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If, however, the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. - - - - - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: uchar3 - - - - - Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc - - In elements - Width including alignment in bytes - In elements - + Destination device to prefetch to + flags for future use, must be zero now.
+ Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar3). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar3). Using cuMemHostAlloc. - - In elements - In elements - - - - - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read- + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices.
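The peer-to-peer scenario above is where the accessed-by advice earns its keep: it builds the mapping ahead of time without moving any pages. A sketch, under the same naming assumptions as the earlier examples (the Advise member and enum spellings are taken from this file's parameter lists, not verified):

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class AccessedBySketch
    {
        // Map a managed region into a peer GPU's page tables up front so its
        // occasional reads proceed without faults and without page migration.
        public static void MapForPeer(CudaManagedMemory_float data, CUdevice peer)
        {
            data.Advise(data.DevicePointer, data.SizeInBytes,
                        CUmemAdvise.SetAccessedBy, peer);
        }
    }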
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pointer to pinned host memory. 
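To make the advise-and-prefetch flow documented above concrete, a minimal C# sketch follows. It leans only on members documented in this file (an element indexer, DevicePointer, SizeInBytes, MemAdvise, PrefetchAsync on the managed-memory wrappers), but the concrete type and enum names (CudaContext, CudaStream, CudaManagedMemory_float, CUmemAttach_flags.Global, CUmemAdvise.SetReadMostly, the ctx.Device property) are assumptions for illustration, not verified API:

    // Sketch only: the wrapper, enum and property names below are assumptions
    // based on the member docs in this file, not a verified ManagedCuda API.
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    var ctx = new CudaContext(0);          // device ordinal 0
    var stream = new CudaStream();
    var dev = ctx.Device;                  // hypothetical convenience property

    // Managed allocation (cuMemAllocManaged), visible to host and device.
    int n = 1 << 20;
    var data = new CudaManagedMemory_float(n, CUmemAttach_flags.Global);
    for (int i = 0; i < n; i++) data[i] = 1.0f;   // initialize on the host

    // Read-mostly data: each accessing processor may keep a read-only copy.
    data.MemAdvise(data.DevicePointer, data.SizeInBytes, CUmemAdvise.SetReadMostly, dev);

    // Migrate the pages to the GPU before the first kernel touches them.
    data.PrefetchAsync(dev, stream);
    stream.Synchronize();

As the text above notes, a write from any processor invalidates the other read-only copies, so this pattern pays off only for data that really is read far more often than it is written.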
+ Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location.
When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established.
If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Width in elements + Enumerator class for CudaManagedMemory_uint - + - Height in elements + + - + - Pitch in bytes + - + - Size in bytes + - + - Type size in bytes + - + - Access array per element. + - X-index in elements - Y-index in elements - - - Synchron copy host to 2D Array - - - - + - Synchron copy host to 2D Array + A variable located in managed memory. + Type: uint1 - - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory and allocates the memory on host/device. - + In elements + - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel which module defines the variable.
+ The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + For dispose - - + - Synchron Copy host to pitched device + Dispose - - - + - Synchron Copy host to pitched device + For IDisposable - + - + - Synchron copy device to host + UIntPtr to managed memory. - - - + - Synchron copy device to host + CUdeviceptr to managed memory. - - + - Asynchron copy host to 2D Array + Size in bytes - - - + - Asynchron copy host to 2D Array + Size in elements - - - + - Asynchron copy 2D Array to host + Access array per element. - - + index in elements + - + - Asynchron copy 2D Array to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to device + The describing the physical location of a pointer - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron Copy host to pitched device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Indicates if the pointer points to managed memory - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of - - - Enumerator class for CudaPageLockedHostMemory2D_uchar3 - - - + + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. - - - - - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. 
+ + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - - - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: uchar4 - - - - Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host. Using cuMemHostAlloc - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host.
Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar4). Using cuMemHostAlloc without flags. - - In elements - In elements + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar4). Using cuMemHostAlloc. - - In elements - In elements - - - - - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. 
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pointer to pinned host memory.
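For the overloads above that take a location rather than a bare device ordinal, the advice target is expressed as a ::CUmemLocation. Continuing the earlier sketch in the same assumed C# style (the CUmemLocation field and enum names mirror the ::CUmemLocation::type / ::CUmemLocation::id text above and are assumptions, as is the location-based PrefetchAsync overload with its must-be-zero flags argument):

    // Sketch only: field, enum and overload names below are assumptions
    // derived from the doc text above, not a verified ManagedCuda API.
    var loc = new CUmemLocation
    {
        type = CUmemLocationType.Device,   // ::CU_MEM_LOCATION_TYPE_DEVICE
        id = 0                             // valid device ordinal
    };

    // Prefer device 0 for these pages; migration still happens lazily,
    // on fault or on an explicit prefetch, never at advise time.
    data.MemAdvise(data.DevicePointer, data.SizeInBytes, CUmemAdvise.SetPreferredLocation, loc);

    // The location-based prefetch also takes a flags word (must be zero now).
    data.PrefetchAsync(loc, 0, stream);
    stream.Synchronize();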
+ Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location.
When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established.
If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Width in elements + Enumerator class for CudaManagedMemory_uint1 - + - Height in elements + + - + - Pitch in bytes + - + - Size in bytes + - + - Type size in bytes + - + - Access array per element. + - X-index in elements - Y-index in elements - - - Synchron copy host to 2D Array - - - - + - Synchron copy host to 2D Array + A variable located in managed memory. + Type: uint2 - - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory and allocates the memory on host/device. - + In elements + - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel which module defines the variable.
+ The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + For dispose - - + - Synchron Copy host to pitched device + Dispose - - - + - Synchron Copy host to pitched device + For IDisposable - + - + - Synchron copy device to host + UIntPtr to managed memory. - - - + - Synchron copy device to host + CUdeviceptr to managed memory. - - + - Asynchron copy host to 2D Array + Size in bytes - - - + - Asynchron copy host to 2D Array + Size in elements - - - + - Asynchron copy 2D Array to host + Access array per element. - - + index in elements + - + - Asynchron copy 2D Array to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to device + The describing the physical location of a pointer - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron Copy host to pitched device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Indicates if the pointer points to managed memory - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always . Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of
- - - Enumerator class for CudaPageLockedHostMemory2D_uchar4 - - - +
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
- - - - - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
- - - - + If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cudaMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
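To make the attach and prefetch semantics above concrete, here is a hedged driver-API sketch (standard CUDA toolkit headers assumed; error checking elided; the stream flags and sizes are illustrative). Note that the length argument of cuStreamAttachMemAsync must be zero, as documented above:

    #include <cuda.h>

    int main(void)
    {
        CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr p;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
        // Host-visible until attached to a device stream.
        cuMemAllocManaged(&p, 1 << 20, CU_MEM_ATTACH_HOST);
        // Re-associate the whole allocation with this stream (length must be 0).
        cuStreamAttachMemAsync(stream, p, 0, CU_MEM_ATTACH_SINGLE);
        // ... enqueue kernels that use p into 'stream' ...
        cuStreamSynchronize(stream);   // after this, CPU access is legal again
        cuMemFree(p);
        cuStreamDestroy(stream);
        cuCtxDestroy(ctx);
        return 0;
    }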
- - + + + Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
- + Destination device to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
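A hedged sketch contrasting the two prefetch entry points described above (the _v2 variant and CUmemLocation are assumed to be available, i.e. CUDA 12.2+; error checking elided):

    #include <cuda.h>

    int main(void)
    {
        CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr p;
        size_t bytes = 1 << 20;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuStreamCreate(&stream, CU_STREAM_DEFAULT);
        cuMemAllocManaged(&p, bytes, CU_MEM_ATTACH_GLOBAL);
        // Classic form: target is a device ordinal (CU_DEVICE_CPU would target host memory).
        cuMemPrefetchAsync(p, bytes, dev, stream);
        // _v2 form: target is a CUmemLocation; the flags argument must currently be zero.
        CUmemLocation loc;
        loc.type = CU_MEM_LOCATION_TYPE_HOST;   // or CU_MEM_LOCATION_TYPE_HOST_NUMA with a NUMA id
        loc.id = 0;                             // ignored for CU_MEM_LOCATION_TYPE_HOST
        cuMemPrefetchAsync_v2(p, bytes, loc, 0, stream);
        cuStreamSynchronize(stream);
        cuMemFree(p);
        cuStreamDestroy(stream);
        cuCtxDestroy(ctx);
        return 0;
    }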
- + - A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.
- Type: sbyte
- - - - - Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc
- In elements
- Width including alignment in bytes
- In elements
- - - - Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags.
- In elements
- Width including alignment in bytes
- In elements
- - - Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host, using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(sbyte).
- In elements
- In elements
- - - Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host, using cuMemHostAlloc.
- Pitch is assumed to be width * sizeof(sbyte).
- In elements
- In elements
- - - - For dispose
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read
+ duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU
+ memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted to.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
+ managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- + - For IDisposable
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se.
Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
- + Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
+ -
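To illustrate the \p location-based overload just documented, a hedged driver-API sketch (cuMemAdvise_v2 and CUmemLocation are assumed available, i.e. CUDA 12.2+; error checking elided): it pins the preferred location to GPU memory while keeping a host mapping so that occasional CPU reads avoid faults.

    #include <cuda.h>

    int main(void)
    {
        CUdevice dev; CUcontext ctx; CUdeviceptr p;
        size_t bytes = 1 << 20;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuMemAllocManaged(&p, bytes, CU_MEM_ATTACH_GLOBAL);
        // Prefer device memory of GPU 0 as the home of these pages.
        CUmemLocation preferred;
        preferred.type = CU_MEM_LOCATION_TYPE_DEVICE;
        preferred.id = (int)dev;
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, preferred);
        // Also keep a CPU mapping established so host accesses do not fault.
        CUmemLocation host;
        host.type = CU_MEM_LOCATION_TYPE_HOST;
        host.id = 0;   // ignored for CU_MEM_LOCATION_TYPE_HOST
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, host);
        cuMemFree(p);
        cuCtxDestroy(ctx);
        return 0;
    }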
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location.
When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established.
If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
+ - + - Width in elements
+ Enumerator class for CudaManagedMemory_uint2
- + - Height in elements + + - + - Pitch in bytes + - + - Size in bytes + - + - Type size in bytes + - + - Access array per element.
+ - X-index in elements
- Y-index in elements
- - - Synchronous copy host to 2D Array - - - - + - Synchronous copy host to 2D Array
+ A variable located in managed memory.
+ Type: uint3
- - + - Synchronous copy 2D Array to host
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
- + In elements + - + - Synchronous copy 2D Array to host
+ Creates a new CudaManagedMemory from a definition in a cu-file.
- + The module in which the variable is defined.
+ The variable name as defined in the cu-file.
- + - Synchronous copy host to device
+ Creates a new CudaManagedMemory from a definition in a cu-file.
- + The kernel whose module defines the variable.
+ The variable name as defined in the cu-file.
- + - Synchronous copy host to device
+ Creates a new CudaManagedMemory from a definition in a cu-file.
- + The library in which the variable is defined.
+ The variable name as defined in the cu-file.
- + - Synchronous copy device to host
+ Creates a new CudaManagedMemory from a definition in a cu-file.
- + The library that defines the variable.
+ The variable name as defined in the cu-file.
- + - Synchronous copy device to host
+ For dispose
- - + - Synchronous copy host to pitched device
+ Dispose
- - - + - Synchronous copy host to pitched device
+ For IDisposable
- + - + - Synchronous copy device to host
+ UIntPtr to managed memory.
- - - + - Synchronous copy device to host
+ CUdeviceptr to managed memory.
- - + - Asynchronous copy host to 2D Array
+ Size in bytes
- - - + - Asynchronous copy host to 2D Array
+ Size in elements
- - - + - Asynchronous copy 2D Array to host
+ Access array per element.
- - + index in elements + - + - Asynchronous copy 2D Array to host
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
- - - + - Asynchronous copy host to device
+ Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
- - + managed variable
+ newly allocated host variable with value from managed memory
- + - Asynchronous copy device to host
+ The context on which a pointer was allocated or registered
- - - + - Asynchronous copy host to device
+ The memory type describing the physical location of a pointer
- - - + - Asynchronous copy device to host
+ The address at which a pointer's memory may be accessed on the device
+ Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
- - - + - Asynchronous copy host to pitched device
+ The address at which a pointer's memory may be accessed on the host
- - - - + - Asynchronous copy host to pitched device
+ A pair of tokens for use with the nv-p2p.h Linux kernel interface
- - - + - Asynchronous copy device to host
+ Synchronize every synchronous memory operation initiated on this region
- - - - + - Asynchronous copy device to host
+ A process-wide unique ID for an allocated memory region
- - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag
+ Indicates if the pointer points to managed memory
- Device Pointer
- + - Passes back the flags that were specified when allocating the pinned host buffer
+ Attach memory to a stream asynchronously
+
+ Enqueues an operation in hStream to specify stream association of
+ length bytes of memory starting from dptr. This function is a
+ stream-ordered operation, meaning that it is dependent on, and will
+ only take effect when, previous work in stream has completed. Any
+ previous association is automatically replaced.
+
+ dptr must point to an address within managed memory space declared
+ using the __managed__ keyword or allocated with cuMemAllocManaged.
+
+ length must be zero, to indicate that the entire allocation's
+ stream association is being changed. Currently, it's not possible
+ to change stream association for a portion of an allocation.
+
+ The stream association is specified using flags which must be
+ one of .
+ If the flag is specified, the memory can be accessed
+ by any stream on any device.
+ If the flag is specified, the program makes a guarantee
+ that it won't access the memory on the device from any stream.
+ If the flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always . Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of
- - - Enumerator class for CudaPageLockedHostMemory2D_sbyte - - - +
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
- - - - - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
- - - - + If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cudaMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
- - + + + Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
- - - - - A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.
- Type: char1
- + Destination device to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
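Both prefetch variants above require ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS on the target device, so a cautious caller can query the attributes first. A minimal sketch (driver API; device ordinal 0 assumed; error checking elided):

    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
        CUdevice dev;
        int concurrent = 0, pageable = 0;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuDeviceGetAttribute(&concurrent, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, dev);
        cuDeviceGetAttribute(&pageable, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
        printf("concurrent managed access: %d, pageable memory access: %d\n", concurrent, pageable);
        // Only enqueue cuMemPrefetchAsync / cuMemAdvise targeting this device
        // when 'concurrent' is non-zero.
        return 0;
    }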
- + - Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. 
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - In elements - Width including alignment in bytes - In elements - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char1). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char1). Using cuMemHostAlloc. - - In elements - In elements - - - - - For dispose - - - - - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
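The device-ordinal overload documented above maps onto ::cuMemAdvise. A hedged sketch of the read-mostly pattern it describes (driver API, CUDA 8.0+ assumed; error checking elided):

    #include <cuda.h>

    int main(void)
    {
        CUdevice dev; CUcontext ctx; CUdeviceptr p;
        size_t bytes = 1 << 20;
        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuMemAllocManaged(&p, bytes, CU_MEM_ATTACH_GLOBAL);
        // Mostly read, rarely written: readers may get read-only copies.
        // The device argument is ignored for this particular advice.
        cuMemAdvise(p, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);
        // Prefer CPU memory as the home location of these pages.
        cuMemAdvise(p, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, CU_DEVICE_CPU);
        // Keep a mapping on GPU 'dev' so its occasional accesses do not fault.
        cuMemAdvise(p, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, dev);
        cuMemFree(p);
        cuCtxDestroy(ctx);
        return 0;
    }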
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
+ managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- 
+ 
- For IDisposable
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+ 
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ 
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
- 
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+ 
- 
+ 
- Pointer to pinned host memory.
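A note on usage: the overloads documented above wrap the driver's ::cuMemAdvise entry point, and the advice values combine naturally with ::cuMemAllocManaged and ::cuMemPrefetchAsync. The sketch below shows the driver-level call sequence in C; it is illustrative only (error handling elided; the buffer size and device ordinal 0 are assumptions for the example, not values taken from this library).

    #include <cuda.h>
    #include <stddef.h>

    int main(void)
    {
        CUdevice dev;
        CUcontext ctx;
        CUdeviceptr p;
        size_t bytes = (size_t)1 << 20;   /* 1 MiB, arbitrary for the example */

        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);

        /* Managed (unified) allocation, accessible from host and device. */
        cuMemAllocManaged(&p, bytes, CU_MEM_ATTACH_GLOBAL);

        /* Mostly-read data: allow read-only copies per accessing processor.
           The device argument is ignored for this particular advice. */
        cuMemAdvise(p, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);

        /* Keep the range mapped in dev's page tables to avoid access faults. */
        cuMemAdvise(p, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, dev);

        /* Optionally migrate the pages to dev before the first kernel uses them. */
        cuMemPrefetchAsync(p, bytes, dev, NULL);
        cuCtxSynchronize();

        cuMemFree(p);
        cuCtxDestroy(ctx);
        return 0;
    }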
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+ 
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ 
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+ 
- 
+ 
- Width in elements
+ Enumerator class for CudaManagedMemory_uint3
- 
+ 
- Height in elements
+ 
+ 
- 
+ 
- Pitch in bytes
+ 
- 
+ 
- Size in bytes
+ 
- 
+ 
- Type size in bytes
+ 
- 
+ 
- Access array per element.
+ 
- X-index in elements
- Y-index in elements
- 
- 
- Synchron copy host to 2D Array
- 
- 
- 
- 
+ 
- Synchron copy host to 2D Array
+ A variable located in managed memory.
+ Type: uint4
- 
- 
+ 
- Synchron copy 2D Array to host
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
- 
+ In elements
+ 
- 
+ 
- Synchron copy 2D Array to host
+ Creates a new CudaManagedMemory from definition in cu-file.
- 
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
- 
+ 
- Synchron copy host to device
+ Creates a new CudaManagedMemory from definition in cu-file.
- 
+ The kernel whose module defines the variable.
+ The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + For dispose - - + - Synchron Copy host to pitched device + Dispose - - - + - Synchron Copy host to pitched device + For IDisposable - + - + - Synchron copy device to host + UIntPtr to managed memory. - - - + - Synchron copy device to host + CUdeviceptr to managed memory. - - + - Asynchron copy host to 2D Array + Size in bytes - - - + - Asynchron copy host to 2D Array + Size in elements - - - + - Asynchron copy 2D Array to host + Access array per element. - - + index in elements + - + - Asynchron copy 2D Array to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to device + The describing the physical location of a pointer - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron Copy host to pitched device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Indicates if the pointer points to managed memory - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+ 
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+ 
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+ 
+ It is a program's responsibility to order calls to 
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+ 
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always . Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+ 
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of 
+ 
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
- 
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cuMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+ 
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+ 
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ 
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
- 
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ 
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+ 
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+ 
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+ 
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+ 
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+ 
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
- 
- 
- 
- 
- A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.
- Type: char2
- 
- 
- Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host. Using cuMemHostAlloc
- In elements
- Width including alignment in bytes
- In elements
- 
- 
- 
- 
- Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host.
Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char2). Using cuMemHostAlloc without flags. - - In elements - In elements + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char2). Using cuMemHostAlloc. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. 
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - In elements - In elements - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - For dispose - - - - - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
+ managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- 
+ 
- For IDisposable
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+ 
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ 
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
- 
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+ 
- 
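The _v2 overloads documented above differ from the older ones only in taking a ::CUmemLocation instead of a bare device ordinal, which is what makes host and host-NUMA targets expressible. A driver-level sketch in C of the same pattern follows; it assumes CUDA 12.2 or later (where ::cuMemAdvise_v2 and ::cuMemPrefetchAsync_v2 were introduced) and device ordinal 0, with error handling elided.

    #include <cuda.h>
    #include <stddef.h>

    int main(void)
    {
        CUdevice dev;
        CUcontext ctx;
        CUdeviceptr p;
        size_t bytes = (size_t)1 << 20;
        CUmemLocation loc;

        cuInit(0);
        cuDeviceGet(&dev, 0);
        cuCtxCreate(&ctx, 0, dev);
        cuMemAllocManaged(&p, bytes, CU_MEM_ATTACH_GLOBAL);

        /* Prefer host memory on the NUMA node closest to the calling thread. */
        loc.type = CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT;
        loc.id = 0;                           /* ignored for this location type */
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, loc);

        /* Keep the range mapped in device 0's page tables even while host-resident. */
        loc.type = CU_MEM_LOCATION_TYPE_DEVICE;
        loc.id = 0;                           /* device ordinal */
        cuMemAdvise_v2(p, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, loc);

        /* Location-based prefetch; the flags argument must currently be zero. */
        cuMemPrefetchAsync_v2(p, bytes, loc, 0, NULL);
        cuCtxSynchronize();

        cuMemFree(p);
        cuCtxDestroy(ctx);
        return 0;
    }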
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
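The advice flags above are easiest to see in wrapper form. Below is a minimal sketch of applying read-mostly advice through a ManagedCuda-style managed-memory wrapper; the names used here (CudaManagedMemory_float, MemAdvise, CUmemoryAdvise, CUmemLocation, DevicePointer, Size) are assumptions taken from the documentation above and should be checked against the actual API surface.

```csharp
// Sketch only: wrapper, method and enum names are assumed from the docs above,
// not verified against a specific ManagedCuda release.
using ManagedCuda;
using ManagedCuda.BasicTypes;

class MemAdviseSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var data = new CudaManagedMemory_float(1024, CUmemAttachFlags.Global))
        {
            // Describe the processor the advice refers to (here: device ordinal 0).
            var location = new CUmemLocation
            {
                type = CUmemLocationType.Device,
                id = 0
            };

            // Read-mostly: each reading processor gets its own read-only copy of
            // the pages instead of migrating them back and forth on every access.
            data.MemAdvise(data.DevicePointer, data.Size, CUmemoryAdvise.SetReadMostly, location);
        }
    }
}
```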
- Width in elements
+ Enumerator class for CudaManagedMemory_uint4
- Height in elements
- Pitch in bytes
- Size in bytes
- Type size in bytes
- Access array per element.
- X-index in elements
- Y-index in elements
- Synchronous copy host to 2D Array
- Synchronous copy host to 2D Array
+ A variable located in managed memory.
+ Type: long
- Synchronous copy 2D Array to host
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
- Synchronous copy 2D Array to host
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
- Synchronous copy host to device
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel whose module defines the variable.
+ The variable name as defined in the cu-file.
- Synchronous copy host to device
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined in.
+ The variable name as defined in the cu-file.
- Synchronous copy device to host
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
- Synchronous copy device to host
+ For dispose
- Synchronous copy host to pitched device
+ Dispose
- Synchronous copy host to pitched device
+ For IDisposable
- Synchronous copy device to host
+ UIntPtr to managed memory.
- Synchronous copy device to host
+ CUdeviceptr to managed memory.
- Asynchronous copy host to 2D Array
+ Size in bytes
- Asynchronous copy host to 2D Array
+ Size in elements
- Asynchronous copy 2D Array to host
+ Access array per element.
+ Index in elements
- Asynchronous copy 2D Array to host
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
- Asynchronous copy host to device
+ Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
+ Managed variable
+ Newly allocated host variable with value from managed memory
- Asynchronous copy device to host
+ The context on which a pointer was allocated or registered
- Asynchronous copy host to device
+ The memory type describing the physical location of a pointer
- Asynchronous copy device to host
+ The address at which a pointer's memory may be accessed on the device
+ Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
- Asynchronous copy host to pitched device
+ The address at which a pointer's memory may be accessed on the host
- Asynchronous copy host to pitched device
+ A pair of tokens for use with the nv-p2p.h Linux kernel interface
- Asynchronous copy device to host
+ Synchronize every synchronous memory operation initiated on this region
- Asynchronous copy device to host
+ A process-wide unique ID for an allocated memory region
- Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag
+ Indicates if the pointer points to managed memory
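As a usage illustration for the managed-memory wrapper documented above, here is a hedged sketch of allocating a CudaManagedMemory_long and touching it from the host through the per-element indexer; the constructor and indexer follow the documentation above, and the exact signatures are assumptions.

```csharp
// Sketch: unified memory is host-accessible, so no explicit memcpy is needed
// to read or write individual elements from the CPU.
using ManagedCuda;
using ManagedCuda.BasicTypes;

class ManagedMemorySketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var values = new CudaManagedMemory_long(16, CUmemAttachFlags.Global))
        {
            for (int i = 0; i < 16; i++)
                values[i] = (long)i * i;   // indexer documented as "Access array per element"

            // Per the docs, converting a managed variable to a host value
            // uses only the first element of the allocation.
            long first = values[0];
        }
    }
}
```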
- Passes back the flags that were specified when allocating the pinned host buffer
+ Attach memory to a stream asynchronously
+
+ Enqueues an operation in hStream to specify stream association of
+ length bytes of memory starting from dptr. This function is a
+ stream-ordered operation, meaning that it is dependent on, and will
+ only take effect when, previous work in stream has completed. Any
+ previous association is automatically replaced.
+
+ dptr must point to an address within managed memory space declared
+ using the __managed__ keyword or allocated with cuMemAllocManaged.
+
+ length must be zero, to indicate that the entire allocation's
+ stream association is being changed. Currently, it's not possible
+ to change stream association for a portion of an allocation.
+
+ The stream association is specified using flags, which must be
+ one of the ::CUmemAttach_flags values.
+ If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
+ by any stream on any device.
+ If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
+ that it won't access the memory on the device from any stream.
+ If the ::CU_MEM_ATTACH_SINGLE flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of the ::CUmemAttach_flags values
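A hedged sketch of the per-stream association described above follows: after attaching with the single-stream flag, only work in that stream may touch the allocation on the device, and host access becomes legal once the stream has drained. The method name StreamAttachMemAsync and the CUmemAttachFlags members are taken from the documentation above and are not verified against a particular release.

```csharp
// Sketch of per-stream attachment; names assumed from the docs above.
using ManagedCuda;
using ManagedCuda.BasicTypes;

class AttachSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        using (var data = new CudaManagedMemory_long(1024, CUmemAttachFlags.Global))
        {
            // Length must be zero: the whole allocation changes association.
            data.StreamAttachMemAsync(stream.Stream, 0, CUmemAttachFlags.Single);

            // ... launch kernels on 'stream' that use 'data' ...

            // Once the stream has drained, CPU access is legal again.
            stream.Synchronize();
        }
    }
}
```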
- Enumerator class for CudaPageLockedHostMemory2D_char2
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
+
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cuMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+ Destination location to prefetch to
+ Flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
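As a usage illustration, here is a hedged sketch of an explicit prefetch before first device access. The PrefetchAsync method follows the documentation above; the CUdevice handle construction is a hypothetical detail (the field layout of the handle struct is assumed, not verified).

```csharp
// Sketch of an explicit prefetch ahead of a kernel launch.
using ManagedCuda;
using ManagedCuda.BasicTypes;

class PrefetchSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        using (var data = new CudaManagedMemory_long(1 << 20, CUmemAttachFlags.Global))
        {
            var device = new CUdevice { Pointer = 0 }; // hypothetical handle for device 0

            // Migrate the pages before the kernel touches them, so the launch
            // does not pay for demand-paging faults on first access.
            data.PrefetchAsync(device, stream.Stream);

            // ... launch kernels on 'stream' that read 'data' ...
            stream.Synchronize();
        }
    }
}
```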
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any
+ read-duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU
+ memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time, causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
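The multi-GPU scenario described above (occasional peer reads where faults, not locality, are the cost) translates into two advice calls. This is a hedged sketch: the device-based MemAdvise overload and the CUmemoryAdvise member names are assumptions based on the documentation, and the CUdevice handle construction is hypothetical.

```csharp
// Sketch: keep the pages on one GPU but pre-map them on a peer so that the
// peer's infrequent reads do not take page faults.
using ManagedCuda;
using ManagedCuda.BasicTypes;

class AccessedBySketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var data = new CudaManagedMemory_long(1 << 20, CUmemAttachFlags.Global))
        {
            var owner = new CUdevice { Pointer = 0 }; // hypothetical device handles
            var peer  = new CUdevice { Pointer = 1 };

            // Pin the preferred location to the owning GPU, then ask the driver
            // to maintain a mapping on the peer despite migrations.
            data.MemAdvise(data.DevicePointer, data.Size, CUmemoryAdvise.SetPreferredLocation, owner);
            data.MemAdvise(data.DevicePointer, data.Size, CUmemoryAdvise.SetAccessedBy, peer);
        }
    }
}
```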
- A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.
- Type: char3
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any
+ read-duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU
+ memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time, causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc
- In elements
- Width including alignment in bytes
- In elements
- Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags.
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(char3). Using cuMemHostAlloc without flags.
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor, as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
- In elements
- In elements
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(char3). Using cuMemHostAlloc.
+ Enumerator class for CudaManagedMemory_long
- For dispose
- Dispose
- For IDisposable
- Pointer to pinned host memory.
- Width in elements
- Height in elements
+ A variable located in managed memory.
+ Type: long1
- Pitch in bytes
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
- Size in bytes
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
- Type size in bytes
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel which module defines the variable. + The variable name as defined in the cu-file. - + - Access array per element. + Creates a new CudaManagedMemory from definition in cu-file. - X-index in elements - Y-index in elements - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy host to 2D Array + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy host to 2D Array + For dispose - - + - Synchron copy 2D Array to host + Dispose - - + - Synchron copy 2D Array to host + For IDisposable - + - + - Synchron copy host to device + UIntPtr to managed memory. - - + - Synchron copy host to device + CUdeviceptr to managed memory. - - + - Synchron copy device to host + Size in bytes - - + - Synchron copy device to host + Size in elements - - + - Synchron Copy host to pitched device + Access array per element. - - + index in elements + - + - Synchron Copy host to pitched device + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - + - Synchron copy device to host + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Synchron copy device to host + The on which a pointer was allocated or registered - - + - Asynchron copy host to 2D Array + The describing the physical location of a pointer - - - + - Asynchron copy host to 2D Array + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron copy 2D Array to host + The address at which a pointer's memory may be accessed on the host - - - + - Asynchron copy 2D Array to host + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron Copy host to device + Synchronize every synchronous memory operation initiated on this region - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Asynchron Copy host to device + Indicates if the pointer points to managed memory - - - + - Asynchron copy device to host + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. 
It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + - - + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of + - + - Asynchron Copy host to pitched device + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. - - - + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - Asynchron Copy host to pitched device + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. 
+ + Specifying::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type.Note when ::CUmemLocation::type is etiher + ::CU_MEM_LOCATION_TYPE_HOST OR ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + - - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. 
- + - Asynchron copy device to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. 
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Asynchron copy device to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. 
But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time, causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to the CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor, as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - Device Pointer + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + - Passes back the flags that were specified when allocating the pinned host buffer + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to the CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor, as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for - + - Enumerator class for CudaPageLockedHostMemory2D_char3 + Enumerator class for CudaManagedMemory_long1 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: char4 + A variable located in managed memory. + Type: long2 - + - Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaManagedMemory and allocates the memory on host/device. - In elements - Width including alignment in bytes - In elements - + In elements + - + - Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaManagedMemory from definition in cu-file. - In elements - Width including alignment in bytes - In elements + The module where the variable is defined in. + The variable name as defined in the cu-file.
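The ::CU_MEM_ADVISE_* semantics documented above are exposed through the MemAdvise members on the managed-memory wrappers. A hedged C# sketch follows; CudaManagedMemory_long2 is a type documented in this file, while the MemAdvise overload shape and the CUmemAdvise member spellings are assumptions based on these member docs, not a verified API.

```csharp
// Hedged sketch: mark a managed buffer as read-mostly and pre-map it on a
// peer GPU. Per the docs above, SetAccessedBy does not migrate data; it only
// keeps a mapping alive so peer accesses never fault.
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class MemAdviseExample
{
    public static void Run(CudaManagedMemory_long2 lut, CUdevice peerGpu)
    {
        // The buffer is a lookup table: written once, then read everywhere,
        // so read-duplication across processors is profitable.
        lut.MemAdvise(CUmemAdvise.SetReadMostly, peerGpu);   // enum name assumed

        // Keep a mapping on the peer GPU so its infrequent accesses do not
        // page-fault, even though the data itself stays where it is.
        lut.MemAdvise(CUmemAdvise.SetAccessedBy, peerGpu);   // enum name assumed
    }
}
```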
- Pitch is assumed to be width * sizeof(char4). Using cuMemHostAlloc without flags. + Creates a new CudaManagedMemory from definition in cu-file. - In elements - In elements + The kernel whose module defines the variable. + The variable name as defined in the cu-file. - + - Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char4). Using cuMemHostAlloc. + Creates a new CudaManagedMemory from definition in cu-file. - In elements - In elements - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - + - Pointer to pinned host memory. + UIntPtr to managed memory. - + - Width in elements + CUdeviceptr to managed memory. - + - Height in elements + Size in bytes - + - Pitch in bytes + Size in elements - + - Size in bytes + Access array per element. + index in elements + - + - Type size in bytes + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + - Access array per element. + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - X-index in elements - Y-index in elements - + managed variable + newly allocated host variable with value from managed memory - + - Synchron copy host to 2D Array + The on which a pointer was allocated or registered - - + - Synchron copy host to 2D Array + The describing the physical location of a pointer - - + - Synchron copy 2D Array to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - + - Synchron copy 2D Array to host + The address at which a pointer's memory may be accessed on the host - - + - Synchron copy host to device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - + - Synchron copy host to device + Synchronize every synchronous memory operation initiated on this region - - + - Synchron copy device to host + A process-wide unique ID for an allocated memory region - - + - Synchron copy device to host + Indicates if the pointer points to managed memory - - + - Synchron Copy host to pitched device + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream.
+ If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + - + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of + - + - Synchron Copy host to pitched device + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only set up on the dstDevice. + The application can exercise finer control on these mappings using ::cuMemAdvise. - + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - Synchron copy device to host + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+ + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to the CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior, however, + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If, however, the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + - - + Destination location to prefetch to + Flags for future use; must be zero now. + Stream to enqueue the prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices.
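The location-based overload documented above takes a ::CUmemLocation rather than a device ordinal, which is what enables host-NUMA-targeted prefetches. A hedged C# sketch follows; the CUmemLocation field names, the CUmemLocationType member spellings, and the three-parameter PrefetchAsync shape mirror the member docs in this file and are assumptions, not a verified API.

```csharp
// Hedged sketch: prefetch a managed buffer to the host NUMA node closest to
// the calling thread before a CPU-side processing phase.
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class PrefetchToHostExample
{
    public static void Run(CudaManagedMemory_float data, CudaStream stream)
    {
        var location = new CUmemLocation
        {
            // Per the docs above, id is ignored for HOST_NUMA_CURRENT.
            type = CUmemLocationType.HostNumaCurrent, // field/enum names assumed
            id = 0
        };

        // The flags argument must be zero for now, as stated above.
        data.PrefetchAsync(location, 0, stream.Stream);
        stream.Synchronize(); // pages are host-resident after this point
    }
}
```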
- + - Synchron copy device to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. 
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Asynchron copy host to 2D Array + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. 
But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time, causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to the CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor, as outlined in the description of + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + - Asynchron copy 2D Array to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to the CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
-
-
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
-
+
- Asynchron copy 2D Array to host
+ Enumerator class for CudaManagedMemory_long2
-
-
-
+
- Asynchron Copy host to device
+
-
-
+
-
+
- Asynchron copy device to host
+
-
-
-
+
- Asynchron Copy host to device
+
-
-
-
+
- Asynchron copy device to host
+
-
-
-
+
- Asynchron Copy host to pitched device
+
-
-
-
+
-
+
- Asynchron Copy host to pitched device
+ A variable located in managed memory.
+ Type: ulong
-
-
-
+
- Asynchron copy device to host
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
-
-
-
+ In elements
+
-
+
- Asynchron copy device to host
+ Creates a new CudaManagedMemory from definition in cu-file.
-
-
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
-
+
- Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag
+ Creates a new CudaManagedMemory from definition in cu-file.
- Device Pointer
+ The kernel which module defines the variable.
+ The variable name as defined in the cu-file.
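A minimal C# usage sketch of the managed-memory wrapper documented above. CudaContext, the CudaManagedMemory_ulong constructor ("In elements" plus attach flags) and its per-element indexer are members named in these docs; the MemAdvise overload, the CUmemoryAdvise enum spelling and the ctx.Device accessor are assumptions rather than confirmed wrapper signatures:

using System;
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class ManagedMemorySketch
{
    static void Main()
    {
        // Create a context on device 0, then allocate 1024 elements of unified memory.
        using (var ctx = new CudaContext(0))
        using (var data = new CudaManagedMemory_ulong(1024, CUmemAttachFlags.Global))
        {
            // Unified memory is host-accessible: the indexer documented above
            // ("Access array per element") lets the CPU fill the buffer directly.
            for (int i = 0; i < 1024; i++)
                data[i] = (ulong)i;

            // Hint that this range will mostly be read, per the
            // ::CU_MEM_ADVISE_SET_READ_MOSTLY semantics described above.
            // Method name, enum spelling and ctx.Device are assumptions.
            CUdevice dev = ctx.Device;
            data.MemAdvise(CUmemoryAdvise.SetReadMostly, dev);
        }
    }
}

Disposing the wrapper frees the allocation, matching the "If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing" remark documented below.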
- + - Passes back the flags that were specified when allocating the pinned host buffer + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Enumerator class for CudaPageLockedHostMemory2D_char4 + Creates a new CudaManagedMemory from definition in cu-file. + The library that defines the variable. + The variable name as defined in the cu-file. - + - + For dispose - - + - + Dispose - + - + For IDisposable + - + - + UIntPtr to managed memory. - + - + CUdeviceptr to managed memory. - - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short + Size in bytes - + - Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc + Size in elements - In elements - Width including alignment in bytes - In elements - - + - Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc without flags. + Access array per element. - In elements - In elements + index in elements + - + - Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc. + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - In elements - In elements - - + - For dispose + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + managed variable + newly allocated host variable with value from managed memory - + - Dispose + The on which a pointer was allocated or registered - + - For IDisposable + The describing the physical location of a pointer - - + - Pointer to pinned host memory. + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + - Width in elements + The address at which a pointer's memory may be accessed on the host - + - Height in elements + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + - Pitch in bytes + Synchronize every synchronous memory operation initiated on this region - + - Size in bytes + A process-wide unique ID for an allocated memory region - + - Type size in bytes + Indicates if the pointer points to managed memory - + - Access array per element. + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. 
+ + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + - X-index in elements - Y-index in elements + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of - + - Synchron copy host to 2D Array + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. - + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - Synchron copy host to 2D Array + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. 
\p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If, however, the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
-
+ Destination location to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
-
+
- Synchron copy 2D Array to host
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read
+ duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU
+ memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Synchron copy 2D Array to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. 
Normally, if a page is detected to be constantly thrashing between CPU and GPU
+ memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time, causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
-
+ Managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
-
+
- Synchron copy host to device
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
-
-
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
-
+
- Synchron copy host to device
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device
+ ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id is ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
-
-
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+
-
+
- Asynchron copy device to host
+ Enumerator class for CudaManagedMemory_ulong
-
-
+
- Synchron copy device to host
+
-
+
-
+
- Synchron Copy host to pitched device
+
-
-
-
+
- Synchron Copy host to pitched device
+
-
-
+
- Synchron copy device to host
+
-
-
-
+
- Synchron copy device to host
+
-
+
-
+
- Asynchron copy host to 2D Array
+ A variable located in managed memory.
+ Type: ulong1
-
-
+
- Asynchron copy host to 2D Array
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
-
-
+ In elements
+
-
+
- Asynchron copy 2D Array to host
+ Creates a new CudaManagedMemory from definition in cu-file.
-
-
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
-
+
- Asynchron copy 2D Array to host
+ Creates a new CudaManagedMemory from definition in cu-file.
-
-
+ The kernel which module defines the variable.
+ The variable name as defined in the cu-file.
-
+
- Asynchron Copy host to device
+ Creates a new CudaManagedMemory from definition in cu-file.
- - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Asynchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Asynchron Copy host to device + For dispose - - - + - Asynchron copy device to host + Dispose - - - + - Asynchron Copy host to pitched device + For IDisposable - - - + - + - Asynchron Copy host to pitched device + UIntPtr to managed memory. - - - + - Asynchron copy device to host + CUdeviceptr to managed memory. - - - - + - Asynchron copy device to host + Size in bytes - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Size in elements - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Access array per element. + index in elements - + - Enumerator class for CudaPageLockedHostMemory2D_short + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - + managed variable + newly allocated host variable with value from managed memory - + - + The on which a pointer was allocated or registered - + - + The describing the physical location of a pointer - + - + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + - + The address at which a pointer's memory may be accessed on the host - - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short1 + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + - Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc + Synchronize every synchronous memory operation initiated on this region - In elements - Width including alignment in bytes - In elements - - + - Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. + A process-wide unique ID for an allocated memory region - In elements - Width including alignment in bytes - In elements - + - Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc without flags. + Indicates if the pointer points to managed memory - In elements - In elements - + - Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc. - - In elements - In elements - - - - - For dispose + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. 
+ + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of + - + - Dispose + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. 
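A short C# sketch of the prefetch pattern recommended above. PrefetchAsync and its (destination device, stream) parameters follow the member documented here, and CudaStream.Stream exposes the underlying CUstream; the ctx.Device accessor is an assumption:

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class PrefetchSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        using (var buf = new CudaManagedMemory_ulong(1 << 20, CUmemAttachFlags.Global))
        {
            CUdevice dev = ctx.Device; // assumed accessor for the context's CUdevice

            // Enqueue migration of the whole range to device 0 before any kernel
            // touches it (wrapper over cuMemPrefetchAsync, per the remarks above).
            buf.PrefetchAsync(dev, stream.Stream);

            // ... launch kernels on 'stream' that consume 'buf' here ...

            // Prefetching is a performance hint only: accesses stay coherent even
            // while pages migrate, so correctness does not depend on this call.
            stream.Synchronize();
        }
    }
}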
-
-
For IDisposable
+
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior, however,
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If, however, the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+ + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Pointer to pinned host memory. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. 
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Width in elements + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. 
+ Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - Height in elements + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pitch in bytes + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Size in bytes + Enumerator class for CudaManagedMemory_ulong1 - + - Type size in bytes + + - + - Access array per element. + - X-index in elements - Y-index in elements - - + - Synchron copy host to 2D Array + - - + - Synchron copy host to 2D Array + - - + - Synchron copy 2D Array to host + - + - + - Synchron copy 2D Array to host + A variable located in managed memory. + Type: ulong2 - - + - Synchron copy host to device + Creates a new CudaManagedMemory and allocates the memory on host/device. - + In elements + - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel which module defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. 
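The CudaManagedMemory constructors above come in two flavors: allocating a fresh managed buffer of a given element count, and binding to a __managed__ variable defined in a cu-file. A rough driver-level sketch of both paths; the assumption that the "from cu-file" constructors resolve the symbol via ::cuModuleGetGlobal, and the module handle and variable name "g_state", are illustrative only:

```c
// Path 1 mirrors the size-in-elements constructor (cuMemAllocManaged); path 2
// mirrors the "from definition in cu-file" constructors, which presumably look
// up a __managed__ module variable. A ulong2 element is two 64-bit values.
#include <cuda.h>
#include <stdint.h>

void example(CUmodule mod)
{
    // Path 1: allocate 256 ulong2-sized elements as managed memory.
    CUdeviceptr buf;
    cuMemAllocManaged(&buf, 256 * 2 * sizeof(unsigned long long), CU_MEM_ATTACH_GLOBAL);

    // Managed memory is directly addressable from the host, which is what the
    // per-element indexer on the wrapper class relies on.
    unsigned long long *host_view = (unsigned long long *)(uintptr_t)buf;
    host_view[0] = 42;  // element access without an explicit memcpy

    // Path 2: resolve a variable declared in the cu-file as, hypothetically,
    //   __device__ __managed__ ulong2 g_state;
    CUdeviceptr var; size_t varBytes;
    cuModuleGetGlobal(&var, &varBytes, mod, "g_state");

    cuMemFree(buf);
}
```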
+ The variable name as defined in the cu-file. - + - Synchron Copy host to pitched device + Creates a new CudaManagedMemory from definition in cu-file. - - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron Copy host to pitched device + For dispose - - + - Synchron copy device to host + Dispose - - - + - Synchron copy device to host + For IDisposable - + - + - Asynchron copy host to 2D Array + UIntPtr to managed memory. - - - + - Asynchron copy host to 2D Array + CUdeviceptr to managed memory. - - - + - Asynchron copy 2D Array to host + Size in bytes - - - + - Asynchron copy 2D Array to host + Size in elements - - - + - Asynchron Copy host to device + Access array per element. - - + index in elements + - + - Asynchron copy device to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to pitched device + The describing the physical location of a pointer - - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron copy device to host + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Synchronize every synchronous memory operation initiated on this region - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + A process-wide unique ID for an allocated memory region - - + - Enumerator class for CudaPageLockedHostMemory2D_short1 + Indicates if the pointer points to managed memory - + - + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. 
An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
-
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of ::CUmemAttach_flags
+
-
+
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
-
-
-
-
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
-
-
-
-
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only setup on the dstDevice.
+ The application can exercise finer control on these mappings using ::cuMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
-
-
+
+
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU
+ specified by device ordinal ::CUmemLocation::id which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only setup on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
-
-
-
-
- A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.
- Type: short2
-
-
-
-
- Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc
- In elements
- Width including alignment in bytes
- In elements
-
-
-
- Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host.
Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc. - - In elements - In elements - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. 
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
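As an illustration of the ::CU_MEM_ADVISE_SET_ACCESSED_BY scenario sketched in the remarks above (a peer GPU that occasionally reads data homed on another device), here is a short driver-API fragment; the two device ordinals are assumed:

```c
// Data lives on device 0, but device 1 occasionally reads it. SET_ACCESSED_BY
// keeps a mapping in device 1's page tables so those reads do not fault,
// without migrating the pages themselves.
#include <cuda.h>

void advise_accessed_by(CUdeviceptr managed, size_t bytes)
{
    CUdevice dev0, dev1;
    cuDeviceGet(&dev0, 0);
    cuDeviceGet(&dev1, 1);

    // Keep the pages resident near device 0...
    cuMemAdvise(managed, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev0);
    // ...but maintain a mapping for device 1 to avoid faults on its reads.
    cuMemAdvise(managed, bytes, CU_MEM_ADVISE_SET_ACCESSED_BY, dev1);
}
```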
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. 
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pointer to pinned host memory. 
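For orientation, the two MemAdvise overloads documented above translate into C# roughly as follows. This is a minimal sketch, not the verified ManagedCuda surface: the MemAdvise(CUdeviceptr, SizeT, CUmemAdvise, CUmemLocation) shape and the CUmemAdvise/CUmemLocation/CUmemLocationType member names are assumptions read off the doc comments in this diff, so verify them against the ManagedCuda version you build against.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    // Sketch only: the MemAdvise/CUmemAdvise/CUmemLocation names below are
    // assumptions taken from the doc comments above, not a checked API.
    var ctx = new CudaContext(0);
    var buf = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

    // Read-mostly advice: per the docs, the location argument is ignored here.
    buf.MemAdvise(buf.DevicePointer, buf.SizeInBytes, CUmemAdvise.SetReadMostly, new CUmemLocation());

    // Prefer keeping the pages on device 0 (type = Device, id = device ordinal).
    var loc = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
    buf.MemAdvise(buf.DevicePointer, buf.SizeInBytes, CUmemAdvise.SetPreferredLocation, loc);

    buf.Dispose();
    ctx.Dispose();

Note that, as the doc text stresses, neither call migrates pages by itself; both only steer the driver's behavior on later faults and prefetches.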
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + - + - Width in elements + Enumerator class for CudaManagedMemory_ulong2 - + - Height in elements + + - + - Pitch in bytes - + - Size in bytes - + - Type size in bytes - + - Access array per element. - X-index in elements - Y-index in elements - - - Synchron copy host to 2D Array - - - + - Synchron copy host to 2D Array + A variable located in managed memory. + Type: float - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory and allocates the memory on host/device. - + In elements + - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel which module defines the variable.
+ The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + For dispose - - + - Synchron Copy host to pitched device + Dispose - - - + - Synchron Copy host to pitched device + For IDisposable - + - + - Synchron copy device to host + UIntPtr to managed memory. - - - + - Synchron copy device to host + CUdeviceptr to managed memory. - - + - Asynchron copy host to 2D Array + Size in bytes - - - + - Asynchron copy host to 2D Array + Size in elements - - - + - Asynchron copy 2D Array to host + Access array per element. - - + index in elements + - + - Asynchron copy 2D Array to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to device + The describing the physical location of a pointer - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron Copy host to pitched device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Indicates if the pointer points to managed memory - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always . Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of - - Enumerator class for CudaPageLockedHostMemory2D_short2 - - + + Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued. - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - +
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cudaMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices. - - + + + Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream. - - - -
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated. - - - - - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short3 - - - - Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc - - In elements - Width including alignment in bytes - In elements - - - - - Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host.
Using cuMemHostAlloc without flags. - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc. - - In elements - In elements - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - For dispose
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read-duplicated
+ copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, say, CPU and GPU
+ memory, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Device to apply the advice for - + - Dispose
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read-duplicated
+ copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between, say, CPU and GPU
+ memory, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + - Pointer to pinned host memory.
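The prefetch entries documented in this file pair naturally with this advice. Below is a hedged usage sketch: the PrefetchAsync(CUdevice, CudaStream) shape is inferred from the "Destination device to prefetch to / Stream to enqueue prefetch operation" parameters above, constructing a CUdevice from an ordinal is assumed, and the kernel launch is a placeholder rather than a real API call.

    // Hedged sketch; verify PrefetchAsync and the CUdevice constructor
    // against your ManagedCuda version.
    var stream = new CudaStream();
    var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

    for (int i = 0; i < (int)data.Size; i++)
        data[i] = i;                               // populate on the host

    // Migrate pages to device 0 before launching work so the first GPU
    // access does not page-fault; work queued into the same stream is
    // ordered after the prefetch.
    data.PrefetchAsync(new CUdevice(0), stream);   // CUdevice-from-ordinal is an assumption
    // myKernel.RunAsync(stream.Stream, data.DevicePointer, (int)data.Size);  // hypothetical kernel
    stream.Synchronize();                          // host access is coherent again here

As the documentation stresses, the prefetch is purely a performance hint: accesses remain coherent even while pages are migrating.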
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description of
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + - + - Width in elements + Enumerator class for CudaManagedMemory_float - + - Height in elements + + - + - Pitch in bytes - + - Size in bytes - + - Type size in bytes - + - Access array per element. - X-index in elements - Y-index in elements - - - Synchron copy host to 2D Array - - - + - Synchron copy host to 2D Array + A variable located in managed memory. + Type: float1 - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory and allocates the memory on host/device. - + In elements + - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel which module defines the variable.
+ The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + For dispose - - + - Synchron Copy host to pitched device + Dispose - - - + - Synchron Copy host to pitched device + For IDisposable - + - + - Synchron copy device to host + UIntPtr to managed memory. - - - + - Synchron copy device to host + CUdeviceptr to managed memory. - - + - Asynchron copy host to 2D Array + Size in bytes - - - + - Asynchron copy host to 2D Array + Size in elements - - - + - Asynchron copy 2D Array to host + Access array per element. - - + index in elements + - + - Asynchron copy 2D Array to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to device + The describing the physical location of a pointer - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron Copy host to pitched device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Indicates if the pointer points to managed memory - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of - - - Enumerator class for CudaPageLockedHostMemory2D_short3 - - - + + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. - - - - - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. 
+ + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. - - - - - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short4 - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices.
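Read together, the prefetch overloads documented above describe a simple pattern: move a managed range to the processor that is about to use it, then move it back before host-side reads. The following C# fragment is a minimal, hypothetical sketch of that pattern against managedCuda; the PrefetchAsync overload (destination device plus stream), the CUmemAttach_flags spelling, and the CUdevice construction for CU_DEVICE_CPU are assumptions inferred from these doc comments, not verified bindings.

    // Hypothetical sketch only: overload and enum names below are inferred
    // from the doc comments above and may differ from the actual binding.
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static void PrefetchRoundTrip()
    {
        CudaContext ctx = new CudaContext(0);     // device 0
        CudaStream stream = new CudaStream();

        // Managed (unified) allocation, visible to both host and device.
        CudaManagedMemory_float data =
            new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);

        // Migrate the range onto the GPU before kernels touch it, so the
        // first device access does not page-fault.
        data.PrefetchAsync(new CUdevice(0), stream.Stream);

        // ... enqueue kernels on 'stream' that read/write 'data' ...

        // CU_DEVICE_CPU (-1) prefetches the range back to host memory.
        data.PrefetchAsync(new CUdevice(-1), stream.Stream);
        stream.Synchronize();                     // now safe to read on the CPU
    }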
- + - Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc - - In elements - Width including alignment in bytes - In elements - - - - - Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short4). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short4). Using cuMemHostAlloc. - - In elements - In elements - - - - - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. 
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST, and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + -
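Taken together, ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION and ::CU_MEM_ADVISE_SET_ACCESSED_BY let an application pin a managed range to one processor while pre-mapping it for occasional peers, which is exactly the multi-GPU scenario described above. The C# sketch below is hypothetical: the Advise overload and the CUmemLocation/CUmemAdvise spellings are modelled on the driver-API identifiers in the doc comment and may not match the actual managedCuda surface.

    // Hypothetical sketch: keep a managed range resident on GPU 0 while
    // pre-mapping it into GPU 1's page tables so occasional peer reads do
    // not fault. Names mirror the driver identifiers documented above.
    static void AdvisePeerMapping(CudaManagedMemory_float data)
    {
        CUmemLocation gpu0 = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
        CUmemLocation gpu1 = new CUmemLocation { type = CUmemLocationType.Device, id = 1 };

        // Faults on this range should resolve by mapping, not migration.
        data.Advise(CUmemAdvise.SetPreferredLocation, gpu0);

        // Establish GPU 1's mapping up front; infrequent peer accesses then
        // go over the peer link instead of triggering page migration.
        data.Advise(CUmemAdvise.SetAccessedBy, gpu1);
    }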
+ Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. 
When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST, and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established.
If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Width in elements + Enumerator class for CudaManagedMemory_float1 - + - Height in elements + + - + - Pitch in bytes + - + - Size in bytes + - + - Type size in bytes + - + - Access array per element. + - X-index in elements - Y-index in elements - - - Synchron copy host to 2D Array - - - - + - Synchron copy host to 2D Array + A variable located in managed memory. + Type: float2 - - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory and allocates the memory on host/device. - + In elements + - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel which module defines the variable. 
+ The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + For dispose - - + - Synchron Copy host to pitched device + Dispose - - - + - Synchron Copy host to pitched device + For IDisposable - + - + - Synchron copy device to host + UIntPtr to managed memory. - - - + - Synchron copy device to host + CUdeviceptr to managed memory. - - + - Asynchron copy host to 2D Array + Size in bytes - - - + - Asynchron copy host to 2D Array + Size in elements - - - + - Asynchron copy 2D Array to host + Access array per element. - - + index in elements + - + - Asynchron copy 2D Array to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to device + The describing the physical location of a pointer - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron Copy host to pitched device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Indicates if the pointer points to managed memory - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of - - - Enumerator class for CudaPageLockedHostMemory2D_short4 - - - + + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. - - - - - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. 
+ + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. - - - - - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ushort - - - - Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host. Using cuMemHostAlloc - In elements - Width including alignment in bytes - In elements - - - - - Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host.
Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ushort). Using cuMemHostAlloc without flags. - - In elements - In elements + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ushort). Using cuMemHostAlloc. - - In elements - In elements - - - - - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. 
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. 
If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For IDisposable + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pointer to pinned host memory.
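For orientation, a minimal C# sketch of the CUmemLocation-based advise overload documented above. The identifiers used here (CudaManagedMemory_float, CUmemAttach_flags, CUmemLocation, CUmemLocationType, CUmemAdvise, MemAdvise, DevicePointer, SizeInBytes) are inferred from the summaries and parameter lists in these comments and should be treated as assumptions, not as the verified ManagedCuda surface:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    internal static class MemAdviseLocationSketch
    {
        private static void Main()
        {
            using var ctx = new CudaContext(0);             // context on device ordinal 0
            var buf = new CudaManagedMemory_float(1 << 20,  // 1M floats of unified memory
                                                  CUmemAttach_flags.Global);

            // Assumed CUmemLocation shape: a location type plus a device ordinal / NUMA id.
            var loc = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };

            // Guides future fault-time migration and eviction for the whole range;
            // it does not copy any pages by itself.
            buf.MemAdvise(buf.DevicePointer, buf.SizeInBytes,
                          CUmemAdvise.SetPreferredLocation, loc);

            buf.Dispose();
        }
    }

Preferring a location is purely a policy hint, so issuing it early is cheap; pair it with a prefetch when the data must actually be resident before first access.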
+ Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location.
When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established.
If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Width in elements + Enumerator class for CudaManagedMemory_float2 - + - Height in elements + + - + - Pitch in bytes + - + - Size in bytes + - + - Type size in bytes + - + - Access array per element. + - X-index in elements - Y-index in elements - - - Synchron copy host to 2D Array - - - - + - Synchron copy host to 2D Array + A variable located in managed memory. + Type: float3 - - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory and allocates the memory on host/device. - + In elements + - + - Synchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The kernel which module defines the variable.
+ The variable name as defined in the cu-file. - + - Synchron copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Synchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file. - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Synchron copy device to host + For dispose - - + - Synchron Copy host to pitched device + Dispose - - - + - Synchron Copy host to pitched device + For IDisposable - + - + - Synchron copy device to host + UIntPtr to managed memory. - - - + - Synchron copy device to host + CUdeviceptr to managed memory. - - + - Asynchron copy host to 2D Array + Size in bytes - - - + - Asynchron copy host to 2D Array + Size in elements - - - + - Asynchron copy 2D Array to host + Access array per element. - - + index in elements + - + - Asynchron copy 2D Array to host + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron Copy host to device + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Asynchron Copy host to device + The describing the physical location of a pointer - - - + - Asynchron copy device to host + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - - + - Asynchron Copy host to pitched device + The address at which a pointer's memory may be accessed on the host - - - - + - Asynchron Copy host to pitched device + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - - + - Asynchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - - - + - Asynchron copy device to host + A process-wide unique ID for an allocated memory region - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Indicates if the pointer points to managed memory - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of - - - Enumerator class for CudaPageLockedHostMemory2D_ushort - - - + + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. - - - - - + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. 
+ + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU + specified by device ordinal ::CUmemLocation::id which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. - - - - + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. - - - - - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ushort1 - - - - Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host. Using cuMemHostAlloc - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host.
Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements - - - - Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ushort1). Using cuMemHostAlloc without flags. - - In elements - In elements - - - - Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ushort1). Using cuMemHostAlloc. - - In elements - In elements - - - - - For dispose - - - - - Dispose - + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - For IDisposable - - - - - - Pointer to pinned host memory. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. 
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Width in elements + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. 
Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
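As a rough illustration of the advise-then-prefetch pattern these comments describe: the sketch below marks a managed buffer read-mostly and prefetches it to the GPU before use. Method and type names (MemAdvise, PrefetchAsync, CUmemAdvise, CUdevice, CUmemAttach_flags) follow the summaries above, but the exact ManagedCuda signatures are assumptions:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    internal static class AdviseAndPrefetchSketch
    {
        private static void Main()
        {
            using var ctx = new CudaContext(0);
            using var stream = new CudaStream();
            var data = new CudaManagedMemory_float(4096, CUmemAttach_flags.Global);

            var device = new CUdevice();   // illustrative: device ordinal 0

            // Mostly-read data: allow read-only replicas on each accessing processor.
            // The device argument is ignored for SetReadMostly, as documented above.
            data.MemAdvise(data.DevicePointer, data.SizeInBytes,
                           CUmemAdvise.SetReadMostly, device);

            // Migrate the range to the GPU ahead of the kernels that will read it.
            data.PrefetchAsync(device, stream.Stream);

            stream.Synchronize();          // the prefetch is asynchronous with respect to the host
            data.Dispose();
        }
    }

The advice changes replication policy only; the explicit prefetch is what actually populates device memory before the first fault.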
+ managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices.
+ Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Pitch in bytes + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location.Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. 
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory.To set the preferred location + to a specific host NUMA node, applications must set::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node.If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If::CUmemLocation::type is a::CU_MEM_LOCATION_TYPE_DEVICE, then::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately.Instead, it guides the migration policy + when a fault occurs on that memory region.If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver.Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver.But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that procesor as outlined in description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero alue for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. 
  Instead, the data is always mapped in the specified processor's page tables, as long as the location of the data permits a mapping to be established; if the data gets migrated for any reason, the mappings are updated accordingly. This advice is recommended in scenarios where data locality is not important, but avoiding faults is. Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where data located on one GPU is occasionally accessed by peer GPUs: migrating data over to the other GPUs is not as important because the accesses are infrequent and the overhead of migration may be too high, but preventing faults can still help improve performance, so having a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated to host memory because the CPU typically cannot access device memory directly; any GPU that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for this data then has its mapping updated to point to the page in host memory.
  If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the policies of that advice override the policies of this advice. Additionally, if the preferred location of this memory region or any subset of it is also \p location, the policies of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION override this advice. If the memory region refers to valid system-allocated pageable memory and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS; additionally, if it has a non-zero value for ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this call has no effect.
- ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults. If the memory region refers to valid system-allocated pageable memory and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS; additionally, if it has a non-zero value for ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this call has no effect.

Note that this function is asynchronous with respect to the host and all work on other devices.
Pointer to memory to set the advice for
Advice to be applied for the specified memory range
Location to apply the advice for

Enumerator class for CudaManagedMemory_float3

A variable located in managed memory.
Type: float4

Creates a new CudaManagedMemory and allocates the memory on host/device.
In elements

Creates a new CudaManagedMemory from definition in cu-file.
The module where the variable is defined in.
The variable name as defined in the cu-file.

Creates a new CudaManagedMemory from definition in cu-file.
The kernel which module defines the variable.
The variable name as defined in the cu-file.

Creates a new CudaManagedMemory from definition in cu-file.
The library where the variable is defined in.
The variable name as defined in the cu-file.

Creates a new CudaManagedMemory from definition in cu-file.
The library that defines the variable.
The variable name as defined in the cu-file.

For dispose

Dispose

For IDisposable

UIntPtr to managed memory.

CUdeviceptr to managed memory.

Size in bytes

Size in elements

Access array per element.
Index in elements

If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.

Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
Managed variable
Newly allocated host variable with value from managed memory

The context on which a pointer was allocated or registered

The memory type describing the physical location of a pointer

The address at which a pointer's memory may be accessed on the device. Except in the exceptional disjoint-addressing cases, the value returned will equal the input value.

The address at which a pointer's memory may be accessed on the host

A pair of tokens for use with the nv-p2p.h Linux kernel interface

Synchronize every synchronous memory operation initiated on this region

A process-wide unique ID for an allocated memory region

Indicates if the pointer points to managed memory

Attach memory to a stream asynchronously.
Enqueues an operation in hStream to specify stream association of length bytes of memory starting from dptr. This is a stream-ordered operation: it is dependent on, and only takes effect when, previous work in the stream has completed, and any previous association is automatically replaced.
dptr must point to an address within managed memory space declared using the __managed__ keyword or allocated with cuMemAllocManaged. length must be zero, to indicate that the entire allocation's stream association is being changed; currently, it is not possible to change the stream association of a portion of an allocation.
The stream association is specified using flags, which must be one of the CUmemAttach_flags values. If the Global flag is specified, the memory can be accessed by any stream on any device.
If the Host flag is specified, the program guarantees that it will not access the memory on the device from any stream. If the Single flag is specified, the program guarantees that it will only access the memory on the device from hStream. It is illegal to attach singly to the NULL stream, because the NULL stream is a virtual global stream and not a specific stream; an error is returned in this case.
When memory is associated with a single stream, the Unified Memory system allows CPU access to this memory region as soon as all operations in hStream have completed, regardless of whether other streams are active. In effect, this constrains exclusive ownership of the managed memory region by an active GPU to per-stream activity instead of whole-GPU activity.
Accessing memory on the device from streams that are not associated with it produces undefined results. No error checking is performed by the Unified Memory system to ensure that kernels launched into other streams do not access this region.
It is a program's responsibility to order calls to this function via events, synchronization or other means to ensure legal access to the memory at all times. Data visibility and coherency are changed appropriately for all kernels which follow a stream-association change.
If hStream is destroyed while data is associated with it, the association is removed and reverts to the default visibility of the allocation as specified at cuMemAllocManaged. For __managed__ variables, the default association is always Global. Note that destroying a stream is an asynchronous operation, and as a result, the change to the default association does not happen until all work in the stream has completed.
Stream in which to enqueue the attach operation
Length of memory (must be zero)
Must be one of the CUmemAttach_flags values

Prefetches memory to the specified destination device.
devPtr is the base device pointer of the memory to be prefetched, dstDevice is the destination device, count specifies the number of bytes to copy, and hStream is the stream in which the operation is enqueued. Passing in CU_DEVICE_CPU for dstDevice prefetches the data to CPU memory.
If no physical memory has been allocated for this region, the memory region is populated and mapped on the destination device. If there is insufficient memory to prefetch the desired region, the Unified Memory driver may evict pages belonging to other memory regions to make room; if no memory can be evicted, the driver prefetches less than what was requested.
In the normal case, any mappings to the previous location of the migrated pages are removed, and mappings for the new location are only set up on dstDevice. The application can exercise finer control over these mappings using ::cudaMemAdvise.
Destination device to prefetch to
Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work on other devices.

Prefetches memory to the specified destination location.
\p devPtr is the base device pointer of the memory to be prefetched, \p location specifies the destination location, and \p count specifies the number of bytes to copy.
\p hStream is the stream in which the operation is enqueued. The memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type prefetches memory to the GPU specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device that has a non-zero value for that attribute.
Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type prefetches data to host memory. Applications can request prefetching to a specific host NUMA node by specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id, or to the host NUMA node closest to the current thread's CPU by specifying ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT. Note that when ::CUmemLocation::type is either ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id is ignored.
The start and end addresses of the memory range are rounded down and up, respectively, to CPU page-size alignment before the prefetch operation is enqueued in the stream.
If no physical memory has been allocated for this region, the memory region is populated and mapped on the destination device. If there is insufficient memory to prefetch the desired region, the Unified Memory driver may evict pages from other ::cuMemAllocManaged allocations to host memory in order to make room; device memory allocated using ::cuMemAlloc or ::cuArrayCreate is not evicted.
By default, any mappings to the previous location of the migrated pages are removed, and mappings for the new location are only set up on the destination location. The exact behavior also depends on the settings applied to this memory range via ::cuMemAdvise, as described below:
- If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, that subset creates a read-only copy of the pages on the destination location. If, however, the destination location is a host NUMA node, any pages of that subset already in another host NUMA node are transferred to the destination.
- If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory range, the pages are migrated to \p location even if \p location is not the preferred location of any pages in the range.
- If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, mappings to those pages from all appropriate processors are updated to refer to the new location if such a mapping can be established; otherwise those mappings are cleared.
Note that this API is not required for functionality; it only serves to improve performance by allowing the application to migrate data to a suitable location before it is accessed. Memory accesses to this range are always coherent and are allowed even while the data is actively being migrated.
Destination location to prefetch to
Flags for future use, must be zero now.
Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work on other devices.
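To make the prefetch members above concrete, here is a minimal C# sketch of the intended host-side pattern. It assumes ManagedCuda's CudaManagedMemory_float wrapper, its PrefetchAsync member and element indexer as documented above, and a CudaStream; the exact type and member spellings are assumptions and may vary between library versions.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

// Hedged sketch: populate a managed buffer on the host, then prefetch it to
// the GPU so the first kernel access does not pay demand-paging faults.
static class PrefetchDemo
{
    public static void Run()
    {
        using (var ctx = new CudaContext(0))                 // device 0
        using (var stream = new CudaStream())
        using (var data = new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global))
        {
            // Managed memory is host-accessible: initialize it on the CPU.
            for (int i = 0; i < (int)data.Size; i++)         // Size in elements (assumed name)
                data[i] = i;

            // Migrate the whole range to device 0 ahead of first GPU access
            // (CUdevice construction from an ordinal is an assumption here).
            data.PrefetchAsync(new CUdevice(0), stream.Stream);

            // ... launch kernels on `stream` that read `data` here ...

            stream.Synchronize();                            // wait for prefetch + kernels
        }
    }
}
```

As the documentation notes, the prefetch is purely a performance hint: accesses stay coherent even while pages are in flight, so omitting it only costs fault-driven migration on first touch.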
Advise about the usage of a given memory range.
Advises the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.

The \p advice parameter can take the following values:
- ::CU_MEM_ADVISE_SET_READ_MOSTLY: The data is mostly going to be read from and only occasionally written to. This allows the driver to create read-only copies of the data in a processor's memory when that processor accesses it. Similarly, if cuMemPrefetchAsync is called on this region, a read-only copy of the data is created on the destination processor. When a processor writes to this data, all copies of the corresponding page are invalidated except for the one where the write occurred. The \p device argument is ignored for this advice.
- ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read-duplicated copies of the data are freed no later than the next write access to that data.
- ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: Sets the preferred location for the data to the memory belonging to \p device; passing in CU_DEVICE_CPU for \p device sets the preferred location to CPU memory. Setting the preferred location does not cause data to migrate to that location immediately; it guides the migration policy when a fault occurs on that memory region. If the data is already in its preferred location and the faulting processor can establish a mapping without requiring the data to be migrated, migration is avoided. Otherwise, the data is migrated to the processor accessing it. It is important to note that setting the preferred location does not prevent data prefetching done using ::cuMemPrefetchAsync.
  Having a preferred location can override the thrash detection and resolution logic in the Unified Memory driver: normally, a page detected to be constantly thrashing between, say, CPU and GPU memory is eventually pinned to CPU memory, but if the preferred location is GPU memory, the page will continue to thrash indefinitely. When the Unified Memory driver has to evict pages from a location because its memory is oversubscribed, the preferred location is used to decide the destination of the eviction. If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred location is ignored for that subset.
- ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION and changes the preferred location to none.
- ::CU_MEM_ADVISE_SET_ACCESSED_BY: The data will be accessed by \p device. This does not cause data migration and has no impact on the location of the data per se. Instead, the data is always mapped in the specified processor's page tables, as long as the location of the data permits a mapping to be established; if the data gets migrated for any reason, the mappings are updated accordingly.
  This advice is useful in scenarios where data locality is not important, but avoiding faults is.
  Consider, for example, a system containing multiple GPUs with peer-to-peer access enabled, where data located on one GPU is occasionally accessed by other GPUs: migrating data over to the other GPUs is not as important because the accesses are infrequent and the overhead of migration may be too high, but preventing faults can still help improve performance, so having a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated to CPU memory because the CPU typically cannot access GPU memory directly; any GPU that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for this data then has its mapping updated to point to the page in CPU memory.
- ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of mappings may be removed at any time, causing accesses to result in page faults.

Passing in ::CU_DEVICE_CPU for \p device sets the advice for the CPU.

Note that this function is asynchronous with respect to the host and all work on other devices.
Pointer to memory to set the advice for
Size in bytes of the memory range
Advice to be applied for the specified memory range
Device to apply the advice for

Advise about the usage of a given memory range.
Advises the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. The \p advice parameter takes the same values, with the same semantics, as described for the overload above. Passing in ::CU_DEVICE_CPU for \p device sets the advice for the CPU.
Note that this function is asynchronous with respect to the host and all work on other devices.
Managed memory variable
Advice to be applied for the specified memory range
Device to apply the advice for

Advise about the usage of a given memory range.
Advises the Unified Memory subsystem about the usage pattern for the memory range starting at \p devPtr with a size of \p count bytes. The start and end addresses of the range are rounded down and up, respectively, to CPU page-size alignment before the advice is applied. The range must refer to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables, or to system-allocated pageable memory that represents a valid, host-accessible region and satisfies all additional constraints imposed by \p advice; specifying an invalid system-allocated pageable memory range results in an error being returned. The \p advice parameter takes the same values, with the same semantics, as described for the first ::cuMemAdvise_v2 overload above.
Note that this function is asynchronous with respect to the host and all work on other devices.
Pointer to memory to set the advice for
Advice to be applied for the specified memory range
Location to apply the advice for
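A short C# sketch may help make the advice flags tangible. It assumes ManagedCuda's CudaManagedMemory_float wrapper, an instance MemAdvise member matching the overloads documented above, and PascalCase CUmemAdvise enum members mirroring the CU_MEM_ADVISE_* values; all of these spellings are assumptions, not confirmed API.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

// Hedged sketch: a lookup table that many processors read but rarely write.
static class AdviseDemo
{
    public static void Run()
    {
        using (var ctx = new CudaContext(0))
        using (var table = new CudaManagedMemory_float(4096, CUmemAttach_flags.Global))
        {
            var gpu0 = new CUdevice(0);     // assumed CUdevice construction

            // Read-mostly: every reading processor gets its own read-only copy;
            // a write invalidates all copies except the writer's.
            table.MemAdvise(CUmemAdvise.SetReadMostly, gpu0);

            // Keep the canonical pages on GPU 0 when a fault forces a decision.
            table.MemAdvise(CUmemAdvise.SetPreferredLocation, gpu0);

            // ... fill `table` on the host, then launch kernels that read it ...

            // Undo the read-mostly advice before a write-heavy phase, so the
            // driver stops read-duplicating pages that are about to change.
            table.MemAdvise(CUmemAdvise.UnsetReadMostly, gpu0);
        }
    }
}
```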
Advise about the usage of a given memory range.
Advises the Unified Memory subsystem about the usage pattern for the memory range starting at \p devPtr with a size of \p count bytes. The start and end addresses of the range are rounded down and up, respectively, to CPU page-size alignment before the advice is applied. The range must refer to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables, or to system-allocated pageable memory that represents a valid, host-accessible region and satisfies all additional constraints imposed by \p advice; specifying an invalid system-allocated pageable memory range results in an error being returned. The \p advice parameter takes the same values, with the same semantics, as described for the first ::cuMemAdvise_v2 overload above.
Note that this function is asynchronous with respect to the host and all work on other devices.
Pointer to memory to set the advice for
Advice to be applied for the specified memory range
Location to apply the advice for

Enumerator class for CudaManagedMemory_float4

A variable located in managed memory.
Type: double

Creates a new CudaManagedMemory and allocates the memory on host/device.
In elements

Creates a new CudaManagedMemory from definition in cu-file.
The module where the variable is defined in.
The variable name as defined in the cu-file.

Creates a new CudaManagedMemory from definition in cu-file.
The kernel which module defines the variable.
The variable name as defined in the cu-file.

Creates a new CudaManagedMemory from definition in cu-file.
The library where the variable is defined in.
The variable name as defined in the cu-file.

Creates a new CudaManagedMemory from definition in cu-file.
The library that defines the variable.
The variable name as defined in the cu-file.

For dispose

Dispose

For IDisposable

UIntPtr to managed memory.

CUdeviceptr to managed memory.

Size in bytes

Size in elements

Access array per element.
Index in elements

If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.

Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
Managed variable
Newly allocated host variable with value from managed memory

The context on which a pointer was allocated or registered

The memory type describing the physical location of a pointer

The address at which a pointer's memory may be accessed on the device. Except in the exceptional disjoint-addressing cases, the value returned will equal the input value.

The address at which a pointer's memory may be accessed on the host

A pair of tokens for use with the nv-p2p.h Linux kernel interface

Synchronize every synchronous memory operation initiated on this region

A process-wide unique ID for an allocated memory region

Indicates if the pointer points to managed memory

Attach memory to a stream asynchronously.
Behaves as described for the attach operation above: it enqueues an operation in hStream to associate the entire allocation (length must be zero) with that stream, replacing any previous association, subject to the same flag semantics, visibility rules, and ordering responsibilities.
Stream in which to enqueue the attach operation
Length of memory (must be zero)
Must be one of the CUmemAttach_flags values

Prefetches memory to the specified destination device.
Behaves as described for the prefetch operation above: devPtr is the base device pointer, dstDevice the destination device (CU_DEVICE_CPU prefetches to CPU memory), count the number of bytes to copy, and hStream the stream in which the operation is enqueued.
Destination device to prefetch to
Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work on other devices.

Prefetches memory to the specified destination location.
Behaves as described for the corresponding prefetch overload above: \p location specifies the destination, \p flags must be zero, and the same alignment, eviction, and ::cuMemAdvise interactions apply.
Destination location to prefetch to
Flags for future use, must be zero now.
Stream to enqueue prefetch operation
Note that this function is asynchronous with respect to the host and all work on other devices.
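Since the stream-attach semantics above recur for every managed type, one hedged C# sketch suffices. It assumes ManagedCuda's CudaManagedMemory_double wrapper and the StreamAttachMemAsync member with the (stream, length, flags) parameters documented above; the spellings are assumptions.

```csharp
using ManagedCuda;
using ManagedCuda.BasicTypes;

// Hedged sketch: constrain a managed allocation to one stream so the CPU can
// touch it while other streams stay busy (see the attach notes above).
static class AttachDemo
{
    public static void Run()
    {
        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        using (var buf = new CudaManagedMemory_double(256, CUmemAttach_flags.Global))
        {
            // Associate the entire allocation with `stream`; the length
            // argument must be zero (whole-allocation association only).
            buf.StreamAttachMemAsync(stream.Stream, 0, CUmemAttach_flags.Single);
            stream.Synchronize();   // the attach itself is stream-ordered

            // CPU access is now legal whenever `stream` is idle, even if
            // other streams on the GPU are still running.
            buf[0] = 42.0;
        }
    }
}
```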
Using cuMemHostAlloc without flags. - - In elements - Width including alignment in bytes - In elements + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ushort3). Using cuMemHostAlloc without flags. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. 
- Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(ushort3). Using cuMemHostAlloc without flags.
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read from and only occasionally written to. This allows the driver to create read-only copies of the data in a processor's memory when that processor accesses it. Similarly, if cuMemPrefetchAsync is called on this region, it will create a read-only copy of the data on the destination processor. When a processor writes to this data, all copies of the corresponding page are invalidated except for the one where the write occurred. The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read-duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the preferred location as CPU memory. Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy when a fault occurs on that memory region. If the data is already in its preferred location and the faulting processor can establish a mapping without requiring the data to be migrated, then the migration will be avoided. On the other hand, if the data is not in its preferred location or if a direct mapping cannot be established, then it will be migrated to the processor accessing it. It is important to note that setting the preferred location does not prevent data prefetching done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. When the Unified Memory driver has to evict pages from a certain location on account of that memory being oversubscribed, the preferred location will be used to decide the destination to which a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. This does not cause data migration and has no impact on the location of the data per se. Instead, it causes the data to always be mapped in the specified processor's page tables, as long as the location of the data permits a mapping to be established. If the data gets migrated for any reason, the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is. Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data over to the other GPUs is not as important because the accesses are infrequent and the overhead of migration may be too high. But preventing faults can still help improve performance, and so having a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of mappings may be removed at any time, causing accesses to result in page faults.
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+ Note that this function is asynchronous with respect to the host and all work on other devices.
- In elements
- In elements
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(ushort3). Using cuMemHostAlloc.
- In elements
- In elements
-
- For dispose
-
- Dispose
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the given managed variable. Remarks are identical to the pointer-based MemAdvise overload above.
+ managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
-
- For IDisposable
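As a concrete illustration of the advice flags just described, the following hedged C# sketch marks a managed buffer read-mostly and gives it a preferred device location. The MemAdvise overload shape, the SizeInBytes property, and the CUmemoryAdvise member names are assumed from the parameter lists above, not confirmed against the library.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class AdviseSketch
    {
        // 'device' is the CUdevice the advice should apply to.
        public static void Apply(CudaManagedMemory_double data, CUdevice device)
        {
            // Read-mostly: each accessing processor may keep a read-only copy;
            // the device argument is ignored for this particular advice.
            data.MemAdvise(data.DevicePointer, data.SizeInBytes,
                           CUmemoryAdvise.SetReadMostly, device);

            // Preferred location: guides migration on fault, does not move data now.
            data.MemAdvise(data.DevicePointer, data.SizeInBytes,
                           CUmemoryAdvise.SetPreferredLocation, device);
        }
    }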
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory range will be rounded down and rounded up respectively to be aligned to the CPU page size before the advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. The memory range could also refer to system-allocated pageable memory provided it represents a valid, host-accessible region of memory and all additional constraints imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable memory range results in an error being returned.
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read from and only occasionally written to. Any read accesses from any processor to this region will create a read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on another host NUMA node, that copy will be migrated to the targeted host NUMA node. If any processor writes to this region, all copies of the corresponding page will be invalidated except for the one where the write occurred. If the writing processor is the CPU and the preferred location of the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Also, if a context is created on a device that does not have the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only copy to be created on that device. Note however that if the accessing device also has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated copies of the data will be collapsed into a single copy. The location for the collapsed copy will be the preferred location if the page has a preferred location and one of the read-duplicated copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy when a fault occurs on that memory region. If the data is already in its preferred location and the faulting processor can establish a mapping without requiring the data to be migrated, then data migration will be avoided. On the other hand, if the data is not in its preferred location or if a direct mapping cannot be established, then it will be migrated to the processor accessing it. It is important to note that setting the preferred location does not prevent data prefetching done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device memory, the page may eventually be pinned to host memory by the Unified Memory driver. But if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the policies associated with that advice will override the policies of this advice, unless read accesses from \p location will not result in a read-only copy being created on that processor, as outlined in the description for the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE, with ::CUmemLocation::id representing a valid device ordinal, or ::CU_MEM_LOCATION_TYPE_HOST, in which case ::CUmemLocation::id will be ignored. All other location types are invalid. If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. This advice does not cause data migration and has no impact on the location of the data per se. Instead, it causes the data to always be mapped in the specified processor's page tables, as long as the location of the data permits a mapping to be established. If the data gets migrated for any reason, the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is. Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data over to the other GPUs is not as important because the accesses are infrequent and the overhead of migration may be too high. But preventing faults can still help improve performance, and so having a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated to host memory because the CPU typically cannot access device memory directly. Any GPU that had the ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the policies associated with that advice will override the policies of this advice. Additionally, if the preferred location of this memory region or any subset of it is also \p location, then the policies associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then this call has no effect.
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
-
- Pointer to pinned host memory.
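The location-based variant replaces the device argument with a CUmemLocation, which also lets advice target host NUMA nodes. A small sketch under the same naming assumptions as the earlier examples (the CUmemLocationType member names in particular are assumptions):

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class AdviseV2Sketch
    {
        public static void PreferNearbyHostNode(CudaManagedMemory_double data)
        {
            // Prefer the host NUMA node closest to the calling thread's CPU;
            // per the remarks above, 'id' is ignored for HostNumaCurrent.
            var loc = new CUmemLocation
            {
                type = CUmemLocationType.HostNumaCurrent,  // assumed enum member
                id = 0
            };

            data.MemAdvise(data.DevicePointer, data.SizeInBytes,
                           CUmemoryAdvise.SetPreferredLocation, loc);
        }
    }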
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the given managed memory range. Remarks are identical to the location-based MemAdvise_v2 overload above.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
- Width in elements
- Height in elements
- Pitch in bytes
- Size in bytes
- Type size in bytes
- Access array per element.
- X-index in elements
- Y-index in elements
- Synchron copy host to 2D Array
- Synchron copy host to 2D Array
- Synchron copy 2D Array to host
- Synchron copy 2D Array to host
- Synchron copy host to device
- Synchron copy host to device
- Synchron copy device to host
- Synchron copy device to host
- Synchron Copy host to pitched device
- Synchron Copy host to pitched device
- Synchron copy device to host
- Synchron copy device to host
- Asynchron copy host to 2D Array
- Asynchron copy host to 2D Array
- Asynchron copy 2D Array to host
- Asynchron copy 2D Array to host
- Asynchron Copy host to device
- Asynchron copy device to host
- Asynchron Copy host to device
- Asynchron copy device to host
- Asynchron Copy host to pitched device
- Asynchron Copy host to pitched device
- Asynchron copy device to host
- Asynchron copy device to host
- Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag
- Device Pointer
- Passes back the flags that were specified when allocating the pinned host buffer
+ Enumerator class for CudaManagedMemory_double
+ A variable located in managed memory.
+ Type: double1
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel whose module defines the variable.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined in.
+ The variable name as defined in the cu-file.
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
+ For dispose
+ Dispose
+ For IDisposable
+ UIntPtr to managed memory.
+ CUdeviceptr to managed memory.
+ Size in bytes
+ Size in elements
+ Access array per element.
+ index in elements
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
+ Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
+ managed variable
+ newly allocated host variable with value from managed memory
+ The context on which a pointer was allocated or registered
+ The memory type describing the physical location of a pointer
+ The address at which a pointer's memory may be accessed on the device
+ Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
+ The address at which a pointer's memory may be accessed on the host
+ A pair of tokens for use with the nv-p2p.h Linux kernel interface
+ Synchronize every synchronous memory operation initiated on this region
+ A process-wide unique ID for an allocated memory region
+ Indicates if the pointer points to managed memory
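The members documented above (constructor, indexer, enumerator, and the single-value host conversion) compose as in this sketch. double1 comes from ManagedCuda.VectorTypes, and the conversion operator is assumed to be implicit, as the "Converts a managed variable" note suggests; constructor and field names are assumptions.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;
    using ManagedCuda.VectorTypes;

    static class ManagedDouble1Sketch
    {
        public static void Run()
        {
            using var ctx = new CudaContext(0);
            using var v = new CudaManagedMemory_double1(16, CUmemAttachFlags.Global);

            v[0] = new double1(3.14);   // element access from the CPU (managed memory)
            double1 first = v;          // documented conversion: first value only

            double sum = 0;
            foreach (double1 x in v)    // the enumerator class documented above
                sum += x.x;
        }
    }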
+ Attach memory to a stream asynchronously
+ Enqueues an operation in hStream to specify stream association of length bytes of memory starting from dptr. This function is a stream-ordered operation, meaning that it is dependent on, and will only take effect when, previous work in the stream has completed. Any previous association is automatically replaced.
+ dptr must point to an address within managed memory space declared using the __managed__ keyword or allocated with cuMemAllocManaged.
+ length must be zero, to indicate that the entire allocation's stream association is being changed. Currently, it's not possible to change stream association for a portion of an allocation.
+ The stream association is specified using flags, which must be one of ::CU_MEM_ATTACH_GLOBAL, ::CU_MEM_ATTACH_HOST or ::CU_MEM_ATTACH_SINGLE. If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed by any stream on any device. If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee that it won't access the memory on the device from any stream. If the ::CU_MEM_ATTACH_SINGLE flag is specified, the program makes a guarantee that it will only access the memory on the device from hStream. It is illegal to attach singly to the NULL stream, because the NULL stream is a virtual global stream and not a specific stream. An error will be returned in this case.
+ When memory is associated with a single stream, the Unified Memory system will allow CPU access to this memory region so long as all operations in hStream have completed, regardless of whether other streams are active. In effect, this constrains exclusive ownership of the managed memory region by an active GPU to per-stream activity instead of whole-GPU activity.
+ Accessing memory on the device from streams that are not associated with it will produce undefined results. No error checking is performed by the Unified Memory system to ensure that kernels launched into other streams do not access this region.
+ It is a program's responsibility to order calls to ::cuStreamAttachMemAsync via events, synchronization or other means to ensure legal access to the memory at all times. Data visibility and coherency will be changed appropriately for all kernels which follow a stream-association change.
+ If hStream is destroyed while data is associated with it, the association is removed and the association reverts to the default visibility of the allocation as specified at cuMemAllocManaged. For __managed__ variables, the default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an asynchronous operation, and as a result, the change to default association won't happen until all work in the stream has completed.
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of ::CU_MEM_ATTACH_GLOBAL, ::CU_MEM_ATTACH_HOST or ::CU_MEM_ATTACH_SINGLE
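A sketch of the single-stream attach pattern described above, using the parameter order given in the docs (stream, length, flags); the wrapper method name StreamAttachAsync is taken from this file's docs, and everything else is assumed as in the earlier sketches.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class AttachSketch
    {
        public static void Run(CudaManagedMemory_double data)
        {
            using var stream = new CudaStream();

            // Associate the whole allocation (length must be 0) with this stream.
            // CU_MEM_ATTACH_SINGLE may not target the NULL stream (see remarks).
            data.StreamAttachAsync(stream.Stream, 0, CUmemAttachFlags.Single);

            // ... enqueue kernels on 'stream' that use 'data' ...

            stream.Synchronize();
            // The CPU may now touch 'data' even while other streams keep the GPU busy.
        }
    }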
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the base device pointer of the memory to be prefetched and dstDevice is the destination device. count specifies the number of bytes to copy. hStream is the stream in which the operation is enqueued.
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
+ If no physical memory has been allocated for this region, then this memory region will be populated and mapped on the destination device. If there's insufficient memory to prefetch the desired region, the Unified Memory driver may evict pages belonging to other memory regions to make room. If there's no memory that can be evicted, then the Unified Memory driver will prefetch less than what was requested.
+ In the normal case, any mappings to the previous location of the migrated pages are removed and mappings for the new location are only set up on the dstDevice. The application can exercise finer control on these mappings using ::cudaMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the base device pointer of the memory to be prefetched and \p location specifies the destination location. \p count specifies the number of bytes to copy. \p hStream is the stream in which the operation is enqueued. The memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. Remarks are otherwise identical to the ::CUmemLocation-based PrefetchAsync overload documented above.
-
- A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.
- Type: ushort4
-
- Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc
- In elements
- Width including alignment in bytes
- In elements
-
- Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags.
- In elements
- Width including alignment in bytes
- In elements
-
- Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(ushort4). Using cuMemHostAlloc without flags.
- In elements
- In elements
-
- Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(ushort4). Using cuMemHostAlloc.
- In elements
- In elements
-
- For dispose
+ Destination device to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
- Dispose
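Combining the two APIs above: for data read by several GPUs, a read-mostly advice followed by one prefetch per device materializes a read-only copy on each before first access. This is a hedged sketch under the same assumed overload shapes as the earlier examples; device ordinals and streams are supplied by the caller.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class ReadMostlySketch
    {
        public static void Replicate(CudaManagedMemory_double data,
                                     CUdevice dev0, CudaStream s0,
                                     CUdevice dev1, CudaStream s1)
        {
            // Allow read-only copies on every accessing processor.
            data.MemAdvise(data.DevicePointer, data.SizeInBytes,
                           CUmemoryAdvise.SetReadMostly, dev0);

            // With read-mostly set, each prefetch creates a read-only copy
            // instead of migrating the single authoritative copy.
            data.PrefetchAsync(dev0, s0.Stream);
            data.PrefetchAsync(dev1, s1.Stream);
        }
    }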
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. Remarks are identical to the pointer-based MemAdvise overload documented earlier.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- For IDisposable
- Pointer to pinned host memory.
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the given managed variable. Remarks are identical to the pointer-based MemAdvise overload documented earlier.
+ managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at \p devPtr with a size of \p count bytes. Remarks are identical to the location-based MemAdvise_v2 overload documented earlier.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Height in elements
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION, ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY and ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: identical semantics
+ to the description of these advice values given above.
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Pitch in bytes
+ Enumerator class for CudaManagedMemory_double1
- Size in bytes
- Type size in bytes
- Access array per element.
- X-index in elements
- Y-index in elements
- Synchron copy host to 2D Array
- Synchron copy host to 2D Array
- Synchron copy 2D Array to host
+ A variable located in managed memory.
+ Type: double2
- Synchron copy 2D Array to host
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
- Synchron copy host to device
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
- Synchron copy host to device
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel whose module defines the variable.
+ The variable name as defined in the cu-file.
- Synchron copy device to host
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined in.
+ The variable name as defined in the cu-file.
- Synchron copy device to host
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
- Synchron Copy host to pitched device
+ For dispose
- Synchron Copy host to pitched device
+ Dispose
- Synchron copy device to host
+ For IDisposable
- Synchron copy device to host
+ UIntPtr to managed memory.
- Asynchron copy host to 2D Array
+ CUdeviceptr to managed memory.
- Asynchron copy host to 2D Array
+ Size in bytes
- Asynchron copy 2D Array to host
+ Size in elements
- Asynchron copy 2D Array to host
+ Access array per element.
+ Index in elements
- Asynchron Copy host to device
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
- Asynchron copy device to host
+ Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
+ Managed variable
+ Newly allocated host variable with value from managed memory
- Asynchron Copy host to device
+ The context on which a pointer was allocated or registered
- Asynchron copy device to host
+ The memory type describing the physical location of a pointer
- Asynchron Copy host to pitched device
+ The address at which a pointer's memory may be accessed on the device
+ Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
- Asynchron Copy host to pitched device
+ The address at which a pointer's memory may be accessed on the host
- Asynchron copy device to host
+ A pair of tokens for use with the nv-p2p.h Linux kernel interface
- Asynchron copy device to host
+ Synchronize every synchronous memory operation initiated on this region
- Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag
+ A process-wide unique ID for an allocated memory region
- Passes back the flags that were specified when allocating the pinned host buffer
+ Indicates if the pointer points to managed memory
- Enumerator class for CudaPageLockedHostMemory2D_ushort4
+ Attach memory to a stream asynchronously
+
+ Enqueues an operation in hStream to specify stream association of
+ length bytes of memory starting from dptr. This function is a
+ stream-ordered operation, meaning that it is dependent on, and will
+ only take effect when, previous work in stream has completed. Any
+ previous association is automatically replaced.
+
+ dptr must point to an address within managed memory space declared
+ using the __managed__ keyword or allocated with cuMemAllocManaged.
+
+ length must be zero, to indicate that the entire allocation's
+ stream association is being changed. Currently, it's not possible
+ to change stream association for a portion of an allocation.
+
+ The stream association is specified using flags which must be
+ one of the CUmemAttach_flags values.
+ If the CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
+ by any stream on any device.
+ If the CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
+ that it won't access the memory on the device from any stream.
+ If the CU_MEM_ATTACH_SINGLE flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to this function
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of the CUmemAttach_flags values
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
+
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cudaMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy.
\p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Destination device to prefetch to
+ Flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
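Taken together, the stream-attach and prefetch operations documented above support a common pattern: allocate managed memory, bind it to one stream, and stage it on the device before a kernel runs. A minimal sketch follows; the wrapper names (`CudaManagedMemory_double`, `StreamAttachMemAsync`, `PrefetchAsync`) mirror the docs above, but the exact signatures and the `CUdevice` construction are assumptions.

```csharp
// Sketch under assumed ManagedCuda signatures: attach a managed buffer to a
// single stream, then prefetch it to the device ahead of use.
using ManagedCuda;
using ManagedCuda.BasicTypes;

class PrefetchSketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        var stream = new CudaStream();
        var buffer = new CudaManagedMemory_double(256 * 1024, CUmemAttach_flags.Global);

        // Restrict device-side access to this stream; length must be zero
        // (whole-allocation association only, per the docs above).
        buffer.StreamAttachMemAsync(stream, 0, CUmemAttach_flags.Single);

        // Stage the pages on device 0 before launching work in `stream`,
        // so the first kernel touch does not pay fault-driven migration.
        var dev0 = new CUdevice { Pointer = 0 };   // assumed struct layout for ordinal 0
        buffer.PrefetchAsync(dev0, stream);        // assumed overload (dstDevice, hStream)

        // ... launch kernels on `stream` that read/write `buffer` ...

        stream.Synchronize();   // after this, the CPU may touch the buffer again
        buffer.Dispose();
        stream.Dispose();
    }
}
```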
- A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.
- Type: int
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. This allows the driver to create read-only
+ copies of the data in a processor's memory when that processor accesses it. Similarly,
+ if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
+ the data on the destination processor. When a processor writes to this data, all copies
+ of the corresponding page are invalidated except for the one where the write occurred.
+ The \p device argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read-
+ duplicated copies of the data will be freed no later than the next write access to that data.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ preferred location as CPU memory. Setting the preferred location does not cause data to
+ migrate to that location immediately. Instead, it guides the migration policy when a fault
+ occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ the migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU
+ memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time, causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.
+ The \p advice parameter takes the same values, with the same semantics, as described for the overload above.
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
- In elements
- Width including alignment in bytes
- In elements
+ Managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc without flags.
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+ The \p advice parameter takes the same values, with the same semantics, as described for the location-based overload above.
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
- In elements
- In elements
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(int). Using cuMemHostAlloc without flags.
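The device-based advise overload described earlier is the simpler entry point when NUMA placement is not needed. Below is a hedged sketch of marking a managed lookup table read-mostly; `CudaManagedMemory_int` and `MemAdvise` are assumed names patterned on the surrounding docs, not a confirmed API.

```csharp
// Sketch (assumed wrapper names): mark a managed lookup table read-mostly so
// every GPU that reads it gets its own read-only copy instead of migrating pages.
using ManagedCuda;
using ManagedCuda.BasicTypes;

class ReadMostlySketch
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        var table = new CudaManagedMemory_int(4096, CUmemAttach_flags.Global);

        for (int i = 0; i < 4096; i++)
            table[i] = i * i;   // per-element indexer as documented above

        // Device-based advise overload: the device argument is ignored for
        // SetReadMostly (per the docs above). `MemAdvise` is an assumed name.
        table.MemAdvise(CUmemAdvise.SetReadMostly, new CUdevice());

        // ... launch kernels on any device; reads duplicate, writes invalidate ...

        table.Dispose();
    }
}
```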
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes.
+ The \p advice parameter takes the same values, with the same semantics, as described for the location-based overload above.
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
- In elements
- In elements
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ Location to apply the advice for
- Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc without flags.
- Pitch is assumed to be width * sizeof(int). Using cuMemHostAlloc.
+ Enumerator class for CudaManagedMemory_double2
- In elements
- In elements
+ For dispose
+ Dispose
+ For IDisposable
- Pointer to pinned host memory.
- Width in elements
- Height in elements
+ A variable located in managed memory.
+ Type: cuDoubleComplex
- Pitch in bytes
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
- Size in bytes
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
- Type size in bytes
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel whose module defines the variable.
+ The variable name as defined in the cu-file.
- Access array per element.
- X-index in elements
- Y-index in elements
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined in.
+ The variable name as defined in the cu-file.
- Synchron copy host to 2D Array
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library that defines the variable.
+ The variable name as defined in the cu-file.
- Synchron copy host to 2D Array
+ For dispose
- Synchron copy 2D Array to host
+ Dispose
- Synchron copy 2D Array to host
+ For IDisposable
- Synchron copy host to device
+ UIntPtr to managed memory.
- Synchron copy host to device
+ CUdeviceptr to managed memory.
- Synchron copy device to host
+ Size in bytes
- Synchron copy device to host
+ Size in elements
- Synchron Copy host to pitched device
+ Access array per element.
+ Index in elements
- Synchron Copy host to pitched device
+ If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
- Synchron copy device to host
+ Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
+ Managed variable
+ Newly allocated host variable with value from managed memory
- Synchron copy device to host
+ The context on which a pointer was allocated or registered
- Asynchron copy host to 2D Array
+ The memory type describing the physical location of a pointer
- Asynchron copy host to 2D Array
+ The address at which a pointer's memory may be accessed on the device
+ Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
- Asynchron copy 2D Array to host
+ The address at which a pointer's memory may be accessed on the host
- Asynchron copy 2D Array to host
+ A pair of tokens for use with the nv-p2p.h Linux kernel interface
- Asynchron Copy host to device
+ Synchronize every synchronous memory operation initiated on this region
- Asynchron copy device to host
+ A process-wide unique ID for an allocated memory region
- Asynchron Copy host to device
+ Indicates if the pointer points to managed memory
- Asynchron copy device to host
+ Attach memory to a stream asynchronously
+ Semantics and constraints are identical to the attach operation described above.
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of the CUmemAttach_flags values
+ Prefetches memory to the specified destination device
+ Semantics are identical to the device prefetch described above.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
+ Prefetches memory to the specified destination location
\p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
-
-
+ Destination device to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
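The attach and prefetch entries above describe the usual unified-memory workflow: allocate managed memory, optionally bind it to a stream, then migrate it to the device before launching work. The C# sketch below illustrates that flow; the wrapper type and member names (CudaManagedMemory_float, AttachMemAsync, PrefetchAsync, CudaContext.Device) are assumptions based on the ManagedCuda-style API documented here and may differ in a given build.

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class ManagedMemoryPrefetchSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))     // context on device 0
        using (var stream = new CudaStream())
        // CU_MEM_ATTACH_GLOBAL: any stream on any device may access the allocation.
        using (var data = new CudaManagedMemory_float(1024, CUmemAttachFlags.Global))
        {
            // Initialize on the host through the per-element indexer documented above.
            for (int i = 0; i < 1024; i++)
                data[i] = 1.0f;

            // Restrict the allocation to this stream; length must be zero (whole allocation).
            data.AttachMemAsync(stream.Stream, 0, CUmemAttachFlags.Single);

            // Migrate pages to the GPU ahead of kernel launches (cuMemPrefetchAsync).
            data.PrefetchAsync(ctx.Device, stream.Stream); // ctx.Device: assumed CUdevice accessor

            stream.Synchronize(); // attach and prefetch are stream-ordered operations
        }
    }
}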
- + - Asynchron copy device to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. 
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Asynchron copy device to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. 
But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - Device Pointer + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Passes back the flags that were specified when allocating the pinned host buffer + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for - + - Enumerator class for CudaPageLockedHostMemory2D_int + Enumerator class for CudaManagedMemory_cuDoubleComplex - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: int1 + A variable located in managed memory. + Type: cuDoubleReal - + - Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaManagedMemory and allocates the memory on host/device. - In elements - Width including alignment in bytes - In elements - + In elements + - + - Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaManagedMemory from definition in cu-file. - In elements - Width including alignment in bytes - In elements + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. 
- Pitch is assumed to be width * sizeof(int1). Using cuMemHostAlloc without flags. + Creates a new CudaManagedMemory from definition in cu-file. - In elements - In elements + The kernel which module defines the variable. + The variable name as defined in the cu-file. - + - Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(int1). Using cuMemHostAlloc. + Creates a new CudaManagedMemory from definition in cu-file. - In elements - In elements - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + + + Creates a new CudaManagedMemory from definition in cu-file. + + The library that defines the variable. + The variable name as defined in the cu-file. + + For dispose - + Dispose - + For IDisposable - - - Pointer to pinned host memory. - - - - - Width in elements - - - + - Height in elements + UIntPtr to managed memory. - + - Pitch in bytes + CUdeviceptr to managed memory. - + Size in bytes - + - Type size in bytes + Size in elements - + Access array per element. - X-index in elements - Y-index in elements + index in elements - + - Synchron copy host to 2D Array + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - + - Synchron copy host to 2D Array + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - + managed variable + newly allocated host variable with value from managed memory - + - Synchron copy 2D Array to host + The on which a pointer was allocated or registered - - + - Synchron copy 2D Array to host + The describing the physical location of a pointer - - + - Synchron copy host to device + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - + - Synchron copy host to device + The address at which a pointer's memory may be accessed on the host - - + - Synchron copy device to host + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - + - Synchron copy device to host + Synchronize every synchronous memory operation initiated on this region - - + - Synchron Copy host to pitched device + A process-wide unique ID for an allocated memory region - - - + - Synchron Copy host to pitched device + Indicates if the pointer points to managed memory - - + - Synchron copy device to host + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. 
+ If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + - - + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of + - + - Synchron copy device to host + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. - + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - Asynchron copy host to 2D Array + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued.The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. 
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU
+ specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
-
-
+ Destination device to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
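Given the cuMemAdvise entries documented earlier in this file, a typical use is to mark a write-once, read-many buffer as read-mostly and pin its preferred location before heavy kernel traffic. The following is a hedged C# sketch; the Advise member and the CUmemAdvise value names are assumed to mirror the driver enum and may differ in this wrapper.

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class MemAdviseSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var table = new CudaManagedMemory_double(4096, CUmemAttachFlags.Global))
        {
            CUdevice device = ctx.Device; // assumed accessor for the context's CUdevice

            // Written once on the host, read by many kernels afterwards:
            // read-mostly lets the driver keep read-only copies per processor.
            table.Advise(CUmemAdvise.SetReadMostly, device);

            // Keep the canonical pages on the GPU; faults elsewhere prefer mapping
            // over migration when possible (CU_MEM_ADVISE_SET_PREFERRED_LOCATION).
            table.Advise(CUmemAdvise.SetPreferredLocation, device);

            // Pre-map the range into this GPU's page tables to avoid first-touch faults.
            table.Advise(CUmemAdvise.SetAccessedBy, device);
        }
    }
}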
- + - Asynchron copy host to 2D Array + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. 
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Asynchron copy 2D Array to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. 
But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - Asynchron copy 2D Array to host + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. 
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Asynchron Copy host to device + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Asynchron copy device to host + Enumerator class for CudaManagedMemory_cuDoubleReal - - - + - Asynchron Copy host to device + - - + - + - Asynchron copy device to host + - - - + - Asynchron Copy host to pitched device + - - - - + - Asynchron Copy host to pitched device + - - - + - Asynchron copy device to host + - - - + - + - Asynchron copy device to host + A variable located in managed memory. + Type: cuFloatComplex - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Creates a new CudaManagedMemory and allocates the memory on host/device. - Device Pointer + In elements + - + - Passes back the flags that were specified when allocating the pinned host buffer + Creates a new CudaManagedMemory from definition in cu-file. - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Enumerator class for CudaPageLockedHostMemory2D_int1 + Creates a new CudaManagedMemory from definition in cu-file.
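The advice flow documented above is easier to see as a short usage sketch. This is illustrative only: the MemAdvise overload, the CUmemAdvise member spellings, and the Device property below are assumptions inferred from the parameter docs in this file, not confirmed wrapper API; they stand in for ::cuMemAdvise and its advice values.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    CudaContext ctx = new CudaContext(0);                  // context on device ordinal 0
    CUdevice dev0 = ctx.Device;                            // device handle (property name assumed)
    CudaManagedMemory_float data =
        new CudaManagedMemory_float(1 << 20, CUmemAttach_flags.Global);  // cuMemAllocManaged, 1M floats
    // Mostly-read data: let every accessing processor keep a read-only copy.
    data.MemAdvise(CUmemAdvise.SetReadMostly, dev0);       // ::CU_MEM_ADVISE_SET_READ_MOSTLY (name assumed)
    // Writes should land on device 0, so prefer its memory for the writable master copy.
    data.MemAdvise(CUmemAdvise.SetPreferredLocation, dev0);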
+ The kernel which module defines the variable. + The variable name as defined in the cu-file. - + - + Creates a new CudaManagedMemory from definition in cu-file. - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - + Creates a new CudaManagedMemory from definition in cu-file. + The library that defines the variable. + The variable name as defined in the cu-file. - + - + For dispose - + - + Dispose - + - + For IDisposable - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: int2 + UIntPtr to managed memory. - + - Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc + CUdeviceptr to managed memory. - In elements - Width including alignment in bytes - In elements - - + - Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Size in bytes - In elements - Width including alignment in bytes - In elements - + - Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(int2). Using cuMemHostAlloc without flags. + Size in elements - In elements - In elements - + - Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(int2). Using cuMemHostAlloc. + Access array per element. - In elements - In elements - + index in elements + - + - For dispose + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - + - Dispose + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + managed variable + newly allocated host variable with value from managed memory - + - For IDisposable + The on which a pointer was allocated or registered - - + - Pointer to pinned host memory. + The describing the physical location of a pointer - + - Width in elements + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - + - Height in elements + The address at which a pointer's memory may be accessed on the host - + - Pitch in bytes + A pair of tokens for use with the nv-p2p.h Linux kernel interface - + - Size in bytes + Synchronize every synchronous memory operation initiated on this region - + - Type size in bytes + A process-wide unique ID for an allocated memory region - + - Access array per element. + Indicates if the pointer points to managed memory - X-index in elements - Y-index in elements - - + - Synchron copy host to 2D Array + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. 
+ + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. + + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + - + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of + - + - Synchron copy host to 2D Array + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. - + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - Synchron copy 2D Array to host + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. 
\p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only set up on the destination location. The exact behavior however + also depends on the settings applied to this memory range via ::cuMemAdvise as described + below: + + If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on the destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible. Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated.
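Continuing the sketch above, the prefetch pattern just described looks roughly like this; CUmemLocation mirrors the driver struct referenced in the docs, and the PrefetchAsync overload name and enum member spellings are assumptions, not confirmed API:

    CUmemLocation gpu0 = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };  // names assumed
    CUmemLocation host = new CUmemLocation { type = CUmemLocationType.Host, id = 0 };    // id ignored for host
    CudaStream stream = new CudaStream();
    data.PrefetchAsync(gpu0, 0, stream);   // wraps ::cuMemPrefetchAsync_v2; flags must be zero
    // ... enqueue kernels on `stream` that read and write `data` ...
    data.PrefetchAsync(host, 0, stream);   // stage the results near the CPU for readback
    stream.Synchronize();                  // host access takes no faults once the stream drains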
+ - + Destination location to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Synchron copy host to 2D Array + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Synchron copy host to device + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. 
Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory, say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - Synchron copy host to device + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned.
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + -
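A sketch of the accessed-by scenario described above, following the overload shape of the parameter list just documented (pointer, byte count, advice, location); all names here are assumptions, not confirmed wrapper API. Data prefers GPU 0, while GPU 1 reads it occasionally over peer-to-peer and should never fault:

    CUmemLocation onGpu0 = new CUmemLocation { type = CUmemLocationType.Device, id = 0 };
    CUmemLocation byGpu1 = new CUmemLocation { type = CUmemLocationType.Device, id = 1 };
    // Keep the pages resident on GPU 0 ...
    MemAdvise(data.DevicePointer, data.SizeInBytes, CUmemAdvise.SetPreferredLocation, onGpu0);
    // ... but keep GPU 1's mapping alive so its infrequent reads neither fault nor migrate pages.
    MemAdvise(data.DevicePointer, data.SizeInBytes, CUmemAdvise.SetAccessedBy, byGpu1);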
+ + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - - + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + location to apply the advice for + - + - Synchron copy device to host + Enumerator class for CudaManagedMemory_cuFloatComplex - - + - Synchron Copy host to pitched device + - - + - + - Synchron Copy host to pitched device + - - + - Synchron copy device to host + - - - + - Synchron copy device to host + - - + - Asynchron copy host to 2D Array + - - + - + - Asynchron copy host to 2D Array + A variable located in managed memory. + Type: cuFloatReal - - - + - Asynchron copy 2D Array to host + Creates a new CudaManagedMemory and allocates the memory on host/device. - - + In elements + - + - Asynchron copy 2D Array to host + Creates a new CudaManagedMemory from definition in cu-file. - - + The module where the variable is defined in. + The variable name as defined in the cu-file. - + - Asynchron Copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - - + The kernel which module defines the variable. + The variable name as defined in the cu-file. - + - Asynchron copy device to host + Creates a new CudaManagedMemory from definition in cu-file.
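Because managed memory is host-accessible, the per-element indexer documented for these CudaManagedMemory types can seed and inspect a variable without explicit copies. A sketch only; the indexer usage and the cuFloatReal constructor below are assumed from the docs in this file, not confirmed API:

    CudaManagedMemory_cuFloatReal vec = new CudaManagedMemory_cuFloatReal(256);  // 256 elements
    for (int i = 0; i < 256; i++)
        vec[i] = new cuFloatReal(1.0f);   // plain CPU writes, no explicit memcpy (ctor assumed)
    // ... launch kernels that consume `vec`, then synchronize the context or stream ...
    cuFloatReal first = vec[0];           // read back directly on the host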
- - + The library where the variable is defined in. + The variable name as defined in the cu-file. - + - Asynchron Copy host to device + Creates a new CudaManagedMemory from definition in cu-file. - - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Asynchron copy device to host + For dispose - - - + - Asynchron Copy host to pitched device + Dispose - - - - + - Asynchron Copy host to pitched device + For IDisposable - - + - + - Asynchron copy device to host + UIntPtr to managed memory. - - - - + - Asynchron copy device to host + CUdeviceptr to managed memory. - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Size in bytes - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Size in elements - - + - Enumerator class for CudaPageLockedHostMemory2D_int2 + Access array per element. + index in elements + - + - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - + - + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. + managed variable + newly allocated host variable with value from managed memory - + - + The on which a pointer was allocated or registered - + - + The describing the physical location of a pointer - + - + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: int3 + The address at which a pointer's memory may be accessed on the host - + - Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc + A pair of tokens for use with the nv-p2p.h Linux kernel interface - In elements - Width including alignment in bytes - In elements - - + - Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Synchronize every synchronous memory operation initiated on this region - In elements - Width including alignment in bytes - In elements - + - Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(int3). Using cuMemHostAlloc without flags. + A process-wide unique ID for an allocated memory region - In elements - In elements - + - Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(int3). Using cuMemHostAlloc. + Indicates if the pointer points to managed memory - In elements - In elements - - + - For dispose - - - - - Dispose - - - - - For IDisposable - - - - - - Pointer to pinned host memory. - - - - - Width in elements + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. 
+
+ length must be zero, to indicate that the entire allocation's
+ stream association is being changed. Currently, it's not possible
+ to change stream association for a portion of an allocation.
+
+ The stream association is specified using flags which must be
+ one of CUmemAttachFlags.
+ If the CUmemAttachFlags.Global flag is specified, the memory can be accessed
+ by any stream on any device.
+ If the CUmemAttachFlags.Host flag is specified, the program makes a guarantee
+ that it won't access the memory on the device from any stream.
+ If the CUmemAttachFlags.Single flag is specified, the program makes a guarantee
+ that it will only access the memory on the device from hStream. It is illegal
+ to attach singly to the NULL stream, because the NULL stream is a virtual global
+ stream and not a specific stream. An error will be returned in this case.
+
+ When memory is associated with a single stream, the Unified Memory system will
+ allow CPU access to this memory region so long as all operations in hStream
+ have completed, regardless of whether other streams are active. In effect,
+ this constrains exclusive ownership of the managed memory region by
+ an active GPU to per-stream activity instead of whole-GPU activity.
+
+ Accessing memory on the device from streams that are not associated with
+ it will produce undefined results. No error checking is performed by the
+ Unified Memory system to ensure that kernels launched into other streams
+ do not access this region.
+
+ It is a program's responsibility to order calls to cuStreamAttachMemAsync
+ via events, synchronization or other means to ensure legal access to memory
+ at all times. Data visibility and coherency will be changed appropriately
+ for all kernels which follow a stream-association change.
+
+ If hStream is destroyed while data is associated with it, the association is
+ removed and the association reverts to the default visibility of the allocation
+ as specified at cuMemAllocManaged. For __managed__ variables, the default
+ association is always CUmemAttachFlags.Global. Note that destroying a stream is an
+ asynchronous operation, and as a result, the change to default association won't
+ happen until all work in the stream has completed.
+
+ Stream in which to enqueue the attach operation
+ Length of memory (must be zero)
+ Must be one of CUmemAttachFlags
- Height in elements
+ Prefetches memory to the specified destination device
+ Prefetches memory to the specified destination device. devPtr is the
+ base device pointer of the memory to be prefetched and dstDevice is the
+ destination device. count specifies the number of bytes to copy. hStream
+ is the stream in which the operation is enqueued.
+
+ Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages
+ belonging to other memory regions to make room. If there's no memory that can be
+ evicted, then the Unified Memory driver will prefetch less than what was requested.
+
+ In the normal case, any mappings to the previous location of the migrated pages are
+ removed and mappings for the new location are only set up on the dstDevice.
+ The application can exercise finer control on these mappings using ::cudaMemAdvise.
+ Destination device to prefetch to
+ Stream to enqueue prefetch operation
+ Note that this function is asynchronous with respect to the host and all work on other devices.
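Taken together, the attach and prefetch entries above describe the usual unified-memory workflow: allocate managed memory, optionally scope it to a stream, then migrate it to the device that will touch it next. A minimal C# sketch of that flow through the ManagedCuda wrappers follows. The typed wrapper CudaManagedMemory_float and the CUmemAttachFlags values are taken from the surrounding documentation; the wrapper method names AttachAsync and Prefetch and their exact parameter lists are assumptions for illustration, not the library's confirmed API.

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class UnifiedMemorySketch
    {
        static void Main()
        {
            var ctx = new CudaContext(0);      // primary context on device 0
            var stream = new CudaStream();

            // Managed (unified) allocation, initially visible everywhere.
            var buffer = new CudaManagedMemory_float(1024, CUmemAttachFlags.Global);
            for (int i = 0; i < 1024; i++)
                buffer[i] = i;                 // host-side initialization

            // Scope the whole allocation to this stream; length must be zero,
            // i.e. the entire allocation changes association, per the
            // cuStreamAttachMemAsync semantics above. Assumed wrapper name.
            buffer.AttachAsync(stream, 0, CUmemAttachFlags.Single);

            // Migrate the pages to device 0 before the first kernel launch,
            // per the cuMemPrefetchAsync semantics above. Assumed wrapper name.
            buffer.Prefetch(ctx.DeviceId, stream);

            stream.Synchronize();              // host may touch the data again
        }
    }

The same pattern extends to the advice flags documented throughout this section: applying ::CU_MEM_ADVISE_SET_READ_MOSTLY before such a prefetch leaves a read-only copy of the pages on each processor that reads the data.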
- Pitch in bytes
+ Prefetches memory to the specified destination location
+ Prefetches memory to the specified destination location. \p devPtr is the
+ base device pointer of the memory to be prefetched and \p location specifies the
+ destination location. \p count specifies the number of bytes to copy. \p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU
+ specified by device ordinal ::CUmemLocation::id which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+ + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Size in bytes + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. 
If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Type size in bytes + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. 
Normally, if a page is detected to be constantly thrashing between CPU and GPU
+ memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
+ if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
+ When the Unified Memory driver has to evict pages from a certain location on account of that
+ memory being oversubscribed, the preferred location will be used to decide the destination to which
+ a page should be evicted.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
+ location will be ignored for that subset.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ This does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is useful in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in CPU memory.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
+ mappings may be removed at any time causing accesses to result in page faults.
+
+ Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ managed memory variable
+ Advice to be applied for the specified memory range
+ Device to apply the advice for
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Size in bytes of the memory range
+ Advice to be applied for the specified memory range
+ location to apply the advice for
- Synchron copy host to 2D Array
+ Advise about the usage of a given memory range
+ Advise the Unified Memory subsystem about the usage pattern for the memory range
+ starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ memory range results in an error being returned.
+
+ The \p advice parameter can take the following values:
+ - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ from and only occasionally written to. Any read accesses from any processor to this region will create a
+ read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor.
+ If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on
+ another host NUMA node, that copy will be migrated to the targeted host NUMA node.
+ If any processor writes to this region, all copies of the corresponding page will be invalidated
+ except for the one where the write occurred. If the writing processor is the CPU and the preferred location of
+ the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice.
+ Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Also, if a context is created on a device that does not have the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ all such contexts are destroyed.
+ If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ will not create a read-only copy when that device accesses this memory region.
+ - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ copies of the data will be collapsed into a single copy. The location for the collapsed
+ copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST,
+ ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location
+ to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and
+ ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT,
+ ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location.
+ If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal
+ and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ when a fault occurs on that memory region. If the data is already in its preferred location and the
+ faulting processor can establish a mapping without requiring the data to be migrated, then
+ data migration will be avoided. On the other hand, if the data is not in its preferred location
+ or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ it. It is important to note that setting the preferred location does not prevent data prefetching
+ done using ::cuMemPrefetchAsync.
+ Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice, unless read accesses from
+ \p location will not result in a read-only copy being created on that processor as outlined in the description for
+ the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ and changes the preferred location to none. The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location.
+ The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device
+ ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ it causes the data to always be mapped in the specified processor's page tables, as long as the
+ location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ the mappings are updated accordingly.
+ This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ migration may be too high. But preventing faults can still help improve performance, and so having
+ a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ page in host memory.
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ policies associated with that advice will override the policies of this advice. Additionally, if the
+ preferred location of this memory region or any subset of it is also \p location, then the policies
+ associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+ - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ the data from \p location may be removed at any time, causing accesses to result in non-fatal page faults.
+ If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE,
+ then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ then this call has no effect.
+
+ Note that this function is asynchronous with respect to the host and all work
+ on other devices.
+ Pointer to memory to set the advice for
+ Advice to be applied for the specified memory range
+ location to apply the advice for
- Synchron copy 2D Array to host
+ Enumerator class for CudaManagedMemory_cuFloatReal
- Synchron copy 2D Array to host
- Synchron copy host to device
- Synchron copy host to device
- Synchron copy device to host
- Synchron copy device to host
+ A variable located in managed memory.
+ Type: dim3
- Synchron Copy host to pitched device
+ Creates a new CudaManagedMemory and allocates the memory on host/device.
+ In elements
- Synchron Copy host to pitched device
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The module where the variable is defined in.
+ The variable name as defined in the cu-file.
- Synchron copy device to host
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The kernel which module defines the variable.
+ The variable name as defined in the cu-file.
- Synchron copy device to host
+ Creates a new CudaManagedMemory from definition in cu-file.
+ The library where the variable is defined in.
+ The variable name as defined in the cu-file. - + - Asynchron copy host to 2D Array + Creates a new CudaManagedMemory from definition in cu-file. - - + The library that defines the variable. + The variable name as defined in the cu-file. - + - Asynchron copy host to 2D Array + For dispose - - - + - Asynchron copy 2D Array to host + Dispose - - - + - Asynchron copy 2D Array to host + For IDisposable - - + - + - Asynchron Copy host to device + UIntPtr to managed memory. - - - + - Asynchron copy device to host + CUdeviceptr to managed memory. - - - + - Asynchron Copy host to device + Size in bytes - - - + - Asynchron copy device to host + Size in elements - - - + - Asynchron Copy host to pitched device + Access array per element. - - - + index in elements + - + - Asynchron Copy host to pitched device + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Asynchron copy device to host + Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted. - - - + managed variable + newly allocated host variable with value from managed memory - + - Asynchron copy device to host + The on which a pointer was allocated or registered - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + The describing the physical location of a pointer - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + The address at which a pointer's memory may be accessed on the device + Except in the exceptional disjoint addressing cases, the value returned will equal the input value. - - + - Enumerator class for CudaPageLockedHostMemory2D_int3 + The address at which a pointer's memory may be accessed on the host - + - + A pair of tokens for use with the nv-p2p.h Linux kernel interface - - + - + Synchronize every synchronous memory operation initiated on this region - + - + A process-wide unique ID for an allocated memory region - + - + Indicates if the pointer points to managed memory - + - + Attach memory to a stream asynchronously + + Enqueues an operation in hStream to specify stream association of + length bytes of memory starting from dptr. This function is a + stream-ordered operation, meaning that it is dependent on, and will + only take effect when, previous work in stream has completed. Any + previous association is automatically replaced. + + dptr must point to an address within managed memory space declared + using the __managed__ keyword or allocated with cuMemAllocManaged. + + length must be zero, to indicate that the entire allocation's + stream association is being changed. Currently, it's not possible + to change stream association for a portion of an allocation. + + The stream association is specified using flags which must be + one of . + If the flag is specified, the memory can be accessed + by any stream on any device. + If the flag is specified, the program makes a guarantee + that it won't access the memory on the device from any stream. + If the flag is specified, the program makes a guarantee + that it will only access the memory on the device from hStream. It is illegal + to attach singly to the NULL stream, because the NULL stream is a virtual global + stream and not a specific stream. An error will be returned in this case. 
+ + When memory is associated with a single stream, the Unified Memory system will + allow CPU access to this memory region so long as all operations in hStream + have completed, regardless of whether other streams are active. In effect, + this constrains exclusive ownership of the managed memory region by + an active GPU to per-stream activity instead of whole-GPU activity. + + Accessing memory on the device from streams that are not associated with + it will produce undefined results. No error checking is performed by the + Unified Memory system to ensure that kernels launched into other streams + do not access this region. + + It is a program's responsibility to order calls to + via events, synchronization or other means to ensure legal access to memory + at all times. Data visibility and coherency will be changed appropriately + for all kernels which follow a stream-association change. + + If hStream is destroyed while data is associated with it, the association is + removed and the association reverts to the default visibility of the allocation + as specified at cuMemAllocManaged. For __managed__ variables, the default + association is always . Note that destroying a stream is an + asynchronous operation, and as a result, the change to default association won't + happen until all work in the stream has completed. + + Stream in which to enqueue the attach operation + Length of memory (must be zero) + Must be one of - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: int4 - - - - - Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only setup on the dstDevice. + The application can exercise finer control on these mappings using ::cudaMemAdvise. - In elements - Width including alignment in bytes - In elements - + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - - - Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags. + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. 
\p hStream
+ is the stream in which the operation is enqueued. The memory range must refer
+ to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to GPU
+ specified by device ordinal ::CUmemLocation::id which must have a non-zero value for the device attribute
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device
+ that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+
+ Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory.
+ Applications can request prefetching memory to a specific host NUMA node by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id.
+ Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying
+ ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either
+ ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored.
+ The start address and end address of the memory range will be rounded down and rounded up
+ respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ in the stream.
+
+ If no physical memory has been allocated for this region, then this memory region
+ will be populated and mapped on the destination device. If there's insufficient
+ memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+
+ By default, any mappings to the previous location of the migrated pages are removed and
+ mappings for the new location are only set up on the destination location. The exact behavior however
+ also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ below:
+
+ If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ then that subset will create a read-only copy of the pages on the destination location.
+ If however the destination location is a host NUMA node, then any pages of that subset
+ that are already in another host NUMA node will be transferred to the destination.
+
+ If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ range, then the pages will be migrated to \p location even if \p location is not the
+ preferred location of any pages in the memory range.
+
+ If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ then mappings to those pages from all the appropriate processors are updated to
+ refer to the new location if establishing such a mapping is possible. Otherwise,
+ those mappings are cleared.
+
+ Note that this API is not required for functionality and only serves to improve performance
+ by allowing the application to migrate data to a suitable location before it is accessed.
+
+ Memory accesses to this range are always coherent and are allowed even when the data is
+ actively being migrated.
+
+ Destination device to prefetch to
+ flags for future use, must be zero now.
+ Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + - Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(int4). Using cuMemHostAlloc without flags. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. 
If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - In elements - In elements + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for - + - Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(int4). Using cuMemHostAlloc. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. 
It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. 
- In elements - In elements - + managed memory variable + Advice to be applied for the specified memory range + Device to apply the advice for - + - For dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices.
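Unlike the device-based overload sketched earlier, the _v2-style variants documented above select their target through a CUmemLocation value. A hedged C# fragment continuing the earlier sketch; the field names mirror the driver struct's type/id members and the exact casing and overload shape in the wrapper are assumptions:

    // Build a location describing device 0 (could also be Host, HostNuma, or
    // HostNumaCurrent per the documentation above).
    var loc = new CUmemLocation
    {
        type = CUmemLocationType.Device, // assumed enum name
        id = 0                           // device ordinal; ignored for Host
    };
    // Assumed _v2-style overload taking a location instead of a device:
    buffer.MemAdvise(CUmemAdvise.SetPreferredLocation, loc);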
+ Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Location to apply the advice for + - + - Dispose + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or ::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + copies of the data will be collapsed into a single copy. The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice.
+ - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid. + If ::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se.
Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice. Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, + then the device in ::CUmemLocation::id must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. + Pointer to memory to set the advice for + Advice to be applied for the specified memory range + Location to apply the advice for + - + - For IDisposable + Enumerator class for CudaManagedMemory_dim3 - - + - Pointer to pinned host memory. + + - + - Width in elements + - + - Height in elements + - + - Pitch in bytes + - + - Size in bytes + + - + - Type size in bytes + CudaMemoryPool - + - Access array per element. + Creates a new CudaMemoryPool. - X-index in elements - Y-index in elements - + - + - Synchron copy host to 2D Array + Imports a memory pool from a shared handle. + Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. + Note: Imported memory pools do not support creating new allocations. As such, imported memory pools + may not be used in cuDeviceSetMemPool or ::cuMemAllocFromPoolAsync calls.
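To show how the CudaMemoryPool members documented in this hunk (including those that follow below) might fit together, here is a hedged C# sketch. The constructor signature, attribute enum name, and the return type of MemAllocFromPoolAsync are assumptions; only the member names come from the documentation in this diff:

    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class MemoryPoolSketch
    {
        static void Main()
        {
            var ctx = new CudaContext(0);
            var stream = new CudaStream();
            var dev = new CUdevice(); // device handle (assumed construction)

            // Assumed constructor; actual overloads may take pool properties instead.
            var pool = new CudaMemoryPool(dev);

            // Keep up to 64 MiB cached in the pool before releasing memory back
            // to the OS at the next synchronization point.
            pool.SetAttribute(CUmemPool_attribute.ReleaseThreshold, (ulong)(64 << 20));

            // Stream-ordered allocation: usable by work later enqueued on 'stream'.
            var dptr = pool.MemAllocFromPoolAsync(1 << 20, stream); // assumed return type

            // ... enqueue kernels that use 'dptr' on 'stream', then free it with a
            // stream-ordered free (cuMemFreeAsync) before trimming ...

            pool.TrimTo(0); // release all unused reserved memory back to the OS
            pool.Dispose();
            ctx.Dispose();
        }
    }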
- + OS handle of the pool to open + The type of handle being imported + must be 0 - + - Synchron copy host to 2D Array + Gets the current or default memory pool of the CUdevice. - + The device to get the memory pool from + Get the default or the current memory pool - + - Synchron copy 2D Array to host + For dispose - - + - Synchron copy 2D Array to host + Dispose - - + - Synchron copy host to device + For IDisposable - + - + - Synchron copy host to device + Tries to release memory back to the OS + Releases memory back to the OS until the pool contains fewer than minBytesToKeep + reserved bytes, or there is no more memory that the allocator can safely release. + The allocator cannot release OS allocations that back outstanding asynchronous allocations. + The OS allocations may happen at different granularity from the user allocations. + + note: Allocations that have not been freed count as outstanding. + note: Allocations that have been asynchronously freed but whose completion has + not been observed on the host (e.g. by a synchronize) can count as outstanding. - + If the pool has less than minBytesToKeep reserved, + the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have at least minBytesToKeep bytes reserved after the operation. - + - Synchron copy device to host + Import a memory pool allocation from another process. + Returns in \p ptr_out a pointer to the imported memory. + The imported memory must not be accessed before the allocation operation completes + in the exporting process. The imported memory must be freed from all importing processes before + being freed in the exporting process. The pointer may be freed with cuMemFree + or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed + on the importing process before the free operation on the exporting process. + Note: The cuMemFreeAsync API may be used in the exporting process before + the cuMemFreeAsync operation completes in its stream as long as the + cuMemFreeAsync in the exporting process specifies a stream with + a stream dependency on the importing process's cuMemFreeAsync. - - - + + + + + + + Allocates memory from a specified pool with stream ordered semantics. + Inserts an allocation operation into \p hStream. + A pointer to the allocated memory is returned immediately in *dptr. + The allocation must not be accessed until the allocation operation completes. + The allocation comes from the specified memory pool. + note + - The specified memory pool may be from a device different than that of the specified \p hStream. + - Basic stream ordering allows future work submitted into the same stream to use the allocation. + Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + operation completes before work submitted in a separate stream runs. + + Number of bytes to allocate + The stream establishing the stream ordering semantic + + + + Allocates memory from a specified pool with stream ordered semantics. + Inserts an allocation operation into \p hStream. + A pointer to the allocated memory is returned immediately in *dptr. + The allocation must not be accessed until the allocation operation completes. + The allocation comes from the specified memory pool. + note + - The specified memory pool may be from a device different than that of the specified \p hStream. + - Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + operation completes before work submitted in a separate stream runs. + + Number of bytes to allocate + The stream establishing the stream ordering semantic + + + + Returns the accessibility of a pool from a device + Returns the accessibility of the pool's memory from the specified location. + + the location accessing the pool + + + + Controls visibility of pools between devices + + + + + Exports a memory pool to the requested handle type. + Given an IPC capable mempool, create an OS handle to share the pool with another process. + A recipient process can convert the shareable handle into a mempool with::cuMemPoolImportFromShareableHandle. + Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. + The implementation of what the shareable handle is and how it can be transferred is defined by the requested + handle type. + note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. + + the type of handle to create + must be 0 + + + + Sets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.When more than the release + threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next + call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. + Cuda events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency + between the free and allocation. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to insert new stream dependencies + in order to establish the stream ordering required to reuse + a piece of memory released by::cuMemFreeAsync(default enabled). + + The attribute to modify + Pointer to the value to assign + + + + Sets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.When more than the release + threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next + call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. + Cuda events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency + between the free and allocation. 
(default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to insert new stream dependencies + in order to establish the stream ordering required to reuse + a piece of memory released by::cuMemFreeAsync(default enabled). + + The attribute to modify + Pointer to the value to assign + + + + Sets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.When more than the release + threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next + call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. + Cuda events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency + between the free and allocation. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to insert new stream dependencies + in order to establish the stream ordering required to reuse + a piece of memory released by::cuMemFreeAsync(default enabled). + + The attribute to modify + Pointer to the value to assign + + + + Sets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.When more than the release + threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next + call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. + Cuda events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency + between the free and allocation. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to insert new stream dependencies + in order to establish the stream ordering required to reuse + a piece of memory released by::cuMemFreeAsync(default enabled). + + The attribute to modify + Pointer to the value to assign + + - Synchron copy device to host + Returns the wrapped CUarray - - + - Synchron Copy host to pitched device + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - + - Synchron Copy host to pitched device + Number of channels in array - - + - Synchron copy device to host + One channel, e.g. float1, int1, float, int - - - + - Synchron copy device to host + Two channels, e.g. float2, int2 - - + - Asynchron copy host to 2D Array + Four channels, e.g. 
float4, int4 - - - + - Asynchron copy host to 2D Array + A mipmapped Cuda array - - - + - Asynchron copy 2D Array to host + Creates a CUDA mipmapped array according to descriptor. + Width, Height, and Depth are the width, height, and depth of the CUDA array (in elements); the following + types of CUDA arrays can be allocated: + – A 1D mipmapped array is allocated if Height and Depth extents are both zero. + – A 2D mipmapped array is allocated if only Depth extent is zero. + – A 3D mipmapped array is allocated if all three extents are non-zero. + – A 1D layered CUDA mipmapped array is allocated if only Height is zero and the + flag is set. Each layer is a 1D array. The number of layers is determined by the depth extent. + – A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and the + flag is set. Each layer is a 2D array. The number of layers is determined by the depth extent. + – A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + flag is set. Width must be equal to Height, and Depth must be six. A + cubemap is a special type of 2D layered CUDA array, where the six layers represent the six faces of a + cube. The order of the six layers in memory is the same as that listed in CUarray_cubemap_face. + – A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both, + and flags are set. Width must be equal + to Height, and Depth must be a multiple of six. A cubemap layered CUDA array is a special type of + 2D layered CUDA array that consists of a collection of cubemaps. The first six layers represent the first + cubemap, the next six layers form the second cubemap, and so on. + Flags may be set to: + – to enable creation of layered CUDA mipmapped arrays. If this flag is set, + Depth specifies the number of layers, not the depth of a 3D array. + – to enable creation of mipmapped cubemaps. If this flag is set, Width + must be equal to Height, and Depth must be six. If the CUDA_ARRAY3D_LAYERED flag is also set, + then Depth must be a multiple of six. + – to indicate that the CUDA mipmapped array will be used for + texture gather. Texture gather can only be performed on 2D CUDA mipmapped arrays. - - + mipmapped array descriptor + Number of mipmap levels. This value is clamped to the range [1, 1 + floor(log2(max(width, height, depth)))] - + - Asynchron copy 2D Array to host + Creates a CUDA mipmapped array according to descriptor. + Width, Height, and Depth are the width, height, and depth of the CUDA array (in elements); the following + types of CUDA arrays can be allocated: + – A 1D mipmapped array is allocated if Height and Depth extents are both zero. + – A 2D mipmapped array is allocated if only Depth extent is zero. + – A 3D mipmapped array is allocated if all three extents are non-zero. + – A 1D layered CUDA mipmapped array is allocated if only Height is zero and the + flag is set. Each layer is a 1D array. The number of layers is determined by the depth extent. + – A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and the + flag is set. Each layer is a 2D array. The number of layers is determined by the depth extent. + – A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + flag is set. Width must be equal to Height, and Depth must be six. A + cubemap is a special type of 2D layered CUDA array, where the six layers represent the six faces of a + cube. 
The order of the six layers in memory is the same as that listed in CUarray_cubemap_face. + – A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both, + and flags are set. Width must be equal + to Height, and Depth must be a multiple of six. A cubemap layered CUDA array is a special type of + 2D layered CUDA array that consists of a collection of cubemaps. The first six layers represent the first + cubemap, the next six layers form the second cubemap, and so on. - - + Array format + Array width. See general description. + Array height. See general description. + Array depth or layer count. See general description. + number of channels + Flags may be set to: + – to enable creation of layered CUDA mipmapped arrays. If this flag is set, + Depth specifies the number of layers, not the depth of a 3D array. + – to enable creation of mipmapped cubemaps. If this flag is set, Width + must be equal to Height, and Depth must be six. If the CUDA_ARRAY3D_LAYERED flag is also set, + then Depth must be a multiple of six. + – to indicate that the CUDA mipmapped array will be used for + texture gather. Texture gather can only be performed on 2D CUDA mipmapped arrays. + Number of mipmap levels. This value is clamped to the range [1, 1 + floor(log2(max(width, height, depth)))] - + - Asynchron Copy host to device + Creates a CUDA mipmapped array from an existing mipmap array handle. - - + handle to wrap + Array format of the wrapped array. Cannot be gathered through CUDA API. + Number of channels of wrapped array. - + - Asynchron copy device to host + Dispose - - - + - Asynchron Copy host to device + For IDisposable - - + - + - Asynchron copy device to host + Returns a CUDA array that represents a single mipmap level + of the CUDA mipmapped array. - - + Mipmap level - + - Asynchron Copy host to pitched device + Returns a CUDA array that represents a single mipmap level + of the CUDA mipmapped array. - - - + Mipmap level - + - Asynchron Copy host to pitched device + Returns a CUDA array that represents a single mipmap level + of the CUDA mipmapped array. - - + Mipmap level - + - Asynchron copy device to host + Returns a CUDA array that represents a single mipmap level + of the CUDA mipmapped array. - - - + Mipmap level - + - Asynchron copy device to host + Returns the layout properties of a sparse CUDA mipmapped array + Returns the sparse array layout properties in \p sparseProperties + If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE + ::CUDA_ERROR_INVALID_VALUE will be returned. + For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the + size of the mip tail region.The mip tail region includes all mip levels whose width, height or depth + is less than that of the tile. + For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, + then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. + Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. + The returned value of::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. 
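As a quick illustration of the mipmapped-array members documented above, here is a hedged C# sketch of creating a 2D mipmapped array and fetching its finest level. Only the constructor and GetLevel shapes come from the documentation in this hunk; the descriptor field names and enum spellings are assumptions:

    using System;
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class MipmapSketch
    {
        static void Main()
        {
            var ctx = new CudaContext(0);

            // 2D case per the rules above: Height non-zero, Depth == 0.
            var desc = new CUDAArray3DDescriptor
            {
                Width = 512,
                Height = 512,
                Depth = 0,
                Format = CUArrayFormat.Float,  // assumed enum spelling
                NumChannels = 4,               // float4 texels
                Flags = CUDAArray3DFlags.None
            };

            // Full mip chain: 1 + floor(log2(max(width, height, depth))) = 10 here.
            uint levels = 1 + (uint)Math.Floor(Math.Log(512, 2));

            var mipmapped = new CudaMipmappedArray(desc, levels);
            var level0 = mipmapped.GetLevel(0); // finest level as a CUDA array

            // ... copy texel data into 'level0', bind to a texture object, etc. ...

            mipmapped.Dispose();
            ctx.Dispose();
        }
    }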
Only valid if context is created with flag + Returns the memory requirements of a CUDA array - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Returns the wrapped CUmipmappedArray - - + - Enumerator class for CudaPageLockedHostMemory2D_int4 + Returns the wrapped CUDAArray3DDescriptor - + - + Returns the Depth of the array - - + - + Returns the Height of the array - + - + Returns the array width in elements - + - + Returns the array creation flags - + - + Returns the array format - - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: uint + Returns number of channels - + - Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - In elements - Width including alignment in bytes - In elements - - + - Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags. + Cuda occupancy from CudaOccupancy.h - In elements - Width including alignment in bytes - In elements - + - Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint). Using cuMemHostAlloc without flags. + mirror the type and spelling of cudaDeviceProp's members keep these alphabetized - In elements - In elements - - - Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint). Using cuMemHostAlloc. - - In elements - In elements - + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - For dispose + define our own cudaOccFuncAttributes to stay consistent with the original header file - + + + + + + + + + + + + + + + + + + + - Dispose + Number of block barriers used (default to 1) - + - For IDisposable + - - + - Pointer to pinned host memory. + cudaOccFuncAttributes + + + Only the static part shared memory (without dynamic allocations) + + + - + - Width in elements + + - + - Height in elements + Occupancy Error types - + + + + - Pitch in bytes + input parameter is invalid - + - Size in bytes + requested device is not supported in current implementation or device is invalid - + - Type size in bytes + Function cache configurations - + - Access array per element. 
+ no preference for shared memory or L1 (default) - X-index in elements - Y-index in elements - - + - Synchron copy host to 2D Array + prefer larger shared memory and smaller L1 cache - - + - Synchron copy host to 2D Array + prefer larger L1 cache and smaller shared memory - - + - Synchron copy 2D Array to host + prefer equal sized L1 cache and shared memory - - + - Synchron copy 2D Array to host + Occupancy Limiting Factors - - + - Synchron copy host to device + occupancy limited due to warps available - - + - Synchron copy host to device + occupancy limited due to registers available - - + - Synchron copy device to host + occupancy limited due to shared memory available - - + - Synchron copy device to host + occupancy limited due to blocks available - - + - Synchron Copy host to pitched device + occupancy limited due to barrier available - - - + - Synchron Copy host to pitched device + Partitioned global caching support - - + - Synchron copy device to host + Partitioned global caching is not supported - - - + - Synchron copy device to host + Partitioned global caching is supported - - + - Asynchron copy host to 2D Array + Partitioned global caching option - - - + - Asynchron copy host to 2D Array + Disable partitioned global caching - - - + - Asynchron copy 2D Array to host + Prefer partitioned global caching - - - + - Asynchron copy 2D Array to host + Force partitioned global caching - - - + - Asynchron Copy host to device + Per function opt in maximum dynamic shared memory limit - - - + - Asynchron copy device to host + Default shmem limit - - - + - Asynchron Copy host to device + Use the optin shmem limit - - - + - Asynchron copy device to host + Shared memory carveout configurations - - - + - Asynchron Copy host to pitched device + no preference for shared memory or L1 (default) - - - - + - Asynchron Copy host to pitched device + prefer maximum available shared memory, minimum L1 cache - - - + - Asynchron copy device to host + prefer maximum available L1 cache, minimum shared memory - - - - + - Asynchron copy device to host + prefer half of maximum available shared memory, with the rest as L1 cache - - - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + - Device Pointer - + - Passes back the flags that were specified when allocating the pinned host buffer + Active Thread Blocks per Multiprocessor + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + define cudaOccDeviceState to include any device property needed to be passed + in future GPUs so that user interfaces don't change ; hence users are encouraged + to declare the struct zero in order to handle the assignments of any field + that might be added for later GPUs. + + + + + + + + + + *! + + + Align up shared memory based on compute major configurations + + + Shared memory based on the new carveoutConfig API introduced with Volta + + + Shared memory based on config requested by User + + + Return the per block shared memory limit based on function config + + + Partitioned global caching mode support + + + + Determine the maximum number of CTAs that can be run simultaneously per SM. + This is equivalent to the calculation done in the CUDA Occupancy Calculator + spreadsheet + + + + + + - + - Enumerator class for CudaPageLockedHostMemory2D_uint + The CUDA dynamic shared memory calculator computes the maximum size of + per-block dynamic shared memory if we want to place numBlocks blocks + on an SM. 
+ Returns maximum size of dynamic shared memory to allow numBlocks blocks per SM. + + + + + + - + - + + + + + + + - + + + + + + + + + + + + + + A function to convert from block size to dynamic shared memory size. + e.g.: + If no dynamic shared memory is used: x => 0 + If 4 bytes shared memory per thread is used: x = 4 * x + + block size + size of dynamic shared memory + + + + A CudaOccupancy exception is thrown if a CudaOccupancy API method call does not return 0 + + + - + + + - + + - + + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + Checks if value is zero. If value is zero, CudaOccupancyException is thrown. + + + + + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: uint1 + Type: byte - + - Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(byte). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(byte). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -35805,137 +49012,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -35943,14 +49150,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -35958,144 +49165,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. 
Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_uint1 + Enumerator class for CudaPageLockedHostMemory2D_byte - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: uint2 + Type: uchar1 - + - Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar1). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar1). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -36103,137 +49310,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -36241,14 +49448,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -36256,144 +49463,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_uint2 + Enumerator class for CudaPageLockedHostMemory2D_uchar1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. 
- Type: uint3 + Type: uchar2 - + - Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar2). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint3). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar2). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -36401,137 +49608,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -36539,14 +49746,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -36554,144 +49761,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_uint3 + Enumerator class for CudaPageLockedHostMemory2D_uchar2 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: uint4 + Type: uchar3 - + - Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. 
+ Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint4). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar3). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uint4). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar3). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -36699,137 +49906,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -36837,14 +50044,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -36852,144 +50059,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_uint4 + Enumerator class for CudaPageLockedHostMemory2D_uchar3 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: long + Type: uchar4 - + - Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc without flags. 
+ Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar4). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uchar4). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -36997,137 +50204,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -37135,14 +50342,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -37150,144 +50357,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_long + Enumerator class for CudaPageLockedHostMemory2D_uchar4 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: long1 + Type: sbyte - + - Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(long1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(sbyte). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(long1). Using cuMemHostAlloc. 
+ Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(sbyte). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -37295,137 +50502,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -37433,14 +50640,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -37448,144 +50655,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_long1 + Enumerator class for CudaPageLockedHostMemory2D_sbyte - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: long2 + Type: char1 - + - Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(long2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char1). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(long2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char1). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -37593,137 +50800,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -37731,14 +50938,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -37746,144 +50953,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_long2 + Enumerator class for CudaPageLockedHostMemory2D_char1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ulong + Type: char2 - + - Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ulong). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char2). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ulong). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char2). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -37891,137 +51098,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -38029,14 +51236,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -38044,144 +51251,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_ulong + Enumerator class for CudaPageLockedHostMemory2D_char2 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ulong1 + Type: char3 - + - Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ulong1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char3). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ulong1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char3). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -38189,137 +51396,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -38327,14 +51534,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -38342,144 +51549,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_ulong1 + Enumerator class for CudaPageLockedHostMemory2D_char3 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ulong2 + Type: char4 - + - Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ulong2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char4). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(ulong2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(char4). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -38487,137 +51694,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -38625,14 +51832,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -38640,144 +51847,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_ulong2 + Enumerator class for CudaPageLockedHostMemory2D_char4 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float + Type: short - + - Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -38785,137 +51992,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -38923,14 +52130,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -38938,144 +52145,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_float + Enumerator class for CudaPageLockedHostMemory2D_short - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float1 + Type: short1 - + - Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -39083,137 +52290,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -39221,14 +52428,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -39236,144 +52443,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_float1 + Enumerator class for CudaPageLockedHostMemory2D_short1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float2 + Type: short2 - + - Creates a new CudaPageLockedHostMemory2D_float2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -39381,137 +52588,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -39519,14 +52726,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -39534,144 +52741,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_float2 + Enumerator class for CudaPageLockedHostMemory2D_short2 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float3 + Type: short3 - + - Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float3). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -39679,137 +52886,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -39817,14 +53024,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -39832,144 +53039,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_float3 + Enumerator class for CudaPageLockedHostMemory2D_short3 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float4 + Type: short4 - + - Creates a new CudaPageLockedHostMemory2D_float4 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_float4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float4). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short4). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_float4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float4). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short4). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -39977,137 +53184,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -40115,14 +53322,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -40130,144 +53337,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_float4 + Enumerator class for CudaPageLockedHostMemory2D_short4 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: double + Type: ushort - + - Creates a new CudaPageLockedHostMemory2D_double and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_double and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_double and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_double and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -40275,137 +53482,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -40413,14 +53620,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -40428,144 +53635,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_double + Enumerator class for CudaPageLockedHostMemory2D_ushort - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: double1 + Type: ushort1 - + - Creates a new CudaPageLockedHostMemory2D_double1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_double1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_double1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort1). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_double1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort1). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -40573,137 +53780,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -40711,14 +53918,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -40726,144 +53933,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_double1 + Enumerator class for CudaPageLockedHostMemory2D_ushort1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: double2 + Type: ushort2 - + - Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort2). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort2). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -40871,137 +54078,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -41009,14 +54216,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -41024,144 +54231,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_double2 + Enumerator class for CudaPageLockedHostMemory2D_ushort2 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuDoubleComplex + Type: ushort3 - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleComplex). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort3). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleComplex). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort3). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -41169,137 +54376,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -41307,14 +54514,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -41322,144 +54529,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_cuDoubleComplex + Enumerator class for CudaPageLockedHostMemory2D_ushort3 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuDoubleReal + Type: ushort4 - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleReal). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort4). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleReal). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort4). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -41467,137 +54674,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -41605,14 +54812,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -41620,144 +54827,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_cuDoubleReal + Enumerator class for CudaPageLockedHostMemory2D_ushort4 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuFloatComplex + Type: int - + - Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatComplex). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatComplex). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -41765,137 +54972,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -41903,14 +55110,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -41918,144 +55125,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_cuFloatComplex + Enumerator class for CudaPageLockedHostMemory2D_int - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuFloatReal + Type: int1 - + - Creates a new CudaPageLockedHostMemory2D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatReal). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int1). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatReal). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int1). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -42063,137 +55270,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -42201,14 +55408,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -42216,144 +55423,144 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_cuFloatReal + Enumerator class for CudaPageLockedHostMemory2D_int1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: dim3 + Type: int2 - + - Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - + - Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(dim3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int2). Using cuMemHostAlloc without flags. In elements In elements - + - Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(dim3). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int2). Using cuMemHostAlloc. In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -42361,137 +55568,137 @@ Y-index in elements - + Synchron copy host to 2D Array - + Synchron copy host to 2D Array - + Synchron copy 2D Array to host - + Synchron copy 2D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron Copy host to pitched device - + Synchron Copy host to pitched device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 2D Array - + Asynchron copy host to 2D Array - + Asynchron copy 2D Array to host - + Asynchron copy 2D Array to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron Copy host to pitched device @@ -42499,14 +55706,14 @@ - + Asynchron Copy host to pitched device - + Asynchron copy device to host @@ -42514,6591 +55721,7813 @@ - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory2D_dim3 + Enumerator class for CudaPageLockedHostMemory2D_int2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: int3 - + - Creates a new CudaPageLockedHostMemory3D_byte and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(byte). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int3). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_byte and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(byte). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_int3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int3). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device - + + - + - Synchron copy host to 3D Array + Synchron Copy host to pitched device - + - + - Synchron copy 3D Array to host + Synchron copy device to host - + + - + - Synchron copy 3D Array to host + Synchron copy device to host - + - + - Asynchron Copy host to device + Asynchron copy host to 2D Array - + - + + + Asynchron copy host to 2D Array + + + + + + + Asynchron copy 2D Array to host + + + + + + + Asynchron copy 2D Array to host + + + + + Asynchron Copy host to device - + - + Asynchron copy device to host - + + + Asynchron Copy host to device + + + + + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_byte + Enumerator class for CudaPageLockedHostMemory2D_int3 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: int4 - + - Creates a new CudaPageLockedHostMemory3D_uchar1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int4). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int4). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. 
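The paired constructor overloads above differ only in whether the row pitch is passed explicitly ("Width including alignment in bytes") or assumed to be width * sizeof(T). The pitch, not the width, is what separates consecutive rows, and it is presumably what the [x, y] indexer computes internally. A hypothetical helper making that addressing explicit (ElementAddress is illustrative, not part of the library):

using System;

static class PitchMath
{
    // Hypothetical: byte address of element (x, y) in a pitched 2D buffer.
    // Rows are pitchInBytes apart; elements within a row are typeSize apart.
    static IntPtr ElementAddress(IntPtr basePtr, long x, long y, long pitchInBytes, int typeSize)
        => new IntPtr(basePtr.ToInt64() + y * pitchInBytes + x * typeSize);
}

For example, in a 100-element-wide float buffer padded to a 512-byte pitch, element (3, 2) sits at base + 2 * 512 + 3 * 4 = base + 1036 bytes, even though only 400 bytes of each row hold data.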
- + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_uchar1 + Enumerator class for CudaPageLockedHostMemory2D_int4 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: uint - + - Creates a new CudaPageLockedHostMemory3D_uchar2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint). 
Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_uchar2 + Enumerator class for CudaPageLockedHostMemory2D_uint - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: uint1 - + - Creates a new CudaPageLockedHostMemory3D_uchar3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint1). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar3). Using cuMemHostAlloc. 
+ Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint1). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_uchar3 + Enumerator class for CudaPageLockedHostMemory2D_uint1 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: uint2 - + - Creates a new CudaPageLockedHostMemory3D_uchar4 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar4). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint2). Using cuMemHostAlloc without flags. 
In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_uchar4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(uchar4). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint2). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_uchar4 + Enumerator class for CudaPageLockedHostMemory2D_uint2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: uint3 - + - Creates a new CudaPageLockedHostMemory3D_sbyte and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(sbyte). Using cuMemHostAlloc without flags. 
+ Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint3). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(sbyte). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint3). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + - Synchron copy host to device + Synchron copy host to 2D Array - + - + - Synchron copy host to device + Synchron copy host to 2D Array - + - + - Synchron copy device to host + Synchron copy 2D Array to host - + - + - Synchron copy device to host + Synchron copy 2D Array to host - + - + - Synchron copy host to 3D array + Synchron copy host to device - + - + - Synchron copy host to 3D Array + Synchron copy host to device - + - + - Synchron copy 3D Array to host + Synchron copy device to host - + - + - Synchron copy 3D Array to host + Synchron copy device to host - + - + - Asynchron Copy host to device + Synchron Copy host to pitched device - + - + - Asynchron Copy host to device + Synchron Copy host to pitched device - - + - Asynchron copy device to host + Synchron copy device to host - + - + - Asynchron copy device to host + Synchron copy device to host - - + - Asynchron copy host to 3D array + Asynchron copy host to 2D Array - + - Asynchron copy host to 3D Array + Asynchron copy host to 2D Array - + - Asynchron copy 3D Array to host + Asynchron copy 2D Array to host - + - Asynchron copy 3D Array to host + Asynchron copy 2D Array to host - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Asynchron Copy host to device - Device Pointer + + - + - Passes back the flags that were specified when allocating the pinned host buffer + Asynchron copy device to host - + + - + - Enumerator class for CudaPageLockedHostMemory3D_sbyte + Asynchron Copy host to device + + - + - + Asynchron copy device to host - + + - + + + Asynchron Copy host to pitched device + + + + + + + + Asynchron Copy host to pitched device + + + + + + + Asynchron copy device to host + + + + + + + + Asynchron copy device to host + + + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + + Device Pointer + + + + Passes back the flags that were specified when allocating the pinned host buffer + + + + + + Enumerator class for CudaPageLockedHostMemory2D_uint3 + + + + + + + + + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: uint4 - + - Creates a new CudaPageLockedHostMemory3D_char1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. 
Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint4). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint4). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_char1 + Enumerator class for CudaPageLockedHostMemory2D_uint4 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. 
+ Type: long - + - Creates a new CudaPageLockedHostMemory3D_char2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_char2 + Enumerator class for CudaPageLockedHostMemory2D_long - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. 
Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: long1 - + - Creates a new CudaPageLockedHostMemory3D_char3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long1). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char3). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long1). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + - Synchron copy host to device + Synchron copy host to 2D Array - + - + - Synchron copy host to device + Synchron copy host to 2D Array - + - + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy host to device + + + + + + Synchron copy host to device + + + + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. 
Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_char3 + Enumerator class for CudaPageLockedHostMemory2D_long1 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: long2 - + - Creates a new CudaPageLockedHostMemory3D_char4 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char4). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long2). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_char4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(char4). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long2). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_char4 + Enumerator class for CudaPageLockedHostMemory2D_long2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: ulong - + - Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. 
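Every block above also carries the same pair of entries: a GetDevicePointer that is "Only valid if context is created with flag", and a query for the flags passed to cuMemHostAlloc. This refers to mapped pinned memory (CU_CTX_MAP_HOST), where the device receives an address aliasing the same host pages. A sketch under the assumption that ManagedCuda spells the flag CUCtxFlags.MapHost and exposes the overloaded context constructor:

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class MappedMemorySketch
{
    static void Run()
    {
        // The map-host flag must be set at context creation; otherwise
        // GetDevicePointer() is invalid, per the notes above.
        using var ctx = new CudaContext(0, CUCtxFlags.MapHost);
        using var pinned = new CudaPageLockedHostMemory2D_uint(256, 256);

        // Device-visible alias of the same physical pages; kernels can
        // access it directly without a separate memcpy.
        CUdeviceptr devPtr = pinned.GetDevicePointer();
    }
}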
- + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_short + Enumerator class for CudaPageLockedHostMemory2D_ulong - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: ulong1 - + - Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong1). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong1). 
Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_short1 + Enumerator class for CudaPageLockedHostMemory2D_ulong1 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: ulong2 - + - Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong2). Using cuMemHostAlloc without flags. In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc. 
+ Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong2). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + + + Synchron copy host to 2D Array + + + + + + Synchron copy host to 2D Array + + + + + + Synchron copy 2D Array to host + + + + + + Synchron copy 2D Array to host + + + + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to 3D array + Synchron Copy host to pitched device + + + + + + + Synchron Copy host to pitched device + + + + + + Synchron copy device to host + + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 2D Array + - + - Synchron copy host to 3D Array + Asynchron copy host to 2D Array + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + - Synchron copy 3D Array to host + Asynchron copy 2D Array to host + - + Asynchron Copy host to device - + - Asynchron Copy host to device + Asynchron copy device to host - + - + - Asynchron copy device to host + Asynchron Copy host to device - + Asynchron copy device to host - + - + - Asynchron copy host to 3D array + Asynchron Copy host to pitched device - + + - + - Asynchron copy host to 3D Array + Asynchron Copy host to pitched device - + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + + - + - Asynchron copy 3D Array to host + Asynchron copy device to host - + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_short2 + Enumerator class for CudaPageLockedHostMemory2D_ulong2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: float - + - Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc without flags. 
In elements In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc. In elements In elements - In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - - - Depth in elements - - - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. X-index in elements Y-index in elements - Z-index in elements - + - Synchron copy host to device + Synchron copy host to 2D Array - + - + - Synchron copy host to device + Synchron copy host to 2D Array - + - + - Synchron copy device to host + Synchron copy 2D Array to host - + - + - Synchron copy device to host + Synchron copy 2D Array to host - + - + - Synchron copy host to 3D array + Synchron copy host to device - + - + - Synchron copy host to 3D Array + Synchron copy host to device - + - + - Synchron copy 3D Array to host + Synchron copy device to host - + - + - Synchron copy 3D Array to host + Synchron copy device to host - + - + - Asynchron Copy host to device + Synchron Copy host to pitched device - + - + - Asynchron Copy host to device + Synchron Copy host to pitched device - - + - Asynchron copy device to host + Synchron copy device to host - + - + - Asynchron copy device to host + Synchron copy device to host - - + - Asynchron copy host to 3D array + Asynchron copy host to 2D Array - + - Asynchron copy host to 3D Array + Asynchron copy host to 2D Array - + - Asynchron copy 3D Array to host + Asynchron copy 2D Array to host - + - Asynchron copy 3D Array to host + Asynchron copy 2D Array to host - + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron Copy host to pitched device + + + + + + + + Asynchron Copy host to pitched device + + + + + + + Asynchron copy device to host + + + + + + + + Asynchron copy device to host + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_short3 + Enumerator class for CudaPageLockedHostMemory2D_float - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: float1 - + - Creates a new CudaPageLockedHostMemory3D_short4 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes In elements - In elements - + - Creates a new CudaPageLockedHostMemory3D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags. 
[Generated XML-doc hunks, condensed: the doc comments for CudaPageLockedHostMemory3D_short4 through CudaPageLockedHostMemory3D_ushort2 are replaced by the new 2D classes CudaPageLockedHostMemory2D_float1 through CudaPageLockedHostMemory2D_float4. Every 2D block follows one template: a class summary ("A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy. Type: floatN"); four constructors taking width and height in elements, optionally an explicit pitch ("Width including alignment in bytes") and optionally cuMemHostAlloc flags, the pitch otherwise assumed to be width * sizeof(floatN); Dispose plumbing; Pointer-to-pinned-host-memory, Width, Height, Pitch (bytes), Size (bytes) and TypeSize (bytes) properties; an element indexer (X-index, Y-index in elements); Synchron/Asynchron copies host to/from device, host to/from 2D Array, and host to/from pitched device; GetDevicePointer ("Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag"); GetFlags; and a per-class enumerator. Relative to the 3D blocks they replace, the Depth property, the Z-index parameter and the 3D-array copy overloads are removed.]
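A minimal sketch of the documented 2D pinned-memory workflow. The pinned class name and its indexer come from the doc comments above; CudaContext, CudaPitchedDeviceVariable and the exact copy-method signatures are assumptions about the ManagedCuda API, not taken from this diff:

using ManagedCuda;

static class PinnedCopy2DExample
{
    static void Main()
    {
        using var ctx  = new CudaContext(0);                              // primary device context
        using var host = new CudaPageLockedHostMemory2D_double(256, 128); // pitch assumed 256 * sizeof(double)

        for (int y = 0; y < 128; y++)
            for (int x = 0; x < 256; x++)
                host[x, y] = x + 256.0 * y;   // "Access array per element": X-index, Y-index

        using var dev = new CudaPitchedDeviceVariable<double>(256, 128);
        host.SynchronCopyToDevice(dev);       // "Synchron Copy host to pitched device"
        host.SynchronCopyToHost(dev);         // "Synchron copy device to host"
    }
}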
[Condensed: the same 2D template continues for CudaPageLockedHostMemory2D_double ("Type: double") and CudaPageLockedHostMemory2D_double1 ("Type: double1"), replacing the former 3D ushort3/ushort4/int blocks. In the _double block the diff additionally reorders the copy members, moving the 2D-Array overloads ahead of the plain host/device copies.]
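The constructors above come in two shapes: one takes an explicit pitch ("Width including alignment in bytes"), the other derives it as width * sizeof(T). A sketch of both, assuming SizeT's implicit int conversion and the documented Pitch property:

using System;
using ManagedCuda;

static class PitchExample
{
    static void Main()
    {
        using var ctx = new CudaContext(0);
        int width = 100, height = 64;

        // explicit pitch: each row padded to 128 doubles (1024 bytes) for alignment
        using var padded = new CudaPageLockedHostMemory2D_double(width, 128 * sizeof(double), height);

        // implicit pitch: assumed to be width * sizeof(double) = 800 bytes
        using var tight = new CudaPageLockedHostMemory2D_double(width, height);

        Console.WriteLine(padded.Pitch);  // 1024
        Console.WriteLine(tight.Pitch);   // 800
    }
}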
[Condensed: template repeated for CudaPageLockedHostMemory2D_double2 ("Type: double2") and CudaPageLockedHostMemory2D_cuDoubleComplex ("Type: cuDoubleComplex"), replacing the former 3D int/int1 blocks.]
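The "Synchron copy host to 2D Array" overloads target CUDA arrays, the texture-friendly layout. A sketch; the CudaArray2D constructor arguments and the copy-method names below are assumptions about ManagedCuda, not taken from this file:

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class Array2DCopyExample
{
    static void Main()
    {
        using var ctx   = new CudaContext(0);
        using var host  = new CudaPageLockedHostMemory2D_float1(512, 512);
        using var array = new CudaArray2D(CUArrayFormat.Float, 512, 512, CudaArray2DNumChannels.One);

        host.SynchronCopyToArray2D(array);    // pinned host -> CUDA array (e.g. as a texture source)
        host.SynchronCopyFromArray2D(array);  // CUDA array -> pinned host
    }
}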
[Condensed: template repeated for CudaPageLockedHostMemory2D_cuDoubleReal, CudaPageLockedHostMemory2D_cuFloatComplex and CudaPageLockedHostMemory2D_cuFloatReal, replacing the former 3D int2/int3/int4 blocks.]
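The Asynchron* overloads additionally take a stream; because the buffer is page-locked, these copies can overlap kernel execution instead of blocking the host. A sketch, assuming ManagedCuda's CudaStream wrapper and the method names suggested by the summaries:

using ManagedCuda;
using ManagedCuda.VectorTypes;

static class AsyncCopyExample
{
    static void Main()
    {
        using var ctx    = new CudaContext(0);
        using var stream = new CudaStream();
        using var host   = new CudaPageLockedHostMemory2D_float1(1024, 1024);
        using var dev    = new CudaPitchedDeviceVariable<float1>(1024, 1024);

        host.AsynchronCopyToDevice(dev, stream.Stream);   // queued; returns immediately
        // ... launch kernels on `stream` that read `dev` here ...
        host.AsynchronCopyToHost(dev, stream.Stream);

        stream.Synchronize();   // one blocking wait instead of blocking on each copy
    }
}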
[Condensed: the 2D series closes with CudaPageLockedHostMemory2D_dim3 ("Type: dim3"), replacing the former 3D uint block. The hunks that follow (@@ -49106,86 +63535,86 @@ onward) cover the 3D classes, which the diff re-pairs at their new file offsets now that the 2D blocks precede them: the old CudaPageLockedHostMemory3D_uint1 documentation is rewritten as CudaPageLockedHostMemory3D_byte, with no change to the template itself.]
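GetDevicePointer, documented in every block above, exposes a device-space view of the pinned buffer (zero-copy); the docs note it is only valid when the context is created with the map-host flag, and the buffer must also be allocated with a device-map flag. A sketch; the CUCtxFlags/CUMemHostAllocFlags names and the constructor overloads are assumptions from the CUDA driver API:

using System;
using ManagedCuda;
using ManagedCuda.BasicTypes;
using ManagedCuda.VectorTypes;

static class MappedPointerExample
{
    static void Main()
    {
        // context created with the map-host flag, as the doc comment requires
        using var ctx  = new CudaContext(0, CUCtxFlags.MapHost);
        using var host = new CudaPageLockedHostMemory2D_float1(256, 256, CUMemHostAllocFlags.DeviceMap);

        CUdeviceptr devView = host.GetDevicePointer();  // device-space view of the pinned buffer
        Console.WriteLine(host.GetFlags());             // flags passed to cuMemHostAlloc
        // devView can be handed to a kernel; each access travels over the bus (zero-copy)
    }
}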
[Condensed: the alignment churn continues through the rest of the 3D section — uint2→uchar1, uint3→uchar2, uint4→uchar3, long→uchar4, long1→sbyte, long2→char1, ulong→char2, ulong1→char3, ulong2→char4 (hunks @@ -49194,162 +63623,162 @@ through @@ -51453,162 +65882,162 @@). Apart from the type names and the matching sizeof(T) in the pitch remarks, the 3D documentation template is unchanged: constructors with width/height/depth in elements and optional pitch, allocation via cuMemHostAlloc with or without flags, Width/Height/Depth/Pitch/Size/TypeSize properties, an X/Y/Z element indexer, Synchron/Asynchron copies host to/from device and host to/from 3D Array, GetDevicePointer, GetFlags, and the per-class enumerator.]
Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_ulong2 + Enumerator class for CudaPageLockedHostMemory3D_char4 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -51616,86 +66045,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_short and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -51704,162 +66133,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_float + Enumerator class for CudaPageLockedHostMemory3D_short - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_float1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. 
Using cuMemHostAlloc In elements Width including alignment in bytes @@ -51867,86 +66296,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_short1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short1). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -51955,162 +66384,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_float1 + Enumerator class for CudaPageLockedHostMemory3D_short1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_float2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -52118,86 +66547,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. 
+ Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short2). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -52206,162 +66635,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_float2 + Enumerator class for CudaPageLockedHostMemory3D_short2 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_float3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -52369,86 +66798,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_float3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float3). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. 
- + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -52457,162 +66886,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_float3 + Enumerator class for CudaPageLockedHostMemory3D_short3 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_float4 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_short4 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -52620,86 +67049,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_float4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float4). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short4). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_float4 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(float4). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_short4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(short4). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -52708,162 +67137,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_float4 + Enumerator class for CudaPageLockedHostMemory3D_short4 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_double and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ushort and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -52871,86 +67300,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_double and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_double and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_double and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_ushort and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -52959,162 +67388,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. 
Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_double + Enumerator class for CudaPageLockedHostMemory3D_ushort - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_double1 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ushort1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -53122,86 +67551,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_double1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_double1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double1). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort1). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_double1 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double1). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_ushort1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort1). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -53210,162 +67639,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_double1 + Enumerator class for CudaPageLockedHostMemory3D_ushort1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_double2 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ushort2 and allocates the memory on host. 
Using cuMemHostAlloc In elements Width including alignment in bytes @@ -53373,86 +67802,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double2). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort2). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(double2). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_ushort2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort2). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -53461,162 +67890,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_double2 + Enumerator class for CudaPageLockedHostMemory3D_ushort2 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ushort3 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -53624,86 +68053,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleComplex). Using cuMemHostAlloc without flags. 
+ Creates a new CudaPageLockedHostMemory3D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort3). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleComplex). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort3). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -53712,162 +68141,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_cuDoubleComplex + Enumerator class for CudaPageLockedHostMemory3D_ushort3 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ushort4 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -53875,86 +68304,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleReal). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort4). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuDoubleReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuDoubleReal). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_ushort4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ushort4). Using cuMemHostAlloc. 
In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -53963,162 +68392,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_cuDoubleReal + Enumerator class for CudaPageLockedHostMemory3D_ushort4 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_int and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -54126,86 +68555,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_int and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatComplex). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_int and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatComplex). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_int and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. 
@@ -54214,162 +68643,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_cuFloatComplex + Enumerator class for CudaPageLockedHostMemory3D_int - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_int1 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -54377,86 +68806,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatReal). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int1). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_cuFloatReal and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(cuFloatReal). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_int1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int1). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -54465,162 +68894,162 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. 
Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_cuFloatReal + Enumerator class for CudaPageLockedHostMemory3D_int1 - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory3D_dim3 and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_int2 and allocates the memory on host. Using cuMemHostAlloc In elements Width including alignment in bytes @@ -54628,86 +69057,86 @@ In elements - + - Creates a new CudaPageLockedHostMemory3D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. In elements Width including alignment in bytes In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(dim3). Using cuMemHostAlloc without flags. + Creates a new CudaPageLockedHostMemory3D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int2). Using cuMemHostAlloc without flags. In elements In elements In elements - + - Creates a new CudaPageLockedHostMemory3D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags. - Pitch is assumed to be width * sizeof(dim3). Using cuMemHostAlloc. + Creates a new CudaPageLockedHostMemory3D_int2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(int2). Using cuMemHostAlloc. In elements In elements In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Width in elements - + Height in elements - + Depth in elements - + Pitch in bytes - + Size in bytes - + Type size in bytes - + Access array per element. @@ -54716,9024 +69145,6700 @@ Z-index in elements - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to 3D array - + Synchron copy host to 3D Array - + Synchron copy 3D Array to host - + Synchron copy 3D Array to host - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron copy host to 3D array - + Asynchron copy host to 3D Array - + Asynchron copy 3D Array to host - + Asynchron copy 3D Array to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory3D_dim3 + Enumerator class for CudaPageLockedHostMemory3D_int2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: byte + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_int3 and allocates the memory on host. 
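Because the hunks above preserve only the comment text, a usage sketch may help make the documented surface concrete. This is a minimal sketch, not verified against ManagedCuda: the class name CudaPageLockedHostMemory3D_float and the three-index element accessor come straight from the doc comments, while SynchronCopyToDevice and CudaPitchedDeviceVariable<float> are assumed spellings for the "Synchronous copy host to device" entry and its device-side counterpart.

    // Minimal sketch, not verified against ManagedCuda. The class name and
    // the (x, y, z) indexer are taken verbatim from the doc comments above;
    // SynchronCopyToDevice and CudaPitchedDeviceVariable<float> are assumed
    // spellings for the "Synchron copy host to device" entry and its target.
    using ManagedCuda;

    static class PinnedMemory3DSketch
    {
        static void Main()
        {
            using var ctx = new CudaContext(0);

            // Flag-less constructor: pitch is assumed to be width * sizeof(float),
            // exactly as the third constructor summary in the diff states.
            var host = new CudaPageLockedHostMemory3D_float(64, 32, 8);

            // "Access array per element": x-, y-, z-index in elements.
            for (int z = 0; z < 8; z++)
                for (int y = 0; y < 32; y++)
                    for (int x = 0; x < 64; x++)
                        host[x, y, z] = x + y + z;

            // Blocking upload — the "Synchron copy host to device" member.
            var device = new CudaPitchedDeviceVariable<float>(64, 32 * 8);
            host.SynchronCopyToDevice(device);

            ctx.Synchronize();
            device.Dispose();
            host.Dispose();
        }
    }

The pinned allocation is the point of these classes: cuMemcpyAsync can only overlap with kernel execution when the host buffer is page-locked; with pageable memory the driver must stage through an internal pinned buffer and the transfer effectively becomes synchronous.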
[Continuation of hunk @@ -54716,9024 +69145,6700 @@, abridged. The removed side carries the doc blocks for the one-dimensional CudaPageLockedHostMemory classes (Type: byte, uchar1, uchar2, uchar3): constructors via cuMemHostAlloc and cuMemAllocHost; a wrapper constructor over an existing IntPtr ("IntPtr must point to page locked memory! hostPointer won't be freed while disposing."); Size in bytes and in elements; a per-element indexer; an ownership note ("If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing."); synchronous and asynchronous copies to and from 1D arrays and device memory, with source/destination offsets and byte counts; GetDevicePointer; the allocation-flags query; and the enumerator class. The added side contains the same 3D per-type blocks as above for CudaPageLockedHostMemory3D_int3, _int4, _uint and _uint1. The hunk continues beyond this excerpt, which is cut off mid-entry ("Offset to destination").]
pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_uint1 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: uchar4 + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_uint2 and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint2). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint2). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - - - Size in bytes - - - - - Size in elements - - - - - Access array per element. - - index in elements - - - - - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - - - Synchron copy host to 1D Array - - - - - + - Synchron copy host to 1D Array + Width in elements - - + - Synchron copy host to 1D Array + Height in elements - - + - Synchron copy host to 1D Array + Depth in elements - - - + - Synchron copy 1D Array to host + Pitch in bytes - - - + - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. 
- - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D array - - - - - Asynchron copy host to 1D Array - - - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D Array - - in bytes - - - Asynchron copy 1D Array to host - - - - bytes - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - - - - Asynchron copy 1D Array to host - - - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Asynchron Copy host to device + Asynchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron Copy host to device + Asynchron copy host to 3D Array - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_uint2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: sbyte + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_uint3 and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! 
- hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint3). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_uint3 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint3). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + - Size in bytes + Width in elements - + - Size in elements + Height in elements - + - Access array per element. + Depth in elements - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Pitch in bytes - + - Synchron copy host to 1D Array - - - - - - - Synchron copy host to 1D Array - - - - - - Synchron copy host to 1D Array - - - - - - Synchron copy host to 1D Array - - - - - - - Synchron copy 1D Array to host - - - - - - - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. - - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D array - - - - - Asynchron copy host to 1D Array - - - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D Array - - in bytes - - - - Asynchron copy 1D Array to host - - - - bytes - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - - - - Asynchron copy 1D Array to host - - - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Asynchron Copy host to device + Asynchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron Copy host to device + Asynchron copy host to 3D Array - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr for 
pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_uint3 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: char1 + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_uint4 and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint4). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_uint4 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(uint4). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - - - Size in bytes - - - - - Size in elements - - - - - Access array per element. - - index in elements - - - - - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - - - Synchron copy host to 1D Array - - - - - + - Synchron copy host to 1D Array + Width in elements - - + - Synchron copy host to 1D Array + Height in elements - - + - Synchron copy host to 1D Array + Depth in elements - - - + - Synchron copy 1D Array to host + Pitch in bytes - - - + - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. 
- - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D array - - - - - Asynchron copy host to 1D Array - - - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D Array - - in bytes - - - - Asynchron copy 1D Array to host - - - - bytes - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - - - - Asynchron copy 1D Array to host - - - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Asynchron Copy host to device + Asynchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron Copy host to device + Asynchron copy host to 3D Array - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_uint4 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: char2 + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_long and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_long and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements - - - - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. 
- - - In elements - - - - For dispose - - - - - Dispose - - - - - For IDisposable - - + In elements + Width including alignment in bytes + In elements + In elements - + - Pointer to pinned host memory. + Creates a new CudaPageLockedHostMemory3D_long and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc without flags. + In elements + In elements + In elements - + - Size in bytes + Creates a new CudaPageLockedHostMemory3D_long and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc. + In elements + In elements + In elements + - + - Size in elements + For dispose - + - Access array per element. + Dispose - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + For IDisposable + - + - Synchron copy host to 1D Array + Pointer to pinned host memory. - - - + - Synchron copy host to 1D Array + Width in elements - - + - Synchron copy host to 1D Array + Height in elements - - + - Synchron copy host to 1D Array + Depth in elements - - - + - Synchron copy 1D Array to host + Pitch in bytes - - - + - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. - - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D array - - - - - Asynchron copy host to 1D Array - - - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D Array - - in bytes - - - - Asynchron copy 1D Array to host - - - - bytes - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - - - - Asynchron copy 1D Array to host - - - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Asynchron Copy host to device + Asynchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron Copy host to device + Asynchron copy host to 3D Array - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer 
in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_long - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: char3 + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_long1 and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long1). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long1). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - - - Size in bytes - - - - - Size in elements - - - - - Access array per element. - - index in elements - - - - - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - - - Synchron copy host to 1D Array - - - - - + - Synchron copy host to 1D Array + Width in elements - - + - Synchron copy host to 1D Array + Height in elements - - + - Synchron copy host to 1D Array + Depth in elements - - - + - Synchron copy 1D Array to host + Pitch in bytes - - - + - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. 
- - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + - Synchron copy host to device + Synchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Synchron copy host to device + Synchron copy host to 3D Array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Synchron copy device to host + Synchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Synchron copy device to host + Synchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy host to 1D Array + Asynchron Copy host to device - + - in bytes - + - Asynchron copy host to 1D Array + Asynchron Copy host to device - + - + - Asynchron copy host to 1D Array + Asynchron copy device to host - + - + - Asynchron copy host to 1D Array + Asynchron copy device to host - + - in bytes - + - Asynchron copy 1D Array to host + Asynchron copy host to 3D array - bytes - + - Asynchron copy 1D Array to host + Asynchron copy host to 3D Array - + - + - Asynchron copy 1D Array to host + Asynchron copy 3D Array to host - + - + - Asynchron copy 1D Array to host + Asynchron copy 3D Array to host - bytes - + - Asynchron Copy host to device + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag - - + Device Pointer - + - Asynchron Copy host to device + Passes back the flags that were specified when allocating the pinned host buffer - - + - + - Asynchron copy device to host + Enumerator class for CudaPageLockedHostMemory3D_long1 - - - + - Asynchron copy device to host + - - + - + - Asynchron Copy host to device + - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - - Asynchron Copy host to device - - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - - Asynchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - - Asynchron copy device to host - - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag - - Device Pointer - - - - Passes back the flags that were specified when allocating the pinned host buffer - - - - - - Enumerator class for CudaPageLockedHostMemory - - - - - - - - - + - + - + + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: char4 + Creates a new CudaPageLockedHostMemory3D_long2 and allocates the memory on host. Using cuMemHostAlloc + In elements + Width including alignment in bytes + In elements + In elements + - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. 
Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements - + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long2). Using cuMemHostAlloc without flags. - In elements + In elements + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(long2). Using cuMemHostAlloc. - - In elements + In elements + In elements + In elements + - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - - - Size in bytes - - - - - Size in elements - - - - - Access array per element. - - index in elements - - - - - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - - - Synchron copy host to 1D Array - - - - - + - Synchron copy host to 1D Array + Width in elements - - + - Synchron copy host to 1D Array + Height in elements - - + - Synchron copy host to 1D Array + Depth in elements - - - + - Synchron copy 1D Array to host + Pitch in bytes - - - + - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. - - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D array - - - - - Asynchron copy host to 1D Array - - - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D Array - - in bytes - - - - Asynchron copy 1D Array to host - - - - bytes - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - - - - Asynchron copy 1D Array to host - - - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Asynchron Copy host to device + Asynchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron Copy host to device + Asynchron copy host to 3D Array - - Offset to source pointer in bytes 
- Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_long2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ulong and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_ulong and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + - Size in bytes + Width in elements - + - Size in elements + Height in elements - + - Access array per element. + Depth in elements - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Pitch in bytes - + - Synchron copy host to 1D Array + Size in bytes - - - + - Synchron copy host to 1D Array + Type size in bytes - - + - Synchron copy host to 1D Array + Access array per element. 
- + X-index in elements + Y-index in elements + Z-index in elements + - + - Synchron copy host to 1D Array + Synchron copy host to device - - + - + - Synchron copy 1D Array to host + Synchron copy host to device - - + - + - Synchron copy 1D Array to host + Synchron copy device to host + + + + + + Synchron copy device to host + + + + + + Synchron copy host to 3D array - + - Synchron copy 1D Array to host + Synchron copy host to 3D Array - + - Synchron copy 1D Array to host + Synchron copy 3D Array to host - - + - + - Synchron copy host to device + Synchron copy 3D Array to host - + - + - Synchron copy host to device + Asynchron Copy host to device + - + - Synchron copy device to host + Asynchron Copy host to device - + + - + - Synchron copy device to host + Asynchron copy device to host + - + - Synchron copy host to device + Asynchron copy device to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + + - + - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - - - Asynchron copy host to 1D Array + Asynchron copy host to 3D array - - - Asynchron copy host to 1D Array - - - - - + - Asynchron copy host to 1D Array + Asynchron copy host to 3D Array - in bytes - - - - Asynchron copy 1D Array to host - - - - bytes - + - Asynchron copy 1D Array to host + Asynchron copy 3D Array to host - - - Asynchron copy 1D Array to host - - - - - + - Asynchron copy 1D Array to host + Asynchron copy 3D Array to host - bytes - - - - Asynchron Copy host to device - - - - - - - Asynchron Copy host to device - - - - - - - Asynchron copy device to host - - - - - - - Asynchron copy device to host - - - - - - - Asynchron Copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - - Asynchron Copy host to device - - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - - Asynchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - - Asynchron copy device to host - - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_ulong - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short1 + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ulong1 and allocates the memory on host. 
Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong1). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_ulong1 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong1). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - - - Size in bytes - - - - - Size in elements - - - - - Access array per element. - - index in elements - - - - - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - - - Synchron copy host to 1D Array - - - - - + - Synchron copy host to 1D Array + Width in elements - - + - Synchron copy host to 1D Array + Height in elements - - + - Synchron copy host to 1D Array + Depth in elements - - - + - Synchron copy 1D Array to host + Pitch in bytes - - - + - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. 
- - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D array - - - - - Asynchron copy host to 1D Array - - - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D Array - - in bytes - - - - Asynchron copy 1D Array to host - - - - bytes - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - - - - Asynchron copy 1D Array to host - - - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Asynchron Copy host to device + Asynchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron Copy host to device + Asynchron copy host to 3D Array - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_ulong1 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short2 + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_ulong2 and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! 
- hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong2). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_ulong2 and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(ulong2). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + - Size in bytes + Width in elements - + - Size in elements + Height in elements - + - Access array per element. + Depth in elements - index in elements - - + - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + Pitch in bytes - + - Synchron copy host to 1D Array - - - - - - - Synchron copy host to 1D Array - - - - - - Synchron copy host to 1D Array - - - - - - Synchron copy host to 1D Array - - - - - - - Synchron copy 1D Array to host - - - - - - - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. - - + X-index in elements + Y-index in elements + Z-index in elements + - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy host to device - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Synchron copy device to host - - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy - - - - Asynchron copy host to 1D Array - - - - in bytes - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D array - - - - - Asynchron copy host to 1D Array - - - - + - Asynchron copy host to 1D Array + Synchron copy host to 3D Array - - in bytes - - - - Asynchron copy 1D Array to host - - - - bytes - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - - - - Asynchron copy 1D Array to host - - - - + - Asynchron copy 1D Array to host + Synchron copy 3D Array to host - - bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Asynchron Copy host to device + Asynchron copy host to 3D array - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron Copy host to device + Asynchron copy host to 3D Array - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - Pointer to device memory - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + - Asynchron copy device to host + Asynchron copy 3D Array to host - - Offset to source pointer in bytes - Offset to destination pointer in bytes - Bytes to copy + - + Returns the CUdeviceptr 
for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + - Enumerator class for CudaPageLockedHostMemory + Enumerator class for CudaPageLockedHostMemory3D_ulong2 - + - + - + - + - + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: short3 + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc + Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc without flags. - In elements + In elements + Width including alignment in bytes + In elements + In elements - + - Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! - hostPointer won't be freed while disposing. + Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc without flags. - - In elements + In elements + In elements + In elements - + + + Creates a new CudaPageLockedHostMemory3D_float and allocates the memory on host. Using cuMemHostAlloc without flags. + Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc. + + In elements + In elements + In elements + + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - - - Size in bytes - - - - - Size in elements - - - - - Access array per element. - - index in elements - - - - - If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. - - - - - Synchron copy host to 1D Array - - - - - + - Synchron copy host to 1D Array + Width in elements - - + - Synchron copy host to 1D Array + Height in elements - - + - Synchron copy host to 1D Array + Depth in elements - - - + - Synchron copy 1D Array to host + Pitch in bytes - - - + - Synchron copy 1D Array to host + Size in bytes - - + - Synchron copy 1D Array to host + Type size in bytes - - + - Synchron copy 1D Array to host + Access array per element. 
[Elided: a long run of tag-stripped XML-documentation diff that is unreadable as extracted. The hunk regenerates the vendored ManagedCuda IntelliSense XML: the old per-element-type 1D pinned-memory entries (CudaPageLockedHostMemory for short4, ushort, ushort1-4, int, int1-4, uint, ...) are replaced by entries for the new 3D variants CudaPageLockedHostMemory3D_float, _float1 through _float4, _double, _double1, _double2, _cuDoubleComplex, _cuDoubleReal, _cuFloatComplex, _cuFloatReal, and _dim3. Each 3D class documents: four constructors over cuMemHostAlloc (with flags, without flags, and two overloads that assume pitch = width * sizeof(T)) taking width, height, and depth in elements and pitch in bytes; Width, Height, and Depth properties in elements plus Pitch, Size, and type size in bytes; an (x, y, z) per-element indexer; synchronous and asynchronous copies host <-> device and host <-> 3D CUDA array; GetDevicePointer(), which returns the CUdeviceptr of the pinned host buffer mapped into device address space and is only valid if the context was created with the corresponding flag; a getter for the allocation flags; and a nested enumerator class.]
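For review context, the sketch below shows the usage pattern the new 3D entries describe: allocate a pitched, page-locked host buffer, fill it through the (x, y, z) indexer, and issue a stream-ordered copy to the device. This is a minimal sketch, not code from this PR: the CudaPageLockedHostMemory3D_float type and the Synchron*/Asynchron* member naming follow the XML entries above, but the exact overload signatures and the CudaPitchedDeviceVariable<float> device-side counterpart are assumptions.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class Pinned3DCopySketch
{
    static void Main()
    {
        const int width = 64, height = 32, depth = 8;

        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        // Constructor overload that assumes pitch = width * sizeof(float),
        // matching the "Pitch is assumed to be width * sizeof(float)" entries above.
        using (var host = new CudaPageLockedHostMemory3D_float(width, height, depth))
        // Assumed device-side counterpart: a pitched allocation of the same extent.
        using (var dev = new CudaPitchedDeviceVariable<float>(width, height * depth))
        {
            // Per-element access via the documented (x, y, z) indexer.
            for (int z = 0; z < depth; z++)
                for (int y = 0; y < height; y++)
                    for (int x = 0; x < width; x++)
                        host[x, y, z] = x + y * width + z * width * height;

            // Page-locked memory is what makes the copy genuinely asynchronous:
            // it can overlap with kernel execution on the same stream.
            host.AsynchronCopyToDevice(dev, stream.Stream);
            stream.Synchronize();
        }
    }
}

A pageable buffer (a plain float[] with a synchronous copy) would serialize the transfer, which is why these pinned wrappers carry the Asynchron* overloads.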
[Elided: renumbering-only context hunks around the unchanged 1D CudaPageLockedHostMemory entries. The regenerated file is roughly 12,100 lines longer (e.g. @@ -63869,7 +75974,7 @@), and the per-type remarks at these slots shift from "Type: uint"/"Type: uint1".."uint4" to "Type: byte"/"Type: uchar1".."uchar4" as the generated classes are reordered; the documented members themselves (constructors via cuMemHostAlloc and cuMemAllocHost, size and per-element indexer, synchronous and asynchronous 1D-array and device copies, GetDevicePointer, allocation flags, enumerator) are otherwise identical.]
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -65285,7 +77390,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -65294,7 +77399,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -65303,7 +77408,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -65312,7 +77417,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -65320,21 +77425,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -65342,7 +77447,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -65350,21 +77455,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -65372,35 +77477,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -65410,7 +77515,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -65420,7 +77525,7 @@ Bytes to copy - + Asynchron copy device to host @@ -65430,7 +77535,7 @@ Bytes to copy - + Asynchron copy device to host @@ -65440,70 +77545,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: long + Type: sbyte - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -65511,126 +77616,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -65639,7 +77744,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -65648,7 +77753,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -65657,7 +77762,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -65666,7 +77771,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -65674,21 +77779,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -65696,7 +77801,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -65704,21 +77809,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -65726,35 +77831,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -65764,7 +77869,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -65774,7 +77879,7 @@ Bytes to copy - + Asynchron copy device to host @@ -65784,7 +77889,7 @@ Bytes to copy - + Asynchron copy device to host @@ -65794,70 +77899,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: long1 + Type: char1 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -65865,126 +77970,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -65993,7 +78098,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -66002,7 +78107,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -66011,7 +78116,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -66020,7 +78125,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -66028,21 +78133,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -66050,7 +78155,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -66058,21 +78163,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -66080,35 +78185,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -66118,7 +78223,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -66128,7 +78233,7 @@ Bytes to copy - + Asynchron copy device to host @@ -66138,7 +78243,7 @@ Bytes to copy - + Asynchron copy device to host @@ -66148,70 +78253,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: long2 + Type: char2 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -66219,126 +78324,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -66347,7 +78452,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -66356,7 +78461,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -66365,7 +78470,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -66374,7 +78479,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -66382,21 +78487,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -66404,7 +78509,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -66412,21 +78517,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -66434,35 +78539,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -66472,7 +78577,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -66482,7 +78587,7 @@ Bytes to copy - + Asynchron copy device to host @@ -66492,7 +78597,7 @@ Bytes to copy - + Asynchron copy device to host @@ -66502,70 +78607,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ulong + Type: char3 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -66573,126 +78678,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -66701,7 +78806,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -66710,7 +78815,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -66719,7 +78824,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -66728,7 +78833,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -66736,21 +78841,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -66758,7 +78863,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -66766,21 +78871,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -66788,35 +78893,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -66826,7 +78931,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -66836,7 +78941,7 @@ Bytes to copy - + Asynchron copy device to host @@ -66846,7 +78951,7 @@ Bytes to copy - + Asynchron copy device to host @@ -66856,70 +78961,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ulong1 + Type: char4 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -66927,126 +79032,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -67055,7 +79160,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -67064,7 +79169,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -67073,7 +79178,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -67082,7 +79187,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -67090,21 +79195,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -67112,7 +79217,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -67120,21 +79225,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -67142,35 +79247,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -67180,7 +79285,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -67190,7 +79295,7 @@ Bytes to copy - + Asynchron copy device to host @@ -67200,7 +79305,7 @@ Bytes to copy - + Asynchron copy device to host @@ -67210,70 +79315,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: ulong2 + Type: short - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -67281,126 +79386,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -67409,7 +79514,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -67418,7 +79523,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -67427,7 +79532,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -67436,7 +79541,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -67444,21 +79549,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -67466,7 +79571,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -67474,21 +79579,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -67496,35 +79601,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -67534,7 +79639,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -67544,7 +79649,7 @@ Bytes to copy - + Asynchron copy device to host @@ -67554,7 +79659,7 @@ Bytes to copy - + Asynchron copy device to host @@ -67564,70 +79669,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float + Type: short1 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -67635,126 +79740,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -67763,7 +79868,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -67772,7 +79877,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -67781,7 +79886,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -67790,7 +79895,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -67798,21 +79903,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -67820,7 +79925,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -67828,21 +79933,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -67850,35 +79955,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -67888,7 +79993,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -67898,7 +80003,7 @@ Bytes to copy - + Asynchron copy device to host @@ -67908,7 +80013,7 @@ Bytes to copy - + Asynchron copy device to host @@ -67918,70 +80023,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float1 + Type: short2 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -67989,126 +80094,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -68117,7 +80222,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -68126,7 +80231,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -68135,7 +80240,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -68144,7 +80249,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -68152,21 +80257,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -68174,7 +80279,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -68182,21 +80287,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -68204,35 +80309,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -68242,7 +80347,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -68252,7 +80357,7 @@ Bytes to copy - + Asynchron copy device to host @@ -68262,7 +80367,7 @@ Bytes to copy - + Asynchron copy device to host @@ -68272,70 +80377,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float2 + Type: short3 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -68343,126 +80448,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -68471,7 +80576,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -68480,7 +80585,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -68489,7 +80594,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -68498,7 +80603,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -68506,21 +80611,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -68528,7 +80633,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -68536,21 +80641,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -68558,35 +80663,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -68596,7 +80701,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -68606,7 +80711,7 @@ Bytes to copy - + Asynchron copy device to host @@ -68616,7 +80721,7 @@ Bytes to copy - + Asynchron copy device to host @@ -68626,70 +80731,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float3 + Type: short4 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -68697,126 +80802,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -68825,7 +80930,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -68834,7 +80939,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -68843,7 +80948,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -68852,7 +80957,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -68860,21 +80965,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -68882,7 +80987,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -68890,21 +80995,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -68912,35 +81017,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -68950,7 +81055,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -68960,7 +81065,7 @@ Bytes to copy - + Asynchron copy device to host @@ -68970,7 +81075,7 @@ Bytes to copy - + Asynchron copy device to host @@ -68980,70 +81085,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: float4 + Type: ushort - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -69051,126 +81156,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -69179,7 +81284,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -69188,7 +81293,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -69197,7 +81302,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -69206,7 +81311,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -69214,21 +81319,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -69236,7 +81341,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -69244,21 +81349,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -69266,35 +81371,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -69304,7 +81409,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -69314,7 +81419,7 @@ Bytes to copy - + Asynchron copy device to host @@ -69324,7 +81429,7 @@ Bytes to copy - + Asynchron copy device to host @@ -69334,70 +81439,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: double + Type: ushort1 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -69405,126 +81510,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -69533,7 +81638,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -69542,7 +81647,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -69551,7 +81656,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -69560,7 +81665,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -69568,21 +81673,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -69590,7 +81695,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -69598,21 +81703,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -69620,35 +81725,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -69658,7 +81763,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -69668,7 +81773,7 @@ Bytes to copy - + Asynchron copy device to host @@ -69678,7 +81783,7 @@ Bytes to copy - + Asynchron copy device to host @@ -69688,70 +81793,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: double1 + Type: ushort2 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -69759,126 +81864,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -69887,7 +81992,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -69896,7 +82001,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -69905,7 +82010,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -69914,7 +82019,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -69922,21 +82027,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -69944,7 +82049,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -69952,21 +82057,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -69974,35 +82079,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -70012,7 +82117,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -70022,7 +82127,7 @@ Bytes to copy - + Asynchron copy device to host @@ -70032,7 +82137,7 @@ Bytes to copy - + Asynchron copy device to host @@ -70042,70 +82147,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: double2 + Type: ushort3 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -70113,126 +82218,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -70241,7 +82346,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -70250,7 +82355,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -70259,7 +82364,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -70268,7 +82373,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -70276,21 +82381,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -70298,7 +82403,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -70306,21 +82411,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -70328,35 +82433,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -70366,7 +82471,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -70376,7 +82481,7 @@ Bytes to copy - + Asynchron copy device to host @@ -70386,7 +82491,7 @@ Bytes to copy - + Asynchron copy device to host @@ -70396,70 +82501,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuDoubleComplex + Type: ushort4 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -70467,126 +82572,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -70595,7 +82700,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -70604,7 +82709,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -70613,7 +82718,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -70622,7 +82727,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -70630,21 +82735,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -70652,7 +82757,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -70660,21 +82765,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -70682,35 +82787,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -70720,7 +82825,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -70730,7 +82835,7 @@ Bytes to copy - + Asynchron copy device to host @@ -70740,7 +82845,7 @@ Bytes to copy - + Asynchron copy device to host @@ -70750,70 +82855,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuDoubleReal + Type: int - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -70821,126 +82926,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -70949,7 +83054,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -70958,7 +83063,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -70967,7 +83072,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -70976,7 +83081,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -70984,21 +83089,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -71006,7 +83111,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -71014,21 +83119,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -71036,35 +83141,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -71074,7 +83179,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -71084,7 +83189,7 @@ Bytes to copy - + Asynchron copy device to host @@ -71094,7 +83199,7 @@ Bytes to copy - + Asynchron copy device to host @@ -71104,70 +83209,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuFloatComplex + Type: int1 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -71175,126 +83280,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -71303,7 +83408,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -71312,7 +83417,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -71321,7 +83426,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -71330,7 +83435,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -71338,21 +83443,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -71360,7 +83465,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -71368,21 +83473,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -71390,35 +83495,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -71428,7 +83533,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -71438,7 +83543,7 @@ Bytes to copy - + Asynchron copy device to host @@ -71448,7 +83553,7 @@ Bytes to copy - + Asynchron copy device to host @@ -71458,70 +83563,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: cuFloatReal + Type: int2 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -71529,126 +83634,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
- + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Synchron copy host to device @@ -71657,7 +83762,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy host to device @@ -71666,7 +83771,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -71675,7 +83780,7 @@ Offset to destination pointer in bytes Bytes to copy - + Synchron copy device to host @@ -71684,7 +83789,7 @@ Offset to destination pointer in bytes Bytes to copy - + Asynchron copy host to 1D Array @@ -71692,21 +83797,21 @@ in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -71714,7 +83819,7 @@ in bytes - + Asynchron copy 1D Array to host @@ -71722,21 +83827,21 @@ bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -71744,35 +83849,35 @@ bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Asynchron Copy host to device @@ -71782,7 +83887,7 @@ Bytes to copy - + Asynchron Copy host to device @@ -71792,7 +83897,7 @@ Bytes to copy - + Asynchron copy device to host @@ -71802,7 +83907,7 @@ Bytes to copy - + Asynchron copy device to host @@ -71812,70 +83917,70 @@ Bytes to copy - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Passes back the flags that were specified when allocating the pinned host buffer - + Enumerator class for CudaPageLockedHostMemory - + - + - + - + - + - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - Type: dim3 + Type: int3 - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc In elements - + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost In elements - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! hostPointer won't be freed while disposing. @@ -71883,126 +83988,126 @@ In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Access array per element. index in elements - + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
[Condensed: the re-emission continues for element types int4, uint/uint1–uint4, long/long1–long2, ulong, and ulong1, while the per-type CudaRegisteredHostMemory_byte, _uchar1 through _uchar4, _sbyte, _char1 through _char4, and _short doc blocks are deleted. The deleted blocks carried the registration notes: cuMemHostRegister does not work with managed memory (normal C# arrays) but can be used with natively allocated memory (Marshal.AllocHGlobal, or a native dll); the wrapped IntPtr must be aligned to the 4 KB host page size; Register page-locks the range and adds it to the same tracking mechanism as cuMemHostAlloc, so the device can access it with much higher bandwidth than unregistered pageable memory; page-locking excessive amounts of memory reduces what is available to the system for paging and degrades performance, so it is best used sparingly for staging areas of host/device data exchange; and the range must be made pageable again with Unregister, passing the same base address.]
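[Editor's note: the deleted CudaRegisteredHostMemory_* blocks describe the cuMemHostRegister path. Below is a sketch of that (now removed) pattern, reconstructed from the deleted summaries; the class name, Register/Unregister, and CUMemHostRegisterFlags follow those summaries and are assumptions with respect to this build.]

using System;
using System.Runtime.InteropServices;
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class RegisteredMemoryExample
{
    const int PageSize = 4096; // cuMemHostRegister requires 4 KB host-page alignment

    static void Main()
    {
        const int n = 1024;
        int bytes = n * sizeof(float);

        // cuMemHostRegister cannot pin managed memory (normal C# arrays); it needs
        // natively allocated memory. AllocHGlobal gives no alignment guarantee, so
        // over-allocate and round the pointer up to the next page boundary.
        IntPtr raw = Marshal.AllocHGlobal(bytes + PageSize);
        IntPtr aligned = new IntPtr((raw.ToInt64() + PageSize - 1) / PageSize * PageSize);

        using (var ctx = new CudaContext(0))
        {
            var pinned = new CudaRegisteredHostMemory_float(aligned, n);
            pinned.Register(CUMemHostRegisterFlags.None); // page-lock the range
            // ... staging copies to/from the device now run at pinned bandwidth ...
            pinned.Unregister();                          // make the range pageable again
        }
        Marshal.FreeHGlobal(raw); // the wrapper never frees the host pointer
    }
}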
[Condensed: the pattern finishes with the CudaRegisteredHostMemory_short1 through _short3 blocks deleted and page-locked blocks added for ulong2 and float; the hunk is cut off midway through the _short3 constructor summary.]
+ Asynchron copy device to host - must be page size aligned (4KBytes) - In elements + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - For dispose + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Device Pointer - + - Dispose + Passes back the flags that were specified when allocating the pinned host buffer + - + - For IDisposable + Enumerator class for CudaPageLockedHostMemory - - + - Pointer to pinned host memory. + + - + - Size in bytes + - + - Size in elements + - + - Returns register status + - + - Access array per element. + - index in elements - + - Synchron copy host to 1D Array + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: float1 - - - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - + In elements + - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost - + In elements - + - Synchron copy host to 1D Array - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Access array per element. + + index in elements + + + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + + + Asynchron Copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron Copy host to device + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron 
copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron copy device to host + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with + Passes back the flags that were specified when allocating the pinned host buffer - + - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + + + Enumerator class for CudaPageLockedHostMemory - + + + + + + + + + + + + + + + + + + + + + + + + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: short4 + Type: float2 - + - Creates a new CudaRegisteredHostMemory_short4 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - must be page size aligned (4KBytes) In elements + - + + + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + + In elements + + + + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - - - Returns register status - - - + Access array per element. index in elements - + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
+ + + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Asynchron Copy host to device - Device Pointer + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with - - - - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + Asynchron Copy host to device + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: ushort + Asynchron copy device to host + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Creates a new CudaRegisteredHostMemory_ushort from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! 
+ Asynchron copy device to host - must be page size aligned (4KBytes) - In elements + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - For dispose + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Device Pointer - + - Dispose + Passes back the flags that were specified when allocating the pinned host buffer + - + - For IDisposable + Enumerator class for CudaPageLockedHostMemory - - + - Pointer to pinned host memory. + + - + - Size in bytes + - + - Size in elements + - + - Returns register status + - + - Access array per element. + - index in elements - + - Synchron copy host to 1D Array + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: float3 - - - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - + In elements + - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost - + In elements - + - Synchron copy host to 1D Array - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Access array per element. + + index in elements + + + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + + + Asynchron Copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron Copy host to device + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron 
copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron copy device to host + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with + Passes back the flags that were specified when allocating the pinned host buffer - + - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + + + Enumerator class for CudaPageLockedHostMemory - + + + + + + + + + + + + + + + + + + + + + + + + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: ushort1 + Type: float4 - + - Creates a new CudaRegisteredHostMemory_ushort1 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - must be page size aligned (4KBytes) In elements + - + + + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + + In elements + + + + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - - - Returns register status - - - + Access array per element. index in elements - + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
+ + + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Asynchron Copy host to device - Device Pointer + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with - - - - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + Asynchron Copy host to device + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: ushort2 + Asynchron copy device to host + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Creates a new CudaRegisteredHostMemory_ushort2 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! 
+ Asynchron copy device to host - must be page size aligned (4KBytes) - In elements + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - For dispose + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Device Pointer - + - Dispose + Passes back the flags that were specified when allocating the pinned host buffer + - + - For IDisposable + Enumerator class for CudaPageLockedHostMemory - - + - Pointer to pinned host memory. + + - + - Size in bytes + - + - Size in elements + - + - Returns register status + - + - Access array per element. + - index in elements - + - Synchron copy host to 1D Array + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: double - - - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - + In elements + - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost - + In elements - + - Synchron copy host to 1D Array - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Access array per element. + + index in elements + + + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + + + Asynchron Copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron Copy host to device + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron 
copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron copy device to host + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with + Passes back the flags that were specified when allocating the pinned host buffer - + - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + + + Enumerator class for CudaPageLockedHostMemory - + + + + + + + + + + + + + + + + + + + + + + + + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: ushort3 + Type: double1 - + - Creates a new CudaRegisteredHostMemory_ushort3 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - must be page size aligned (4KBytes) In elements + - + + + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + + In elements + + + + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - - - Returns register status - - - + Access array per element. index in elements - + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
+ + + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Asynchron Copy host to device - Device Pointer + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with - - - - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + Asynchron Copy host to device + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: ushort4 + Asynchron copy device to host + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Creates a new CudaRegisteredHostMemory_ushort4 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! 
+ Asynchron copy device to host - must be page size aligned (4KBytes) - In elements + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - For dispose + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Device Pointer - + - Dispose + Passes back the flags that were specified when allocating the pinned host buffer + - + - For IDisposable + Enumerator class for CudaPageLockedHostMemory - - + - Pointer to pinned host memory. + + - + - Size in bytes + - + - Size in elements + - + - Returns register status + - + - Access array per element. + - index in elements - + - Synchron copy host to 1D Array + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: double2 - - - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - + In elements + - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost - + In elements - + - Synchron copy host to 1D Array - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Access array per element. + + index in elements + + + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + + + Asynchron Copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron Copy host to device + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron 
copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron copy device to host + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with + Passes back the flags that were specified when allocating the pinned host buffer - + - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + + + Enumerator class for CudaPageLockedHostMemory - + + + + + + + + + + + + + + + + + + + + + + + + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: int + Type: cuDoubleComplex - + - Creates a new CudaRegisteredHostMemory_int from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - must be page size aligned (4KBytes) In elements + - + + + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + + In elements + + + + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - - - Returns register status - - - + Access array per element. index in elements - + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
+ + + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Asynchron Copy host to device - Device Pointer + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with - - - - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + Asynchron Copy host to device + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: int1 + Asynchron copy device to host + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Creates a new CudaRegisteredHostMemory_int1 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! 
+ Asynchron copy device to host - must be page size aligned (4KBytes) - In elements + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - For dispose + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Device Pointer - + - Dispose + Passes back the flags that were specified when allocating the pinned host buffer + - + - For IDisposable + Enumerator class for CudaPageLockedHostMemory - - + - Pointer to pinned host memory. + + - + - Size in bytes + - + - Size in elements + - + - Returns register status + - + - Access array per element. + - index in elements - + - Synchron copy host to 1D Array + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: cuDoubleReal - - - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - + In elements + - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost - + In elements - + - Synchron copy host to 1D Array - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Access array per element. + + index in elements + + + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + + + Asynchron Copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron Copy host to device + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + 
Asynchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron copy device to host + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with + Passes back the flags that were specified when allocating the pinned host buffer - + - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + + + Enumerator class for CudaPageLockedHostMemory - + + + + + + + + + + + + + + + + + + + + + + + + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: int2 + Type: cuFloatComplex - + - Creates a new CudaRegisteredHostMemory_int2 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - must be page size aligned (4KBytes) In elements + - + + + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost + + In elements + + + + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - - - Returns register status - - - + Access array per element. index in elements - + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. 
+ + + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + - Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Asynchron Copy host to device - Device Pointer + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Page-locks the memory range specified by p and bytesize and maps it - for the device(s) as specified by Flags. This memory range also is added - to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - calls to functions such as . Since the memory can be accessed - directly by the device, it can be read or written with much higher bandwidth - than pageable memory that has not been registered. Page-locking excessive - amounts of memory may degrade system performance, since it reduces the amount - of memory available to the system for paging. As a result, this function is - best used sparingly to register staging areas for data exchange between - host and device. - The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with - - - - - - Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + Asynchron Copy host to device + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. - cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for - natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: int3 + Asynchron copy device to host + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - Creates a new CudaRegisteredHostMemory_int3 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! 
+ Asynchron copy device to host - must be page size aligned (4KBytes) - In elements + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + - + - For dispose + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + Device Pointer - + - Dispose + Passes back the flags that were specified when allocating the pinned host buffer + - + - For IDisposable + Enumerator class for CudaPageLockedHostMemory - - + - Pointer to pinned host memory. + + - + - Size in bytes + - + - Size in elements + - + - Returns register status + - + - Access array per element. + - index in elements - + - Synchron copy host to 1D Array + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + Type: cuFloatReal - - - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemHostAlloc - + In elements + - + - Synchron copy host to 1D Array + Creates a new CudaPageLockedHostMemory and allocates the memory on host. Using cuMemAllocHost - + In elements - + - Synchron copy host to 1D Array - + Creates a new CudaPageLockedHostMemory from an existing IntPtr. IntPtr must point to page locked memory! + hostPointer won't be freed while disposing. + + + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Access array per element. + + index in elements + + + + + If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing. + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + Synchron copy device to host + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + in bytes - + Asynchron copy 1D Array to host - + bytes - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + bytes - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + + + Asynchron Copy host to device + + Pointer to device memory + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + Asynchron Copy host to device + + + Offset to source pointer in bytes + Offset to destination pointer in bytes + Bytes to copy + + + + + 
The same pattern repeats for the next aligned pair: the CudaRegisteredHostMemory_int4 block is replaced by a CudaPageLockedHostMemory block for Type: dim3, again with the cuMemHostAlloc, cuMemAllocHost, and existing-IntPtr constructors. Beyond the synchronous offset overloads, each regenerated page-locked block adds three members the old file lacked: asynchronous byte-offset copies in both directions (same parameter set, queued on a stream), a getter that passes back the flags that were specified when allocating the pinned host buffer, and an enumerator class for CudaPageLockedHostMemory. The pre-existing GetDevicePointer member (returns the CUdeviceptr for pinned host memory mapped into device memory space; only valid if the context was created with the map-host flag) is unchanged apart from cref churn.
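Pinned memory is what makes the asynchronous overloads worthwhile: async copies only overlap with kernel execution when the host side is page-locked. A minimal sketch of the documented members, under the same naming assumptions (AsyncCopyToDevice/AsyncCopyFromDevice, the flags getter, the CudaContext overload taking CUCtxFlags, and GetDevicePointer, which requires the map-host context flag):

using ManagedCuda;
using ManagedCuda.BasicTypes;

class AsyncPinnedExample
{
    static void Main()
    {
        // GetDevicePointer() is only valid when the context maps host memory.
        using var ctx = new CudaContext(0, CUCtxFlags.MapHost);
        using var stream = new CudaStream();
        using var host = new CudaPageLockedHostMemory_float(4096);
        using var device = new CudaDeviceVariable<float>(4096);

        for (int i = 0; i < 4096; i++)
            host[i] = i;                                 // per-element indexer on pinned memory

        host.AsyncCopyToDevice(device, stream.Stream);   // returns immediately, ordered in-stream
        // ... queue kernels on the same stream here ...
        host.AsyncCopyFromDevice(device, stream.Stream); // read results back asynchronously
        stream.Synchronize();                            // block until the stream drains

        CUdeviceptr mapped = host.GetDevicePointer();    // device-space alias of the pinned buffer
    }
}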
From here to the end of the hunk, the diff walks through the CudaRegisteredHostMemory_* blocks. Every block carries identical documentation: the class summary (a variable located in page-locked, pinned host memory for asynchronous memcpy; cuMemHostRegister does not work with managed memory such as normal C# arrays, but does work with natively allocated memory from Marshal.AllocHGlobal or a native DLL), a constructor taking an existing IntPtr that must be 4 KB page-aligned plus a size in elements, Dispose/IDisposable plumbing, the pointer to pinned host memory, size in bytes and in elements, the register status, a per-element indexer, the synchronous and asynchronous copies (host to/from 1D array, host to/from device), GetDevicePointer, a Register method (page-locks the range and adds it to the same tracking mechanism as cuMemHostAlloc, so copies run at pinned-memory bandwidth; page-locking excessive amounts of memory degrades system performance, so it is best used sparingly for staging areas; pointer and size must be aligned to the 4 KB host page size; the range must later be unregistered), and Unregister (unmaps the range and makes it pageable again).

The only substantive text change inside each block is the element type, because the regenerated file inserted new blocks and the diff realigned the old ones; hunk headers such as @@ -78627,21 +93307,21 @@ show the shift. The aligned "Type:" pairs run: uint→byte, uint1→uchar1, uint2→uchar2, uint3→uchar3, uint4→uchar4, long→sbyte, long1→char1, long2→char2, ulong→char3, ulong1→char4, ulong2→short, float→short1, float1→short2, float2→short3, float3→short4, float4→ushort, double→ushort1, double1→ushort2, double2→ushort3, cuDoubleComplex→ushort4, cuDoubleReal→int, cuFloatComplex→int1, cuFloatReal→int2, with the matching CudaRegisteredHostMemory_<type> constructor summaries renamed to match.
index in elements - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy host to 1D Array - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy 1D Array to host - + Synchron copy host to device - + Synchron copy host to device - + Synchron copy device to host - + Synchron copy device to host - + Asynchron copy host to 1D Array @@ -84149,21 +98829,21 @@ - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array - + Asynchron copy host to 1D Array @@ -84171,7 +98851,7 @@ - + Asynchron copy 1D Array to host @@ -84179,21 +98859,21 @@ - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host - + Asynchron copy 1D Array to host @@ -84201,41 +98881,41 @@ - + Asynchron Copy host to device - + Asynchron Copy host to device - + Asynchron copy device to host - + Asynchron copy device to host - + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag Device Pointer - + Page-locks the memory range specified by p and bytesize and maps it for the device(s) as specified by Flags. This memory range also is added @@ -84248,151 +98928,151 @@ best used sparingly to register staging areas for data exchange between host and device. The pointer p and size bytesize must be aligned to the host page size (4 KB). - The memory page-locked by this function must be unregistered with + The memory page-locked by this function must be unregistered with - + Unmaps the memory range whose base address is specified by p, and makes it pageable again. - The base address must be the same one specified to . + The base address must be the same one specified to . - + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for natively allocated memory (Marshal.AllocHGlobal, or a native dll). - Type: dim3 + Type: int3 - + - Creates a new CudaRegisteredHostMemory_dim3 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + Creates a new CudaRegisteredHostMemory_int3 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! must be page size aligned (4KBytes) In elements - + For dispose - + Dispose - + For IDisposable - + Pointer to pinned host memory. - + Size in bytes - + Size in elements - + Returns register status - + Access array per element. 
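Editorial note: the pinned-memory rules in the summaries above (native allocation, 4 KB alignment, mandatory unregister) are easy to get wrong, so here is a minimal C# sketch of that workflow. It is not part of the diff; the constructor shape follows the summaries, while the member and enum names (Register/Unregister, SynchronCopyToDevice, CUMemHostRegisterFlags) are assumptions to verify against the actual assembly.

    // Illustrative sketch only -- not part of the diff. Member and enum names
    // below are assumed from the doc summaries; verify against the assembly.
    using System;
    using System.Runtime.InteropServices;
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    static class RegisteredMemorySketch
    {
        const int PageSize = 4096; // host page size required by cuMemHostRegister

        static void Main()
        {
            using var ctx = new CudaContext(0);
            int n = 1 << 20;

            // cuMemHostRegister needs natively allocated, 4 KB-aligned memory;
            // a managed int[] will not do. Over-allocate and align by hand.
            IntPtr raw = Marshal.AllocHGlobal(n * sizeof(int) + PageSize);
            IntPtr aligned = new IntPtr((raw.ToInt64() + PageSize - 1) & ~(long)(PageSize - 1));

            var pinned = new CudaRegisteredHostMemory_int(aligned, n); // size in elements
            pinned.Register(CUMemHostRegisterFlags.None);              // page-lock the range

            using var dev = new CudaDeviceVariable<int>(n);
            pinned.SynchronCopyToDevice(dev);   // copies from the pinned range

            pinned.Unregister();                // mandatory before freeing the memory
            Marshal.FreeHGlobal(raw);
        }
    }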
[The large hunk @@ -84499,4706 +99179,3759 @@ continues the same generated-docs rewrite. Removed: the documentation blocks for CudaSurfObject (surface objects created from a CudaResourceDesc or from a CudaArray1D/2D/3D; the resource type must be Array and flags must be zero), CudaTexObject (texture objects built from a CudaResourceDesc plus a CudaTextureDescriptor, with an optional CudaResourceViewDesc when the resource is a CUDA array or mipmapped array), the texture-reference binding helpers (overloads binding CudaDeviceVariable, CudaPitchedDeviceVariable, CudaArray1D/2D/3D and CudaMipmappedArray to kernel texture references, including the variants that set an RGBA border color together with CU_TR_ADDRESS_MODE_BORDER), CudaArrayTexture1D/2D/3D, and most of CudaDeviceVariable<T> (constructors from a size in elements, from an existing CUdeviceptr with or without ownership, or from a variable defined in a cu-file; Dispose; the synchronous and asynchronous CopyToDevice/CopyToHost overloads for device pointers, host pointers and arrays, with byte offsets). Added: the doc blocks for CudaRegisteredHostMemory_int4, _uint, _uint1, _uint2 and _uint3, each with the standard member set listed above.]
[Further hunks in the same region. Removed: the remainder of CudaDeviceVariable<T> (Memset, copies between device memory in different contexts, the per-element host indexer, the device-pointer/size/ownership properties, and the host-array conversion helpers plus a null-pointer helper), CudaException (thrown when a CUDA driver API call does not return success, exposing the driver's error name and description), CudaGraphicsInteropResourceCollection (groups wrapped CUgraphicsResources so that map/unmap calls go to the CUDA API once for all resources together), the helper methods for channel count and channel size, and the start of CudaLinearTexture2D. Added: the doc block for CudaRegisteredHostMemory_uint4.]
[Removed: the rest of CudaLinearTexture2D (2D textures bound to linear device memory, including a reset method that rebinds a new device variable of the same type, superseding any previous binding) and most of CudaPitchedDeviceVariable<T> (pitch-aligned 2D device memory: constructors by width/height in elements, optionally packing several host elements into one device type such as four floats into a float4; 2D copy overloads between device, host and pitched memory with explicit source/destination offsets, widths, heights and pitches; and copies that assume the host buffer has no additional line padding). Added: the doc blocks for CudaRegisteredHostMemory_long and _long1.]
[Condensed. Removed entries: the pitched variable's Pitch (bytes), total size (Pitch * Height) and type-size properties; a helper that converts a device variable to a newly allocated host array; CudaStopWatch, which measures via CUDA events the timespan between Start() and Stop() calls, where GetElapsedTime() returns milliseconds and syncs on the stop event, a no-sync variant exists, and the inner start event, stop event and stream are exposed; and the opening of the CudaStream docs: the class wraps a CUstream handle (for the NULL stream, use the native CUstream struct instead) and can be created from an existing handle, with creation flags, or with a priority, where lower numbers represent higher priorities, '0' is the default, and work in a higher-priority stream may preempt work already executing in a lower-priority one. Added entries: CudaRegisteredHostMemory_long2, a variable located in page-locked (pinned) host memory for asynchronous memcpy; cuMemHostRegister doesn't work with managed memory (normal C# arrays), but it does work with natively allocated memory (Marshal.AllocHGlobal, or a native dll).]
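[Sketch (C#): the event-based timing that the removed CudaStopWatch docs describe; Start, Stop and GetElapsedTime are named in the summaries, while the parameterless constructor is an assumption.]

    using System;
    using ManagedCuda;

    class TimingSketch
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            {
                var device = new CudaDeviceVariable<float>(1 << 20);
                var sw = new CudaStopWatch();

                sw.Start();           // records the start event
                device.Memset(0);     // the work being measured
                sw.Stop();            // records the stop event

                // Syncs on the stop event before reading the elapsed time.
                Console.WriteLine("{0} ms", sw.GetElapsedTime());
                device.Dispose();
            }
        }
    }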
[Condensed. Removed CudaStream member docs: Synchronize() waits until the device has completed all operations in the stream, blocking the CPU thread if the context was created with the blocking-sync flag; Query() returns true once all operations have completed; WaitEvent() makes all future work submitted to the stream wait, efficiently on the device, for the most recent record of an event (on the NULL stream this is a barrier for all future work in the context, and it is a functional no-op if the event was never recorded); AddCallback() in its stream and NULL-stream forms runs a host callback exactly once after the currently enqueued items, blocks later work in the stream until it finishes, passes device error codes through to subsequent callbacks, must not make CUDA API calls or synchronize on outstanding device work, executes in undefined order across independent streams, and requires compute capability 1.1 or greater; the priority and flags queries (the flags are a logical OR of the creation flags); cuStreamWaitValue, which blocks stream-ordered work until, by default, (int32_t)(*addr - value) >= 0 holds cyclically (registered memory must be addressed via cuMemHostGetDevicePointer(), managed memory from cuMemAllocManaged is unsupported, support is queryable via CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, and on Windows the device must be in TCC mode; the stricter overload additionally requires compute capability 7.0 or greater); and cuStreamWriteValue, whose write is preceded by a system-wide memory fence, equivalent to a __threadfence_system() scoped to the stream, unless CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER is passed. Added entries: further synchronous/asynchronous registered-host-memory copy docs and a repeat of the cuMemHostRegister contract.]
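[Sketch (C#): tying the stream docs to the asynchronous pinned-memory copies added by this hunk; AsyncCopyToDevice and the CudaStream members follow the summaries, and the exact signatures are assumptions.]

    using System;
    using System.Runtime.InteropServices;
    using ManagedCuda;
    using ManagedCuda.BasicTypes;

    class StreamSketch
    {
        static void Main()
        {
            using (var ctx = new CudaContext(0))
            using (var stream = new CudaStream(CUStreamFlags.NonBlocking))
            {
                const int n = 1024;
                IntPtr host = Marshal.AllocHGlobal(n * sizeof(float));
                var pinned = new CudaRegisteredHostMemory_float(host, n);
                pinned.Register(CUMemHostRegisterFlags.None);

                var device = new CudaDeviceVariable<float>(n);

                // Pinned memory is what makes a truly asynchronous copy possible;
                // the call returns immediately and the copy runs on the stream.
                pinned.AsyncCopyToDevice(device, stream.Stream);

                // ...independent host work could overlap with the copy here...

                stream.Synchronize(); // block until the stream has drained

                pinned.Unregister();
                device.Dispose();
                Marshal.FreeHGlobal(host);
            }
        }
    }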
[Condensed. Removed entries: the second cuStreamWriteValue overload; CudaLinearTexture1D, covering creation of a 1D texture from linear memory that either allocates a new device variable or wraps an existing one, the TextureReference, Flags, AddressMode, Format, Filtermode, Size, ChannelSize, TotalSizeInBytes, NumChannels, Name, Module and CUFunction properties, the bound device variable, and a rebind that supersedes any previous address or array state (the size may differ from the previously bound variable, but the type must be the same); and the CUDA device property list: clock frequency, maximum block and grid dimensions, threads per block, pitch, 32-bit registers and shared memory per block, texture alignment, constant memory, device name, driver version, total global memory, multiprocessor count, warp size (also called SIMD width), concurrent copy-and-execute, kernel runtime limit, integrated device, host-memory mapping, compute mode, the 1D/2D/3D texture and surface limits including layered, cubemap, linear and mipmapped variants, surface alignment, concurrent kernels, ECC, PCI bus/device/domain IDs, TCC driver model, memory clock, bus width, L2 cache size, threads per multiprocessor, async engine count, unified addressing, pitch alignment, major/minor compute capability, stream priorities, global/local L1 caching, shared memory and registers per multiprocessor, managed memory support, and the multi-GPU-board attributes. Added entries: CudaRegisteredHostMemory_ulong, _ulong1 and _ulong2, each with the same pinned-memory member set and cuMemHostRegister contract as above.]
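[Sketch (C#): querying the device properties enumerated above through ManagedCuda's CudaContext helpers; GetDeviceCount, GetDeviceInfo and the property names are taken from the summaries and are assumptions as exact identifiers.]

    using System;
    using ManagedCuda;

    class DeviceInfoSketch
    {
        static void Main()
        {
            int count = CudaContext.GetDeviceCount(); // compute-capable devices
            for (int i = 0; i < count; i++)
            {
                var props = CudaContext.GetDeviceInfo(i);
                Console.WriteLine("{0}: {1}, {2} multiprocessors, {3} bytes global memory",
                    i, props.DeviceName, props.MultiProcessorCount, props.TotalGlobalMemory);
            }
        }
    }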
[Condensed. Removed entries: the remaining device attributes (host/device native atomics, a placeholder attribute not supported on current hardware; the single-to-double precision performance ratio; pageable-memory access without cudaHostRegister; concurrent managed access from CPU and device; compute preemption; host access to registered memory at the same virtual address as the CPU; cuStreamBatchMemOp support, including 64-bit operations and CU_STREAM_WAIT_VALUE_NOR; cooperative kernel launch via cuLaunchCooperativeKernel and its multi-device variant; opt-in shared memory per block; the CU_STREAM_WAIT_VALUE_FLUSH flag and flush-remote-writes MemOp; cudaHostRegister support; pageable-memory access via the host's page tables; and direct host access to managed memory without migration), followed by the Direct3D 9, 10 and 11 interoperability sections: resolving the CUDA device for an adapter (by adapter name from EnumDisplayDevices or IDirect3D9::GetAdapterIdentifier for D3D9, by IDXGIAdapter for D3D10/11), enumerating the CUDA devices behind a D3D device (the call fails if any GPU rendering the device is not CUDA capable), creating an interop context that increments the D3D device's reference count until the context is destroyed, registering D3D resources (vertex/index buffers and ID3D10/ID3D11 buffers map to device pointers; surfaces and texture subresources map to arrays; the primary render target and shared allocations cannot be registered; only 1-, 2- or 4-channel 8/16/32-bit integer or floating-point formats can be shared, and depth/stencil surfaces cannot; registration is high-overhead and should not be called every frame), and retrieving the D3D device behind a context. Also removed: the driver-API preamble: 'C# wrapper for the NVIDIA CUDA Driver API (--> cuda.h)', the wrapped API version, cuInit (must be called with Flags = 0 before any other driver-API function) and cuDriverGetVersion. Added entries: CudaRegisteredHostMemory_float with the member set above.]
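[Sketch (C#): with the managed wrapper, the cuInit/device/context boilerplate summarized above collapses into constructing a CudaContext; the static GetDeviceName helper is an assumption based on the cuDeviceGetName summary.]

    using System;
    using ManagedCuda;

    class BootstrapSketch
    {
        static void Main()
        {
            Console.WriteLine(CudaContext.GetDeviceName(0)); // cuDeviceGetName
            using (var ctx = new CudaContext(0))             // cuInit + context creation
            {
                // memory, kernels and streams are used against this context
            }
        }
    }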
+ + - + - Combines all API calls for device management + Asynchron copy device to host + + - + - Returns in device a device handle given an ordinal in the range [0, -1]. + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag - Returned device handle - Device number to get handle for - CUDA Error Codes: , , , - , , . - Note that this function may also return error codes from previous, asynchronous launches. + Device Pointer - + - Returns in count the number of devices with compute capability greater than or equal to 2.0 that are available for - execution. If there is no such device, returns 0. + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with - Returned number of compute-capable devices - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + - + + + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . + + + - Returns an ASCII string identifying the device dev in the NULL-terminated string pointed to by name. len specifies - the maximum length of the string that may be returned. + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for + natively allocated memory (Marshal.AllocHGlobal, or a native dll). + Type: float1 - Returned identifier string for the device - Maximum length of string to store in name - Device to get identifier string for - CUDA Error Codes: , , , - , , . - Note that this function may also return error codes from previous, asynchronous launches. - + - Return an UUID for the device - Returns 16-octets identifing the device \p dev in the structure pointed by the \p uuid. + Creates a new CudaRegisteredHostMemory_float1 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! - Returned UUID - Device to get identifier string for - + must be page size aligned (4KBytes) + In elements - + - Return an LUID and device node mask for the device. - Return identifying information (\p luid and \p deviceNodeMask) to allow - matching device with graphics APIs. + For dispose - Returned LUID - Returned device node mask - Device to get identifier string for - - - - Returns in major and minor the major and minor revision numbers that define the compute capability of the - device dev. - - Major revision number - Minor revision number - Device handle - CUDA Error Codes: , , , - , , . - Note that this function may also return error codes from previous, asynchronous launches. 
+ + + Dispose + - + - Returns in bytes the total amount of memory available on the device dev in bytes. + For IDisposable - Returned memory available on device in bytes - Device handle - CUDA Error Codes: , , , - , , . - Note that this function may also return error codes from previous, asynchronous launches. + - + - Returns in prop the (basic) properties of device dev. See . + Pointer to pinned host memory. - Returned properties of device - Device to get properties for - CUDA Error Codes: , , , - , , . - Note that this function may also return error codes from previous, asynchronous launches. - + - Returns in pi the integer value of the attribute attrib on device dev. See . + Size in bytes - Returned device attribute value - Device attribute to query - Device handle - CUDA Error Codes: , , , - , , . - Note that this function may also return error codes from previous, asynchronous launches. - + - Returns in device a device handle given a PCI bus ID string. + Size in elements - Returned device handle - String in one of the following forms: - [domain]:[bus]:[device].[function] - [domain]:[bus]:[device] - [bus]:[device].[function] - where domain, bus, device, and function are all hexadecimal values - CUDA Error Codes: , , , - , . - + - Returns an ASCII string identifying the device dev in the NULL-terminated - string pointed to by pciBusId. len specifies the maximum length of the - string that may be returned. + Returns register status - Returned identifier string for the device in the following format - [domain]:[bus]:[device].[function] - where domain, bus, device, and function are all hexadecimal values. - pciBusId should be large enough to store 13 characters including the NULL-terminator. - Maximum length of string to store in name - Device to get identifier string for - CUDA Error Codes: , , , - , . - + - Takes as input a previously allocated event. This event must have been - created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING - flags set. This opaque handle may be copied into other processes and - opened with ::cuIpcOpenEventHandle to allow efficient hardware - synchronization between GPU work in different processes. - - After the event has been been opened in the importing process, - ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and - ::cuEventQuery may be used in either process. Performing operations - on the imported event after the exported event has been freed - with ::cuEventDestroy will result in undefined behavior. - - IPC functionality is restricted to devices with support for unified - addressing on Linux operating systems. + Access array per element. - Pointer to a user allocated CUipcEventHandle in which to return the opaque event handle - Event allocated with ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING flags. - CUDA Error Codes: , , , + index in elements + - + - Opens an interprocess event handle exported from another process with - ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like - a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. - This event must be freed with ::cuEventDestroy. - - Performing operations on the imported event after the exported event has - been freed with ::cuEventDestroy will result in undefined behavior. - - IPC functionality is restricted to devices with support for unified - addressing on Linux operating systems. 
+ Synchron copy host to 1D Array - Returns the imported event - Interprocess handle to open - CUDA Error Codes: , , , + + - + - Takes a pointer to the base of an existing device memory allocation created - with ::cuMemAlloc and exports it for use in another process. This is a - lightweight operation and may be called multiple times on an allocation - without adverse effects. - - If a region of memory is freed with ::cuMemFree and a subsequent call - to ::cuMemAlloc returns memory with the same device address, - ::cuIpcGetMemHandle will return a unique handle for the - new memory. - - IPC functionality is restricted to devices with support for unified - addressing on Linux operating systems. + Synchron copy host to 1D Array - Pointer to user allocated ::CUipcMemHandle to return the handle in. - Base pointer to previously allocated device memory - CUDA Error Codes: , , , + - + - Maps memory exported from another process with ::cuIpcGetMemHandle into - the current device address space. For contexts on different devices - ::cuIpcOpenMemHandle can attempt to enable peer access between the - devices as if the user called ::cuCtxEnablePeerAccess. This behavior is - controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. - ::cuDeviceCanAccessPeer can determine if a mapping is possible. - - Contexts that may open ::CUipcMemHandles are restricted in the following way. - ::CUipcMemHandles from each ::CUdevice in a given process may only be opened - by one ::CUcontext per ::CUdevice per other process. - - Memory returned from ::cuIpcOpenMemHandle must be freed with - ::cuIpcCloseMemHandle. - - Calling ::cuMemFree on an exported memory region before calling - ::cuIpcCloseMemHandle in the importing context will result in undefined - behavior. - - IPC functionality is restricted to devices with support for unified - addressing on Linux operating systems. + Synchron copy host to 1D Array - Returned device pointer - ::CUipcMemHandle to open - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS - CUDA Error Codes: , , - , , + - + - Unmaps memory returnd by ::cuIpcOpenMemHandle. The original allocation - in the exporting process as well as imported mappings in other processes - will be unaffected. - - Any resources used to enable peer access will be freed if this is the - last mapping using them. - - IPC functionality is restricted to devices with support for unified - addressing on Linux operating systems. + Synchron copy host to 1D Array - Device pointer returned by ::cuIpcOpenMemHandle - CUDA Error Codes: , , - , + + - + - Combines all API calls for context management + Synchron copy 1D Array to host + + - + - Creates a new CUDA context and associates it with the calling thread. The flags parameter is described in . The - context is created with a usage count of 1 and the caller of must call or - when done using the context. If a context is already current to the thread, it is supplanted by the newly created context - and may be restored by a subsequent call to . + Synchron copy 1D Array to host - Returned context handle of the new context - Context creation flags. See - Device to create context on - CUDA Error Codes: , , , - , , , , . - Note that this function may also return error codes from previous, asynchronous launches. + - + - Destroys the CUDA context specified by ctx. If the context usage count is not equal to 1, or the context is current - to any CPU thread other than the current one, this function fails. 
Floating contexts (detached from a CPU thread via - ) may be destroyed by this function. + Synchron copy 1D Array to host - Context to destroy - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + - + - Destroys the CUDA context specified by ctx. The context ctx will be destroyed regardless of how many threads it is current to. - It is the responsibility of the calling function to ensure that no API call is issued to ctx while cuCtxDestroy_v2() is executing. - If ctx is current to the calling thread then ctx will also be - popped from the current thread's context stack (as though cuCtxPopCurrent() - were called). If ctx is current to other threads, then ctx will - remain current to those threads, and attempting to access ctx from - those threads will result in the error . + Synchron copy 1D Array to host - Context to destroy - CUDA Error Codes: , , , - , , . - Note that this function may also return error codes from previous, asynchronous launches. + + - + - Increments the usage count of the context and passes back a context handle in pctx that must be passed to - when the application is done with the context. fails if there is no context current to the - thread. Currently, the flags parameter must be . + Synchron copy host to device - Returned context handle of the current context - Context attach flags (must be ) - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + - + - Decrements the usage count of the context ctx, and destroys the context if the usage count goes to 0. The context - must be a handle that was passed back by or , and must be current to the calling thread. + Synchron copy host to device - Context to destroy - CUDA Error Codes: , , , - . - Note that this function may also return error codes from previous, asynchronous launches. + - + - Pushes the given context ctx onto the CPU thread’s stack of current contexts. The specified context becomes the - CPU thread’s current context, so all CUDA functions that operate on the current context are affected. - The previous current context may be made current again by calling or . - The context must be "floating," i.e. not attached to any thread. Contexts are made to float by calling . + Synchron copy device to host - Floating context to attach - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + - + - Pushes the given context ctx onto the CPU thread’s stack of current contexts. The specified context becomes the - CPU thread’s current context, so all CUDA functions that operate on the current context are affected. - The previous current context may be made current again by calling or . - The context must be "floating," i.e. not attached to any thread. Contexts are made to float by calling . + Synchron copy device to host - Floating context to attach - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + - + - Pops the current CUDA context from the CPU thread. The CUDA context must have a usage count of 1. CUDA contexts - have a usage count of 1 upon creation; the usage count may be incremented with and decremented - with . - If successful, passes back the old context handle in pctx. That context may then be made current - to a different CPU thread by calling . - Floating contexts may be destroyed by calling . 
Removed: cuCtxPopCurrent (both variants: pops the current context, which must have a usage count of 1, passes it back so another CPU thread can adopt it, and makes the previously current context current again), cuCtxSetCurrent (binds a context to the calling thread, replacing the top of its stack; passing NULL pops the stack, or is a no-op when the stack is empty), cuCtxGetCurrent (returns the bound context, or NULL if there is none), cuCtxGetDevice (the ordinal of the current context's device), cuCtxSynchronize (blocks until the device has completed all preceding tasks, returning an error if one failed, and blocks the CPU thread when the context was created with the blocking-sync flag), cuCtxGetApiVersion (returns the API version a context was created with, for example 3010 or 3020, which library developers can use to direct callers to a specific version; it may differ from the driver's version), and the start of cuCtxGetCacheConfig (the preferred L1/shared-memory configuration for the current context on devices where both use the same hardware resources; a preference only).
Added: "Asynchron copy host to 1D Array" and "Asynchron copy 1D Array to host" summaries.
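The removed push/pop entries describe the driver's context-stack model. A sketch of handing a context to another thread, assuming ManagedCuda's PushContext/PopContext members map onto cuCtxPushCurrent/cuCtxPopCurrent:

using ManagedCuda;

static class ContextStackDemo
{
    // Assumption: ctx is floating (popped or created on another thread).
    static void UseOnThisThread(CudaContext ctx)
    {
        ctx.PushContext();     // cuCtxPushCurrent: make ctx current here
        try
        {
            ctx.Synchronize(); // driver calls now target ctx
        }
        finally
        {
            ctx.PopContext();  // cuCtxPopCurrent: restore the previous context
        }
    }
}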
Removed: the rest of cuCtxGetCacheConfig (the driver uses the requested configuration if possible but is free to choose another when required to execute functions; devices with fixed L1/shared sizes report the fixed value), cuCtxSetCacheConfig (sets the context-wide cache preference; a per-function preference takes precedence, the no-preference setting makes subsequent launches avoid changing the configuration unless required, the call does nothing on fixed-size devices, and launching with a different preference than the most recent one may insert a device-side synchronization point), cuCtxGetSharedMemConfig (the current shared-memory bank width: natively four-byte or eight-byte banks, or the fixed hardware size on non-configurable devices) and cuCtxSetSharedMemConfig (sets the bank width used by subsequent launches; changing it between launches may insert a device-side synchronization point, and it does not increase shared-memory usage or affect occupancy but can have major effects on performance, trading bandwidth against which access patterns cause bank conflicts; supported settings are the default width, four-byte banks and eight-byte banks), plus the start of cuCtxGetStreamPriorityRange.
Added: "Asynchron copy 1D Array to host" and "Asynchron Copy host to device" summaries.
Removed: the rest of cuCtxGetStreamPriorityRange (lower numbers imply greater priority, the meaningful range is [greatestPriority, leastPriority], out-of-range priorities passed to stream creation are clamped to the nearest bound, NULL may be passed for either output, and both return 0 on devices without stream-priority support), cuCtxGetFlags (the flags of the current context, as for context creation), cuDevicePrimaryCtxRetain (retains, creating if necessary, the device's primary context, without pushing it onto the stack as cuCtxCreate would; creation fails when the compute mode is prohibited, or exclusive with an active non-primary context already on the device; the compute mode can be queried via the device attribute and set with the nvidia-smi tool; the primary context always supports pinned allocations, and other flags can be set via the primary-context flags call) and cuDevicePrimaryCtxRelease (decreases the usage count by 1 and, at 0, destroys the primary context regardless of how many threads it is current to, without popping it from any stack).
Added: "Asynchron copy device to host" summaries, the note "Returns the CUdeviceptr for pinned host memory mapped to device memory space" (only valid if the context was created with the mapped-host flag), and the beginning of the new cuMemHostRegister text: it page-locks an existing host range and adds it to the same tracking mechanism as ::cuMemHostAlloc, so copies to and from it are automatically accelerated and run at much higher bandwidth than unregistered pageable memory, while page-locking excessive amounts of memory reduces the memory available to the system for paging.
The added cuMemHostRegister text continues: as a result, the function is best used sparingly, to register staging areas for data exchange between host and device; the pointer and size must be aligned to the 4 KB host page size, and the range must later be unregistered. Removed on the old side: cuDevicePrimaryCtxSetFlags (overwrites previously set primary-context flags, failing if the primary context is already active; the three LSBs select how the owning OS thread interacts with the OS scheduler while waiting on the GPU, and only one scheduling flag may be set), cuDevicePrimaryCtxGetState (the primary context's flags and whether it is active), cuDevicePrimaryCtxReset (destroys all allocations and resets all state on the primary context; the caller must ensure no other module in the process still uses the device, so releasing is usually preferable, though calling release after a reset remains safe), the module-management overview, cuModuleLoad (loads a cubin or PTX file, as output by nvcc or handwritten, into the current context; resources are not allocated lazily, and the call fails if memory for the module's functions and globals cannot be allocated) and cuModuleLoadData (the same for an in-memory image, a mapped or null-terminated cubin/PTX, passed as a byte[] in place of the original pointer).
Added: the cuMemHostUnregister note (makes the range pageable again; the base address must be the one that was registered) and the CudaRegisteredHostMemory_float2 class: a variable located in page-locked (pinned) host memory, intended for asynchronous memcpy; cuMemHostRegister does not work with managed memory (e.g. normal C# arrays) but does work with natively allocated memory (Marshal.AllocHGlobal, or a native dll); the constructor takes an IntPtr that must be page-size aligned (4 KBytes) and a size in elements, alongside the usual Dispose/IDisposable members.
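The removed module-management entries are what ManagedCuda surfaces as kernel loading. A sketch, assuming a PTX file kernels.ptx with an entry point vecAdd (both names are illustrative):

using ManagedCuda;
using ManagedCuda.VectorTypes;

static class ModuleDemo
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        {
            // cuModuleLoad + cuModuleGetFunction; loading fails if the
            // module's functions and globals cannot be allocated.
            CudaKernel vecAdd = ctx.LoadKernel("kernels.ptx", "vecAdd");
            vecAdd.BlockDimensions = new dim3(256, 1, 1);
            vecAdd.GridDimensions = new dim3(16, 1, 1);

            int n = 16 * 256;
            using (var a = new CudaDeviceVariable<float>(n)) // cuMemAlloc
            using (var b = new CudaDeviceVariable<float>(n))
            using (var c = new CudaDeviceVariable<float>(n))
            {
                vecAdd.Run(a.DevicePointer, b.DevicePointer, c.DevicePointer, n);
            }
        }
    }
}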
Removed: cuModuleLoadDataEx (loads an image with JIT options and per-option values; any outputs are returned through the value array, and values are currently passed as IntPtr and cast to their real types, which might change in the future), cuModuleLoadFatBinary (loads a fat binary, a collection of cubins for the same device code compiled and optimized for different architectures; since CUDA 4.0 these are produced with nvcc's -fatbin option, as described in the nvcc documentation), cuModuleUnload, cuModuleGetFunction (the handle of a named function, or a not-found error if it does not exist), cuModuleGetGlobal (base pointer and size of a named global; either output may be null and is then ignored) and cuModuleGetTexRef (the handle of a named texture reference, which must not be destroyed since it is destroyed with the module).
Added: the shared member summaries of the registered-host-memory wrapper: "Pointer to pinned host memory", "Size in bytes", "Size in elements", "Returns register status", "Access array per element" (indexed in elements) and the first "Synchron copy host to 1D Array" overloads.
Removed: cuModuleGetSurfRef (the handle of a named surface reference) and most of the JIT linker API. cuLinkCreate: on success the caller owns the returned CUlinkState and must eventually destroy it; the device-code machine size (32 or 64 bit) matches the calling application; both linker and compiler options may be specified, compiler options applying to inputs that must be compiled from PTX; the wall-time and log-buffer-size options accumulate data until the state is destroyed, and the option values must remain valid for the state's lifetime when output options are used. cuLinkAddData: adds an input buffer, ownership staying with the caller and no reference retained after the call; it accepts only compiler options (not the wall-time, log-buffer or target options), used when the data must be compiled from PTX, and PTX must be NULL-terminated. cuLinkAddFile: the same for a file path, equivalent to invoking cuLinkAddData on the file's contents. cuLinkComplete: completes the pending link and returns the cubin image for the linked device code, which the link state owns, so it must be loaded (e.g. with ::cuModuleLoadData) before the state is destroyed; the call itself does not destroy the state.
Added: further "Synchron copy host to 1D Array" and "Synchron copy 1D Array to host" overload summaries.
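The removed entries describe the cuLinkCreate, cuLinkAddData/cuLinkAddFile, cuLinkComplete, cuModuleLoadData flow. The sketch below mirrors that flow; the CudaLinker class and the exact member signatures are assumptions, not confirmed ManagedCuda API:

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class JitLinkDemo
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        {
            // Hypothetical wrapper over the driver's JIT linker.
            var linker = new CudaLinker();                     // cuLinkCreate
            linker.AddFile("a.ptx", CUJITInputType.PTX, null); // cuLinkAddFile
            linker.AddFile("b.ptx", CUJITInputType.PTX, null); // cuLinkAddFile
            byte[] cubin = linker.Complete();                  // cuLinkComplete
            // The cubin is owned by the link state, so load it before the
            // state is destroyed.
            ctx.LoadModule(cubin);                             // cuModuleLoadData
        }
    }
}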
Removed: cuLinkDestroy (destroys the state of a JIT linker invocation), the memory-management overview, cuMemGetInfo (the free and total memory available for allocation by the CUDA context, in bytes), cuMemAlloc (linear device memory, suitably aligned for any kind of variable and not cleared; a bytesize of 0 is an error), cuMemAllocPitch (allocates at least WidthInBytes * Height bytes, padding rows so that coalescing alignment holds as the address moves from row to row; ElementSizeBytes must be 4, 8 or 16, the size of the largest reads and writes on the range, and a smaller value than a kernel's actual access size still runs correctly, only possibly slower; given the row and column of an element of type T, its address is T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; pitch allocations are recommended for 2D arrays, especially when performing 2D copies between regions of device memory, and the returned pitch is guaranteed to meet or exceed the alignment required for texture binding), cuMemFree (frees memory returned by either allocator) and the start of cuMemGetAddressRange.
Added: "Synchron copy host to device" and "Synchron copy device to host" summaries.
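The pitched-addressing formula quoted above is plain pointer arithmetic; a self-contained sketch of the row/column computation (names are illustrative):

static class PitchMath
{
    // T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column,
    // expressed on raw addresses: base + row * pitch + col * sizeof(T).
    // Example: element (2, 3) of a float image with a 512-byte pitch lives
    // at base + 2 * 512 + 3 * 4.
    public static long ElementAddress(long baseAddress, long pitchInBytes,
                                      long row, long column, int elementSize)
    {
        return baseAddress + row * pitchInBytes + column * elementSize;
    }
}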
Removed: the rest of cuMemGetAddressRange (base address and size of the allocation containing a pointer; either output may be null and is then ignored), cuMemAllocHost and cuMemFreeHost (page-locked host memory whose virtual ranges the driver tracks to automatically accelerate copy calls; the device can access it directly, so it can be read or written with much higher bandwidth than pageable memory from malloc(), but allocating excessive amounts reduces the memory available to the system for paging, so it is best used sparingly for staging areas) and most of cuMemHostAlloc (the flagged variant: the mapped flag only takes effect in a context created for mapped pinned memory, may be specified on devices without support, and failure is deferred to the device-pointer query because other contexts may map the memory via the portable flag; such memory must be freed with the host free call; all memory from this function is immediately accessible to all unified-addressing contexts, where the device pointer equals the returned host pointer unless the write-combined flag is set, in which case the device pointer must be queried explicitly even under unified addressing).
Added: "Asynchron copy host to 1D Array" summaries.
Removed: the remainder of the cuMemHostAlloc entry, cuMemHostGetDevicePointer (passes back the device pointer for a mapped, pinned host buffer; fails if the mapping flag was not specified when the memory was allocated, or on a GPU without mapped pinned memory; the Flags argument is reserved for future releases and must be 0), cuMemHostGetFlags (the flags of a pinned allocation; fails for pointers not obtained from the pinned allocators), the old cuMemHostRegister/cuMemHostUnregister reference text and the first "Returns information about a pointer" entry (the pointer-attribute query). The hunk header @@ -89211,1388 +102944,9249 @@ marks the second documentation hunk.
Added: "Asynchron copy 1D Array to host", "Asynchron Copy host to device" and "Asynchron copy device to host" overload summaries, the mapped-device-pointer note again, the new cuMemHostRegister/cuMemHostUnregister text (identical in substance to the version quoted above) and the header of the CudaRegisteredHostMemory_float3 class: the same pinned-memory wrapper for Type: float3, with the page-size-aligned (4 KBytes) IntPtr constructor sized in elements.
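Tying the registered-host-memory notes together: the buffer must be natively allocated, aligned to the 4 KB page size, registered before use and unregistered before it is freed. A sketch using the CudaRegisteredHostMemory_float wrapper from this family; the Register and copy member names are read off the summaries above and may not match the exact signatures:

using System;
using System.Runtime.InteropServices;
using ManagedCuda;
using ManagedCuda.BasicTypes;

static class RegisteredHostDemo
{
    static void Main()
    {
        const int n = 1024;                 // elements
        int bytes = n * sizeof(float);
        // Over-allocate and round up to the 4 KB host page size, since
        // cuMemHostRegister requires page-aligned pointers.
        IntPtr raw = Marshal.AllocHGlobal(bytes + 4096);
        IntPtr aligned = new IntPtr((raw.ToInt64() + 4095) & ~4095L);

        using (var ctx = new CudaContext(0))
        using (var dev = new CudaDeviceVariable<float>(n))
        {
            var pinned = new CudaRegisteredHostMemory_float(aligned, n);
            pinned.Register(CUMemHostRegisterFlags.None); // cuMemHostRegister
            pinned.SynchronCopyToDevice(dev);             // pinned-speed copy
            pinned.Unregister();                          // cuMemHostUnregister
        }
        Marshal.FreeHGlobal(raw);
    }
}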
Removed: the remaining "Returns information about a pointer" overloads (one per attribute value type) and cuMemPrefetchAsync: prefetches managed memory to a destination device (or to the CPU via CU_DEVICE_CPU) on a given stream; if no physical memory has been allocated for the region, it is populated and mapped on the destination; when memory is insufficient the Unified Memory driver may evict pages belonging to other regions, and it prefetches less than requested when nothing can be evicted; in the normal case mappings to the previous location are removed and set up only on the destination device, with finer control available through ::cudaMemAdvise; the call is asynchronous with respect to the host and to work on other devices.
Added: the rest of the CudaRegisteredHostMemory_float3 members: the Dispose/IDisposable trio, "Pointer to pinned host memory", "Size in bytes", "Size in elements", "Returns register status" and per-element array access.
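For the removed cuMemPrefetchAsync entry, the usual pattern is to enqueue a prefetch on the same stream as the kernel that will touch the data. A sketch; CudaManagedMemory_float is ManagedCuda's managed-memory wrapper, but its Prefetch member and signature here are assumptions:

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class PrefetchDemo
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        {
            var data = new CudaManagedMemory_float(1 << 20,
                CUmemAttach_flags.Global);
            // Hypothetical member wrapping
            // cuMemPrefetchAsync(devPtr, bytes, dstDevice, hStream):
            // populate and map the region on the GPU before use.
            data.Prefetch(ctx.DeviceId, stream.Stream);
            // ... launch kernels on `stream` that read `data` ...
            stream.Synchronize();
        }
    }
}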
+ + index in elements + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy host to device + + + + + + Synchron copy host to device + + + + + + Synchron copy device to host + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron copy device to host + + + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + + Device Pointer + + + + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with + + + + + + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for + natively allocated memory (Marshal.AllocHGlobal, or a native dll). + Type: float4 + + + + + Creates a new CudaRegisteredHostMemory_float4 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + + must be page size aligned (4KBytes) + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Returns register status + + + + + Access array per element. 
+ + index in elements + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy host to device + + + + + + Synchron copy host to device + + + + + + Synchron copy device to host + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron copy device to host + + + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + + Device Pointer + + + + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with + + + + + + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for + natively allocated memory (Marshal.AllocHGlobal, or a native dll). + Type: double + + + + + Creates a new CudaRegisteredHostMemory_double from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + + must be page size aligned (4KBytes) + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Returns register status + + + + + Access array per element. 
+ + index in elements + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy host to device + + + + + + Synchron copy host to device + + + + + + Synchron copy device to host + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron copy device to host + + + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + + Device Pointer + + + + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with + + + + + + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for + natively allocated memory (Marshal.AllocHGlobal, or a native dll). + Type: double1 + + + + + Creates a new CudaRegisteredHostMemory_double1 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + + must be page size aligned (4KBytes) + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Returns register status + + + + + Access array per element. 
+ + index in elements + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy host to device + + + + + + Synchron copy host to device + + + + + + Synchron copy device to host + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron copy device to host + + + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + + Device Pointer + + + + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with + + + + + + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for + natively allocated memory (Marshal.AllocHGlobal, or a native dll). + Type: double2 + + + + + Creates a new CudaRegisteredHostMemory_double2 from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + + must be page size aligned (4KBytes) + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Returns register status + + + + + Access array per element. 
+ + index in elements + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy host to device + + + + + + Synchron copy host to device + + + + + + Synchron copy device to host + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron copy device to host + + + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + + Device Pointer + + + + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with + + + + + + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for + natively allocated memory (Marshal.AllocHGlobal, or a native dll). + Type: cuDoubleComplex + + + + + Creates a new CudaRegisteredHostMemory_cuDoubleComplex from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + + must be page size aligned (4KBytes) + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Returns register status + + + + + Access array per element. 
+ + index in elements + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + Synchron copy host to 1D Array + + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + Synchron copy 1D Array to host + + + + + + + Synchron copy host to device + + + + + + Synchron copy host to device + + + + + + Synchron copy device to host + + + + + + Synchron copy device to host + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + Asynchron copy host to 1D Array + + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + Asynchron copy 1D Array to host + + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron Copy host to device + + + + + + + Asynchron copy device to host + + + + + + + Asynchron copy device to host + + + + + + + Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag + + Device Pointer + + + + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with + + + + + + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . + + + + + A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy. + cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays). But you can use cuMemHostRegister for + natively allocated memory (Marshal.AllocHGlobal, or a native dll). + Type: cuDoubleReal + + + + + Creates a new CudaRegisteredHostMemory_cuDoubleReal from an existing IntPtr. IntPtr must be page size aligned (4KBytes)! + + must be page size aligned (4KBytes) + In elements + + + + For dispose + + + + + Dispose + + + + + For IDisposable + + + + + + Pointer to pinned host memory. + + + + + Size in bytes + + + + + Size in elements + + + + + Returns register status + + + + + Access array per element. 
+      index in elements
+      Synchronous copy host to 1D array and 1D array to host (overloads with offsets and sizes)
+      Synchronous copy host to device and device to host
+      Asynchronous copy host to 1D array and 1D array to host (stream overloads)
+      Asynchronous copy host to device and device to host
+      Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if the context was created with the corresponding flag. (Device pointer.)
+      Page-locks the memory range specified by p and bytesize and maps it for the device(s) as specified by Flags. This memory range is also added to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate calls to functions such as cuMemcpy. Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than pageable memory that has not been registered. Page-locking excessive amounts of memory may degrade system performance, since it reduces the amount of memory available to the system for paging. As a result, this function is best used sparingly to register staging areas for data exchange between host and device. The pointer p and size bytesize must be aligned to the host page size (4 KB). Memory page-locked by this function must be unregistered afterwards.
+      Unmaps the memory range whose base address is specified by p, and makes it pageable again. The base address must be the same one specified when registering.
+      A variable located in page-locked (pinned) host memory. Use this type of variable for asynchronous memcpy. cuMemHostRegister doesn't work with managed memory (e.g. normal C# arrays), but cuMemHostRegister can be used for natively allocated memory (Marshal.AllocHGlobal, or a native dll). Type: cuFloatComplex
+      Creates a new CudaRegisteredHostMemory_cuFloatComplex from an existing IntPtr. The IntPtr must be page size aligned (4 KBytes)! (Size in elements.)
+      For dispose / Dispose / For IDisposable
+      Pointer to pinned host memory / Size in bytes / Size in elements / Returns register status / Access array per element (index in elements)
+      The same member set (synchronous and asynchronous copies, GetDevicePointer, Register and Unregister, size and status properties, per-element indexer) is documented for each element type; CudaRegisteredHostMemory_cuFloatReal (Type: cuFloatReal) and CudaRegisteredHostMemory_dim3 (Type: dim3) follow the identical pattern.
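The pinned-memory workflow documented above is easiest to see end to end. A minimal sketch, assuming ManagedCuda's generated per-type wrapper CudaRegisteredHostMemory_float and a Register(CUMemHostRegisterFlags) / AsyncCopyToDevice overload as suggested by the docs; the names and signatures used here are assumptions, not verified:

using System;
using System.Runtime.InteropServices;
using ManagedCuda;
using ManagedCuda.BasicTypes;

class PinnedCopySketch
{
    static void Main()
    {
        const int n = 1024;
        var ctx = new CudaContext(0);

        // cuMemHostRegister needs natively allocated, page-aligned memory.
        // AllocHGlobal is used here for brevity; production code must ensure
        // 4 KB alignment (e.g. via VirtualAlloc or over-allocation).
        IntPtr hostPtr = Marshal.AllocHGlobal(n * sizeof(float));

        var pinned = new CudaRegisteredHostMemory_float(hostPtr, n);
        pinned.Register(CUMemHostRegisterFlags.None);    // page-lock the range

        var device = new CudaDeviceVariable<float>(n);
        var stream = new CudaStream();
        pinned.AsyncCopyToDevice(device, stream.Stream); // overlap-capable copy
        stream.Synchronize();

        pinned.Unregister();                             // pageable again
        Marshal.FreeHGlobal(hostPtr);
        device.Dispose();
        stream.Dispose();
        ctx.Dispose();
    }
}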
+      Cuda Surface Object
+      Creates a surface object. ResDesc describes the data to perform surface load/stores on. ResDesc.resType must be an array resource type and ResDesc.hArray must be set to a valid CUDA array handle; for the CudaResourceDesc overload, ResDesc.flags must be set to zero. Overloads accept a CudaResourceDesc, a CudaArray1D, a CudaArray2D or a CudaArray3D.
+      For dispose / Dispose / For IDisposable
+      Returns the wrapped CUsurfObject / Returns the CudaResourceDesc used to create the CudaSurfObject
+      Cuda Texture Object
+      Creates a texture object and returns it in pTexObject. pResDesc describes the data to texture from; pTexDesc describes how the data should be sampled.
+      (CudaResourceDesc; CudaTextureDescriptor.)
+      Creates a texture object. ResDesc describes the data to texture from; TexDesc describes how the data should be sampled. resViewDesc is an optional argument that specifies an alternate format for the data described by pResDesc, and also describes the subresource region to restrict access to when texturing. pResViewDesc can only be specified if the type of resource is a CUDA array or a CUDA mipmapped array. (Describes the data to texture from; describes how the data should be sampled; CudaResourceViewDesc, only valid if the type of resource is a CUDA array or a CUDA mipmapped array.)
+      For dispose / Dispose / For IDisposable
+      Returns the wrapped CUtexObject / Returns the CudaResourceDesc, CudaTextureDescriptor and CudaResourceViewDesc used to create the CudaTexObject
+      Provides methods to bind texture references to kernels
+      Create-and-bind and bind-only overloads exist for each storage type: CudaDeviceVariable, CudaPitchedDeviceVariable, CudaArray1D, CudaArray2D, CudaArray3D and CudaMipmappedArray. Sizes are given in elements; array variants take 1, 2 or 4 channels.
+      Each of these overloads also exists in a variant that sets the border color for the texture reference: it specifies the value of the RGBA color via pBorderColor for the texture reference hTexRef. The color value supports only float type and holds the color components in the following sequence: pBorderColor[0] holds the 'R' component, pBorderColor[1] the 'G' component, pBorderColor[2] the 'B' component and pBorderColor[3] the 'A' component; addressMode is set to CU_TR_ADDRESS_MODE_BORDER. (RGBA color; sizes in elements; 1, 2 or 4 channels.)
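For the object-based API above, creating and destroying a texture object looks roughly as follows. A hedged sketch only: it assumes a CudaResourceDesc constructor taking a CudaArray2D and a CudaTextureDescriptor(addressMode, filterMode, flags) overload, both inferred from the docs above rather than verified:

using ManagedCuda;
using ManagedCuda.BasicTypes;

var ctx = new CudaContext(0);
// 256x256 single-channel float array to sample from.
var array = new CudaArray2D(CUArrayFormat.Float, 256, 256, CudaArray2DNumChannels.One);

var resDesc = new CudaResourceDesc(array);               // data to texture from
var texDesc = new CudaTextureDescriptor(CUAddressMode.Clamp,
    CUFilterMode.Linear, CUTexRefSetFlags.None);         // how it is sampled (argument order assumed)
var tex = new CudaTexObject(resDesc, texDesc);

// The wrapped CUtexObject can be passed to a kernel as a plain argument.
tex.Dispose();
array.Dispose();
ctx.Dispose();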
+      CudaArrayTexture1D
+      Creates a new 1D texture from array memory, allocating a new array (size in elements); a second overload creates the texture from an existing array.
+      For dispose / Dispose / For IDisposable
+      TextureReference / Flags / AddressMode / Format / Filtermode / Size / ChannelSize / TotalSizeInBytes / NumChannels / Name / Module / CUFunction / Array
+      CudaArrayTexture2D
+      Creates a new 2D texture from array memory, allocating a new 2D array (width and height in elements; 1, 2 or 4 channels); further overloads create the texture from an existing array.
+      For dispose / Dispose / For IDisposable
+      TextureReference / Flags / AddressMode (per dimension) / Format / Height / Width / ChannelSize / TotalSizeInBytes / NumChannels / Name / Module / CUFunction / Array
+      CudaArrayTexture3D
+      Creates a new 3D texture from array memory, allocating a new 3D array (width, height and depth in elements; 1, 2 or 4 channels); further overloads create the texture from an existing array.
+      For dispose / Dispose / For IDisposable
+      TextureReference / Flags / AddressMode (per dimension) / Format / Filtermode / Depth / Height / Width / ChannelSize / TotalSizeInBytes / NumChannels / Name / Module / CUFunction / Array
+      A variable located in CUDA device memory (variable base type)
+      Creates a new CudaDeviceVariable and allocates the memory on the device (size in elements)
+      Allocates memory with stream ordered semantics: inserts an allocation operation into hStream. A pointer to the allocated memory is returned immediately in *dptr; the allocation must not be accessed until the allocation operation completes. The allocation comes from the memory pool current to the stream's device. Note: the default memory pool of a device contains device memory from that device. Basic stream ordering allows future work submitted into the same stream to use the allocation; stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation operation completes before work submitted in a separate stream runs. (Size in elements.)
+      Creates a new CudaDeviceVariable from an existing CUdeviceptr; the allocated size is gathered via the CUDA API. One overload never frees devPtr while disposing; another frees the CUdeviceptr while disposing if the CudaDeviceVariable is the owner.
+      Creates a new CudaDeviceVariable from an existing CUdeviceptr with an explicit size in bytes, again with and without ownership of the pointer.
+      Creates a new CudaDeviceVariable from a definition in a cu-file: overloads take the module where the variable is defined, the kernel whose module defines the variable, or the library that defines the variable, together with the variable name as defined in the cu-file.
+      For dispose / Dispose
+      Dispose async: frees memory with stream ordered semantics by inserting a free operation into hStream. The allocation must not be accessed after stream execution reaches the free. After this API returns, accessing the memory from any subsequent work launched on the GPU or querying its pointer attributes results in undefined behavior.
+      For IDisposable
+      Copy data from device to device memory: overloads take a source pointer to device memory, a CudaDeviceVariable or a pitched variable, optionally with source and destination offsets in bytes and a size to copy in bytes; a 2D overload takes a width in bytes and a height in elements.
+      Copy data from host to device memory: overloads take a source pointer to host memory or host data, optionally with source and destination offsets in bytes and a size to copy in bytes.
+      Copy from host to device memory: array elements can be of any (value) type, but the total size in bytes must match the allocated device memory. (Source.)
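To make the copy overloads concrete, here is a short round trip through device memory; this much of the API (CudaDeviceVariable<T> with CopyToDevice/CopyToHost) is standard ManagedCuda usage:

using ManagedCuda;

var ctx = new CudaContext(0);                 // bind a context on device 0
float[] host = new float[256];
for (int i = 0; i < host.Length; i++) host[i] = i;

var device = new CudaDeviceVariable<float>(host.Length); // device allocation
device.CopyToDevice(host);                    // synchronous host -> device
float[] back = new float[host.Length];
device.CopyToHost(back);                      // synchronous device -> host

device.Dispose();                             // frees the device allocation
ctx.Dispose();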
+      Copy data from device to host memory: overloads take a destination pointer to host memory or host data, optionally with source and destination offsets in bytes and a size to copy in bytes. Copy data from this device to host memory: array elements can be of any (value) type, but the total size in bytes must match the allocated device memory. (Destination.)
+      Async copy data from device to device memory: overloads take a source pointer to device memory, a CudaDeviceVariable or a pitched variable, optionally with source and destination offsets in bytes and a size to copy in bytes; 2D overloads take a width in bytes and a height in elements. (Stream.)
+      Memset (8-, 16- and 32-bit overloads, each also available asynchronously)
+      Copies from device memory in one context to device memory in another context (destination context; source pointer to device memory; source context); also available asynchronously.
+      Export data to share a memory pool allocation between processes. Constructs shareData_out for sharing a specific allocation from an already shared memory pool. The recipient process can import the allocation with the ::cuMemPoolImportPointer API. The data is not a handle and may be shared through any IPC mechanism.
+      Access array elements directly from host (index in elements). Each single access invokes a device-to-host or host-to-device copy; access is therefore rather slow (see the sketch below).
+      Device pointer / Size in bytes / Type size in bytes / Size in elements
+      If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
+      Converts a device variable to a host array (newly allocated host array with values from device memory).
+      Converts a device variable to a host value; in case of multiple device values, only the first value is copied.
+      Converts a host array to a newly allocated device variable with values from host memory.
+      Gets a null-pointer equivalent.
+      A CUDA exception is thrown if a CUDA Driver API method call does not return success. Error name as returned by CUDA driver API / Error description as returned by CUDA driver API.
+      Groups several wrapped CUgraphicsResources together, so that the map() call to the CUDA API can be performed efficiently on all resources together.
+      Creates a new CudaGraphicsInteropResourceCollection / For dispose / Returns the number of resources in the collection / Adds a new resource to the collection / Removes all resources in the collection, and disposes every element / Returns true if the given resource is part of the collection / Throws NotImplementedException / Removes a resource from the collection; the resource is not disposed / Dispose / For IDisposable / Returns the ICudaGraphicsInteropResource at the given index
+      Maps all graphics resources for access by CUDA (overloads with and without a stream). The resources may be accessed by CUDA until they are unmapped. The graphics API from which the resource was registered should not access any resources while they are mapped by CUDA; if an application does so, the results are undefined. This function provides the synchronization guarantee that any graphics calls issued before will complete before any subsequent CUDA work issued in stream begins. If any of the resources is presently mapped for access by CUDA, an exception is thrown.
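The per-element indexer mentioned above is convenient but pays a full transfer per access; bulk copies are the intended path. A small illustration using the documented CudaDeviceVariable<T> members:

using ManagedCuda;

var ctx = new CudaContext(0);
var dev = new CudaDeviceVariable<float>(1024);

// Convenient but slow: each indexer access below is its own
// host<->device copy, exactly as the documentation warns.
dev[0] = 42.0f;          // one host -> device transfer
float x = dev[0];        // one device -> host transfer

// Preferred for anything non-trivial: a single bulk transfer.
float[] host = new float[1024];
dev.CopyToHost(host);

dev.Dispose();
ctx.Dispose();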
+      Unmaps all graphics resources. Once unmapped, the resources may not be accessed by CUDA until they are mapped again. This function provides the synchronization guarantee that any CUDA work issued in stream before will complete before any subsequently issued graphics work begins. If any of the resources are not presently mapped for access by CUDA, an exception is thrown.
+      Helper methods used in the wrapper framework
+      Returns the number of channels used in textures depending on the given type (type; number of channels). Returns the channel size of a CUDA array in bytes (channel format; size in bytes).
+      CudaLinearTexture2D
+      Creates a new 2D texture from linear memory, allocating a new device variable (width and height in elements); further overloads create the texture from an existing device variable.
+      For dispose / Dispose / For IDisposable
+      TextureReference / Flags / AddressMode (per dimension) / Format / Height / Width / ChannelSize / TotalSizeInBytes / NumChannels / Name / Module / CUFunction / Device variable in linear memory
+      Binds a linear address range to the texture reference. Any previous address or CUDA array state associated with the texture reference is superseded by this function; any memory previously bound to the texture reference is unbound. The size may differ from the previously bound variable, but the type must be the same. (New device variable to bind this texture reference to.)
+      A variable located in CUDA device memory; the data is aligned following cuMemAllocPitch (variable base type)
+      Creates a new CudaPitchedDeviceVariable and allocates the memory on the device (width and height in elements). An overload additionally groups pack elements as one type, e.g. 4 floats in host code to one float4 in device code.
+      Creates a new CudaPitchedDeviceVariable from an existing CUdeviceptr (width and height in elements, pitch in bytes), with and without ownership of the pointer; if owned, the CUdeviceptr is freed while disposing.
+      For dispose / Dispose / For IDisposable
+      Copy from device to device memory: overloads take a source pointer, device variable or pitched variable (optionally with source pitch); extended overloads specify source X in bytes, source Y, destination X in bytes, destination Y, width in bytes, height in elements, and source and destination pitch.
+      Copy from host to device memory and copy data from device to host memory: overloads take host pointers or host data, optionally with width and height (in bytes or in elements) or the full 2D offset/pitch parameter set; the overloads without explicit pitch assume the host buffer has no additional line padding.
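A quick sketch of the pitched 2D allocation described above, using CudaPitchedDeviceVariable<T> and its padding-free host copy overloads as documented (the exact overload set is an assumption):

using ManagedCuda;

var ctx = new CudaContext(0);
const int width = 640, height = 480;
var img = new CudaPitchedDeviceVariable<float>(width, height); // pitched alloc

float[] host = new float[width * height];  // densely packed host image
img.CopyToDevice(host);                    // wrapper inserts the row padding
// img.Pitch is the row stride in bytes that kernels must use when indexing.

img.CopyToHost(host);                      // strips the padding again
img.Dispose();
ctx.Dispose();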
+      Async copy data from device to device memory: overloads take a source pointer to device memory, a device variable or a pitched variable; the 1D overloads copy destination pitch * height bytes. (Stream.)
+      Memset (8-, 16- and 32-bit overloads, each also available asynchronously)
+      Copies from device memory in one context to device memory in another context (destination context; source pointer to device memory; source context); also available asynchronously.
+      Access array elements directly from host (X- and Y-index in elements). Each single access invokes a device-to-host or host-to-device copy; access is therefore rather slow.
+      Device pointer / Width in elements / Width in bytes / Height in elements / Pitch in bytes / Total size in bytes (Pitch * Height) / Type size in bytes
+      Converts a device variable to a host array (newly allocated host array with values from device memory).
+      Measures via CUDA events the timespan between Start() and Stop() calls.
+      For dispose / Dispose / For IDisposable
+      Start measurement / Stop measurement
+      Get elapsed time in milliseconds, syncing on the stop event; a second variant does not sync on the stop event. (Returns the elapsed time in ms.)
+      Returns the inner start event / Returns the inner stop event / Returns the inner stream
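The event-based stopwatch above is the usual way to time device work without CPU timers. A minimal sketch, with the CudaStopWatch member names as documented (the Memset call is just placeholder work):

using ManagedCuda;

var ctx = new CudaContext(0);
var data = new CudaDeviceVariable<float>(1 << 20);

var sw = new CudaStopWatch();
sw.Start();
data.Memset(0u);                 // the device work being timed
sw.Stop();
float ms = sw.GetElapsedTime();  // milliseconds; syncs on the stop event

sw.Dispose();
data.Dispose();
ctx.Dispose();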
+      Wraps a CUstream handle. In the case of a so-called NULL stream, use the native CUstream struct instead.
+      Creates a new stream; further overloads wrap an existing stream or take stream-creation flags.
+      Creates a new stream with the given priority. This API alters the scheduler priority of work in the stream: work in a higher-priority stream may preempt work already executing in a low-priority stream. Priority follows a convention where lower numbers represent higher priorities; '0' represents default priority. (Stream priority, lower numbers represent higher priorities; optionally, stream-creation flags.)
+      For dispose / Dispose / For IDisposable
+      Returns the wrapped CUstream handle / Returns the unique id associated with the stream handle
+      Waits until the device has completed all operations in the stream. If the context was created with the blocking-sync flag, the CPU thread will block until the stream has finished all of its tasks.
+      Returns true if all operations in the stream have completed, or false if not.
+      Make a compute stream wait on an event: makes all future work submitted to the stream wait until hEvent reports completion before beginning execution. This synchronization will be performed efficiently on the device. The stream will wait only for the completion of the most recent host call to record on hEvent; once this call has returned, any functions (including record and destroy) may be called on hEvent again, and subsequent calls will not have any effect on this stream. If hStream is 0 (the NULL stream), any future work submitted in any stream will wait for hEvent to complete before beginning execution; this effectively creates a barrier for all future work submitted to the context. If record has not been called on hEvent, this call acts as if the record has already completed, and so is a functional no-op.
+      Adds a callback to be called on the host after all currently enqueued items in the stream have completed (a second overload applies to the NULL stream). For each cuStreamAddCallback call, the callback will be executed exactly once; the callback will block later work in the stream until it is finished. The callback may be passed a success status or an error code; in the event of a device error, all subsequently executed callbacks will receive an appropriate error code. Callbacks must not make any CUDA API calls (attempting to do so results in an error), and must not perform any synchronization that may depend on outstanding device work or other callbacks that are not mandated to run earlier. Callbacks without a mandated order (in independent streams) execute in undefined order and may be serialized. This API requires compute capability 1.1 or greater; see cuDeviceGetAttribute or ::cuDeviceGetProperties to query compute capability, as attempting to use this API with earlier compute versions will return an error. (The function to call once preceding stream operations are complete; user-specified data to be passed to the callback function, use GCAlloc to pin a managed object; callback flags, must be CUStreamAddCallbackFlags.None.)
+      Query the priority of this stream (returns the stream's priority).
+      Query the flags of this stream; the value returned in flags is a logical 'OR' of all flags that were used while creating this stream.
+      Returns the device handle of the stream.
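Stream ordering in practice, per the Synchronize/Query docs above. A short sketch; the MemsetAsync overload name is an assumption, while Query and Synchronize are as documented:

using ManagedCuda;

var ctx = new CudaContext(0);
var stream = new CudaStream();
var data = new CudaDeviceVariable<float>(1 << 20);

data.MemsetAsync(0u, stream.Stream);   // enqueue work on the stream
if (!stream.Query())                   // false while work is still pending
{
    stream.Synchronize();              // block until the stream has drained
}

stream.Dispose();
data.Dispose();
ctx.Dispose();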
+      Wait on a memory location: enqueues a synchronization of the stream on the given memory location. Work ordered after the operation will block until the given condition on the memory is satisfied. By default, the condition is to wait for (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal; other condition types can be specified via flags. If the memory was registered via ::cuMemHostRegister(), the device pointer should be obtained with ::cuMemHostGetDevicePointer(). This function cannot be used with managed memory (::cuMemAllocManaged). Support for this can be queried with ::cuDeviceGetAttribute() and ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS; for the 32-bit variant the only requirement for basic support is that on Windows, a device must be in TCC mode, while the 64-bit variant additionally requires compute capability 7.0 or greater. (The memory location to wait on; the value to compare with the memory location; see ::CUstreamWaitValue_flags.)
+      Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER flag is passed, the write is preceded by a system-wide memory fence, equivalent to a __threadfence_system() but scoped to the stream rather than a CUDA thread. If the memory was registered via ::cuMemHostRegister(), the device pointer should be obtained with ::cuMemHostGetDevicePointer(). This function cannot be used with managed memory (::cuMemAllocManaged). Support for this can be queried with ::cuDeviceGetAttribute() and ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS; for the 32-bit variant the only requirement for basic support is that on Windows, a device must be in TCC mode, while the 64-bit variant additionally requires compute capability 7.0 or greater. (The device address to write to; the value to write; see ::CUstreamWriteValue_flags.)
+      Copies attributes from source stream to destination stream; both streams must have the same context. (Destination stream.)
+      Queries stream attribute attr from hStream and stores it in the corresponding member of value_out.
+      Sets stream attribute attr on hStream from the corresponding attribute of value. The updated attribute will be applied to subsequent work submitted to the stream; it will not affect previously submitted work.
+      Query the green context associated with a stream: returns the CUDA green context that the stream is associated with, or NULL if the stream is not associated with any green context. The stream handle hStream can refer to any of the following: a stream created via any of the CUDA driver APIs such as ::cuStreamCreate (if, during stream creation, the context that was active in the calling thread was obtained with cuCtxFromGreenCtx, that green context is returned in phCtx; otherwise *phCtx is set to NULL); or a special stream such as the NULL stream or ::CU_STREAM_LEGACY (in that case, if the context that is active in the calling thread was obtained with cuCtxFromGreenCtx, that green context is returned; otherwise *phCtx is set to NULL). Passing an invalid handle will result in undefined behavior. To query the device associated with the context, the currently bound context must run on the same device as the context to query.
+      CudaLinearTexture1D
+      Creates a new 1D texture from linear memory, allocating a new device variable (size in elements); a second overload creates the texture from an existing device variable.
+      For dispose / Dispose / For IDisposable
+      TextureReference / Flags / AddressMode / Format / Filtermode / Size / ChannelSize / TotalSizeInBytes / NumChannels / Name / Module / CUFunction / Device variable in linear memory
+      Binds a linear address range to the texture reference. Any previous address or CUDA array state associated with the texture reference is superseded by this function; any memory previously bound to the texture reference is unbound. The size may differ from the previously bound variable, but the type must be the same. (New device variable to bind this texture reference to.)
+      CUDA device properties
+      Typical clock frequency in kilohertz / Maximum block dimensions / Maximum grid dimensions / Maximum number of threads per block / Maximum pitch in bytes allowed by memory copies / Maximum number of 32-bit registers available per block / Maximum shared memory available per block in bytes / Alignment requirement for textures / Memory available on device for __constant__ variables in a CUDA C kernel in bytes / Name of the device / Driver version / Total amount of global memory on the device / Number of multiprocessors on device / Warp size in threads (also called SIMDWidth)
+      Device can possibly copy memory and execute a kernel concurrently / Specifies whether there is a run-time limit on kernels / Device is integrated with host memory / Device can map host memory into CUDA address space / Compute mode (see CUComputeMode for details)
+      Maximum 1D texture width / Maximum 2D texture width and height / Maximum 3D texture width, height and depth / Maximum texture array width, height and slices / Alignment requirement for surfaces / Device can possibly execute multiple kernels concurrently / Device has ECC support enabled / PCI bus ID of the device / PCI device ID of the device / Device is using TCC driver model
+      Peak memory clock frequency in kilohertz / Global memory bus width in bits / Size of L2 cache in bytes / Maximum resident threads per multiprocessor / Number of asynchronous engines / Device shares a unified address space with the host
+      Maximum 1D layered texture width and maximum layers in a 1D layered texture / PCI domain ID of the device / Pitch alignment requirement for textures / Maximum cubemap texture width/height, cubemap layered texture width/height and layers in a cubemap layered texture
+      Maximum 1D, 2D and 3D surface dimensions / Maximum layered surface dimensions and layer counts (1D, 2D and cubemap) / Maximum 1D linear texture width / Maximum 2D linear texture width, height and pitch in bytes / Maximum mipmapped 2D texture width and height / Maximum mipmapped 1D texture width
+      Major and minor compute capability version numbers / Compute capability version number / Device supports stream priorities / Device supports caching globals in L1 / Device supports caching locals in L1 / Maximum shared memory available per multiprocessor in bytes / Maximum number of 32-bit registers available per multiprocessor
+      Device can allocate managed memory on this system / Device is on a multi-GPU board / Unique id for a group of devices on the same multi-GPU board / Link between the device and the host supports native atomic operations (a placeholder attribute, not supported on any current hardware) / Ratio of single precision performance (in floating-point operations per second) to double precision performance
+      Device supports coherently accessing pageable memory without calling cudaHostRegister on it / Device can coherently access managed memory concurrently with the CPU / Device supports compute preemption / Device can access host registered memory at the same virtual address as the CPU
+      cuStreamBatchMemOp and related APIs are supported / 64-bit operations are supported in ::cuStreamBatchMemOp and related MemOp APIs / ::CU_STREAM_WAIT_VALUE_NOR is supported by MemOp APIs
+      Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel / Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice / Maximum opt-in shared memory per block / Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device (see CUDA_MEMOP for additional details)
+      Device supports host memory registration via ::cudaHostRegister / Device accesses pageable memory via the host's page tables / The host can directly access managed memory on the device without migration
+      Device supports virtual address management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs / Device supports exporting memory to a posix file descriptor, a Win32 NT handle or a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+      Maximum number of blocks per multiprocessor / Device supports compression of memory / Device's maximum L2 persisting lines capacity setting in bytes / The maximum value of CUaccessPolicyWindow::num_bytes / Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate / Shared memory reserved by CUDA driver per block in bytes / Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays / Device supports using the ::cuMemHostRegister flag CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
+      Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) / The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum / GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute (see ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here)
+      Handle types supported with mempool based IPC / Indicates device supports cluster launch / Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays / Device supports buffer sharing with the dma_buf mechanism / Device supports IPC events / Number of memory domains the device supports / Device supports accessing memory using Tensor Map / Device supports exporting memory to a fabric handle with cuMemExportToShareableHandle() or requested with cuMemCreate() / Device supports unified function pointers / Device supports switch multicast and reduction operations / Indicates if contexts created on this device will be shared via MPS
+      NUMA ID of the host node closest to the device; returns -1 when the system does not support NUMA / Device supports CIG with D3D12 / The returned value shall be interpreted as a bitmask, where the individual bits are described by the ::CUmemDecompressAlgorithm enum / The returned value is the maximum length in bytes of a single decompress operation that is allowed / The combined 16-bit PCI device ID and 16-bit PCI vendor ID / The combined 16-bit PCI subsystem ID and 16-bit PCI subsystem vendor ID / Device supports HOST_NUMA location IPC between nodes in a multi-node system
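With this many attributes, the usual entry point is a single properties query. A sketch; CudaContext.GetDeviceInfo and the property names below are assumptions based on the attribute list, not verified identifiers:

using System;
using ManagedCuda;

var props = CudaContext.GetDeviceInfo(0);   // query device 0 without a context
Console.WriteLine($"Device: {props.DeviceName}");
Console.WriteLine($"Multiprocessors: {props.MultiProcessorCount}");
Console.WriteLine($"Warp size: {props.WarpSize}");
Console.WriteLine($"Global memory: {props.TotalGlobalMemory} bytes");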
+      Direct3D 9 Interoperability
+      Direct3D9 Interoperability for CUDA 3.x
+      Returns in pCudaDevice the CUDA-compatible device corresponding to the adapter name pszAdapterName obtained from EnumDisplayDevices() or IDirect3D9::GetAdapterIdentifier(). If no device on the adapter with name pszAdapterName is CUDA-compatible, the call will fail. (Returned CUDA device corresponding to pszAdapterName; adapter name to query for device.) Note that this function may also return error codes from previous, asynchronous launches.
+      Gets the CUDA devices corresponding to a Direct3D 9 device: returns in pCudaDeviceCount the number of CUDA-compatible devices corresponding to the Direct3D 9 device pD3D9Device, and returns in pCudaDevices at most cudaDeviceCount of the CUDA-compatible devices corresponding to it. If any of the GPUs being used to render pDevice are not CUDA capable, the call will return an error. (Returned number of CUDA devices corresponding to pD3D9Device; returned CUDA devices corresponding to pD3D9Device; the size of the output device array pCudaDevices; Direct3D 9 device to query for CUDA devices; the set of devices to return.) Note that this function may also return error codes from previous, asynchronous launches.
+      Creates a new CUDA context, enables interoperability for that context with the Direct3D device pD3DDevice, and associates the created CUDA context with the calling thread. The created context will be returned in pCtx. Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. If pCudaDevice is non-NULL, the device on which this CUDA context was created will be returned in pCudaDevice. On success, this call will increase the internal reference count on pD3DDevice; this reference count will be decremented upon destruction of this context. This context will cease to function if pD3DDevice is destroyed or encounters an error.
+ + Returned newly created CUDA context + Returned pointer to the device on which the context was created + Context creation flags (see for details) + Direct3D device to create interoperability context with + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Creates a new CUDA context, enables interoperability for that context with the Direct3D device pD3DDevice, and + associates the created CUDA context with the calling thread. The created will be returned in pCtx. + Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. + On success, this call will increase the internal reference count on pD3DDevice. This reference count will be decremented + upon destruction of this context through . This context will cease to function if pD3DDevice + is destroyed or encounters an error. + + Returned newly created CUDA context + Context creation flags (see for details) + Direct3D device to create interoperability context with + Returned pointer to the device on which the context was created + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Registers the Direct3D 9 resource pD3DResource for access by CUDA and returns a CUDA handle to + pD3Dresource in pCudaResource. The handle returned in pCudaResource may be used to map and + unmap this resource until it is unregistered. On success this call will increase the internal reference count on + pD3DResource. This reference count will be decremented when this resource is unregistered through . + This call is potentially high-overhead and should not be called every frame in interactive applications. + The type of pD3DResource must be one of the following: + + Type of pD3DResourceRestriction + IDirect3DVertexBuffer9 + May be accessed through a device pointer. + + IDirect3DIndexBuffer9 + May be accessed through a device pointer. + + IDirect3DSurface9 + May be accessed through an array. Only stand-alone objects of type IDirect3DSurface9 + may be explicitly shared. In particular, individual mipmap levels and faces of cube maps may not be registered + directly. To access individual surfaces associated with a texture, one must register the base texture object. + + IDirect3DBaseTexture9 + Individual surfaces on this texture may be accessed through an array. + + + The Flags argument may be used to specify additional parameters at register time. The only valid value for this + parameter is . + Not all Direct3D resources of the above types may be used for interoperability with CUDA. The following are some + limitations. + • The primary rendertarget may not be registered with CUDA. + • Resources allocated as shared may not be registered with CUDA. + • Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, or 32-bit integer or floating-point data + cannot be shared. + • Surfaces of depth or stencil formats cannot be shared. + If Direct3D interoperability is not initialized for this context using then + is returned. If pD3DResource is of incorrect type or is already registered then + is returned. If pD3DResource cannot be registered then + is returned. If Flags is not one of the above specified value then + is returned. + + Returned graphics resource handle + Direct3D resource to register + Parameters for resource registration + CUDA Error Codes: , , , + , , , , . 
+ Note that this function may also return error codes from previous, asynchronous launches. + + + + Returns in ppD3DDevice the Direct3D device against which this CUDA context + was created in . + + Returned Direct3D device corresponding to CUDA context + CUDA Error Codes: , , , + . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Direct3D 10 Interoperability + + + + + Direct3D10 Interoperability for CUDA 3.x + + + + + Returns in device the CUDA-compatible device corresponding to the adapter pAdapter obtained from + IDXGIFactory::EnumAdapters. This call will succeed only if a device on adapter pAdapter is CUDA-compatible. + + Returned CUDA device corresponding to pszAdapterName + Adapter (type: IDXGIAdapter) + CUDA Error Codes: , , , + , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Gets the CUDA devices corresponding to a Direct3D 10 device + Returns in pCudaDeviceCount the number of CUDA-compatible devices corresponding + to the Direct3D 10 device pD3D10Device. + Also returns in pCudaDevices at most cudaDeviceCount of the CUDA-compatible devices + corresponding to the Direct3D 10 device pD3D10Device. + + If any of the GPUs being used to render pDevice are not CUDA capable then the + call will return . + + Returned number of CUDA devices corresponding to pD3D10Device + Returned CUDA devices corresponding to pD3D10Device + The size of the output device array pCudaDevices + Direct3D 10 device to query for CUDA devices + The set of devices to return. + CUDA Error Codes: , , , + , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Creates a new CUDA context, enables interoperability for that context with the Direct3D device pD3DDevice, and + associates the created CUDA context with the calling thread. The created will be returned in pCtx. + Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. + If pCudaDevice is non-NULL then the on which this CUDA context was created will be returned in + pCudaDevice. + On success, this call will increase the internal reference count on pD3DDevice. This reference count will be decremented + upon destruction of this context through . This context will cease to function if pD3DDevice + is destroyed or encounters an error.
+ + Returned newly created CUDA context + Context creation flags (see for details) + Direct3D device to create interoperability context with + Returned pointer to the device on which the context was created + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Registers the Direct3D 10 resource pD3DResource for access by CUDA and returns a CUDA handle to + pD3Dresource in pCudaResource. The handle returned in pCudaResource may be used to map and + unmap this resource until it is unregistered. On success this call will increase the internal reference count on + pD3DResource. This reference count will be decremented when this resource is unregistered through . + This call is potentially high-overhead and should not be called every frame in interactive applications. + The type of pD3DResource must be one of the following: + + Type of pD3DResourceRestriction + ID3D10Buffer + May be accessed through a device pointer. + + ID3D10Texture1D + Individual subresources of the texture may be accessed via arrays. + + ID3D10Texture2D + Individual subresources of the texture may be accessed via arrays. + + ID3D10Texture3D + Individual subresources of the texture may be accessed via arrays. + + + The Flags argument may be used to specify additional parameters at register time. The only valid value for this + parameter is . + Not all Direct3D resources of the above types may be used for interoperability with CUDA. The following are some + limitations. + • The primary rendertarget may not be registered with CUDA. + • Resources allocated as shared may not be registered with CUDA. + • Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, or 32-bit integer or floating-point data + cannot be shared. + • Surfaces of depth or stencil formats cannot be shared. + If Direct3D interoperability is not initialized for this context using then + is returned. If pD3DResource is of incorrect type or is already registered then + is returned. If pD3DResource cannot be registered then + is returned. If Flags is not one of the above specified values then + is returned. + + Returned graphics resource handle + Direct3D resource to register + Parameters for resource registration + CUDA Error Codes: , , , + , , , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Returns in ppD3DDevice the Direct3D device against which this CUDA context + was created in . + + Returned Direct3D device corresponding to CUDA context + CUDA Error Codes: , , , + . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Direct3D 11 Interoperability for CUDA 3.x + + + + + Returns in device the CUDA-compatible device corresponding to the adapter pAdapter obtained from + IDXGIFactory::EnumAdapters. This call will succeed only if a device on adapter pAdapter is CUDA-compatible. + + Returned CUDA device corresponding to pszAdapterName + Adapter (type: IDXGIAdapter) + CUDA Error Codes: , , , + , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Gets the CUDA devices corresponding to a Direct3D 11 device + Returns in pCudaDeviceCount the number of CUDA-compatible devices corresponding + to the Direct3D 11 device pD3D11Device. + Also returns in pCudaDevices at most cudaDeviceCount of the CUDA-compatible devices + corresponding to the Direct3D 11 device pD3D11Device.
+ + If any of the GPUs being used to render pDevice are not CUDA capable then the + call will return . + + Returned number of CUDA devices corresponding to pD3D11Device + Returned CUDA devices corresponding to pD3D11Device + The size of the output device array pCudaDevices + Direct3D 11 device to query for CUDA devices + The set of devices to return. + CUDA Error Codes: , , , + , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Creates a new CUDA context, enables interoperability for that context with the Direct3D device pD3DDevice, and + associates the created CUDA context with the calling thread. The created will be returned in pCtx. + Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. + If pCudaDevice is non-NULL then the on which this CUDA context was created will be returned in + pCudaDevice. + On success, this call will increase the internal reference count on pD3DDevice. This reference count will be decremented + upon destruction of this context through . This context will cease to function if pD3DDevice + is destroyed or encounters an error. + + Returned newly created CUDA context + Returned pointer to the device on which the context was created + Context creation flags (see for details) + Direct3D device to create interoperability context with + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + - Advise about the usage of a given memory range - Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + Creates a new CUDA context, enables interoperability for that context with the Direct3D device pD3DDevice, and + associates the created CUDA context with the calling thread. The created will be returned in pCtx. + Direct3D resources from this device may be registered and mapped through the lifetime of this CUDA context. + On success, this call will increase the internal reference count on pD3DDevice. This reference count will be decremented + upon destruction of this context through . This context will cease to function if pD3DDevice + is destroyed or encounters an error. + + Returned newly created CUDA context + Context creation flags (see for details) + Direct3D device to create interoperability context with + Returned pointer to the device on which the context was created + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Registers the Direct3D 11 resource pD3DResource for access by CUDA and returns a CUDA handle to + pD3Dresource in pCudaResource. The handle returned in pCudaResource may be used to map and + unmap this resource until it is unregistered. On success this call will increase the internal reference count on + pD3DResource. This reference count will be decremented when this resource is unregistered through . + This call is potentially high-overhead and should not be called every frame in interactive applications. + The type of pD3DResource must be one of the following: + + Type of pD3DResourceRestriction + ID3D11Buffer + May be accessed through a device pointer. + + ID3D11Texture1D + Individual subresources of the texture may be accessed via arrays. + + ID3D11Texture2D + Individual subresources of the texture may be accessed via arrays. + + ID3D11Texture3D + Individual subresources of the texture may be accessed via arrays.
+ + + The Flags argument may be used to specify additional parameters at register time. The only valid value for this + parameter is . + Not all Direct3D resources of the above types may be used for interoperability with CUDA. The following are some + limitations. + • The primary rendertarget may not be registered with CUDA. + • Resources allocated as shared may not be registered with CUDA. + • Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, or 32-bit integer or floating-point data + cannot be shared. + • Surfaces of depth or stencil formats cannot be shared. + If Direct3D interoperability is not initialized for this context using then + is returned. If pD3DResource is of incorrect type or is already registered then + is returned. If pD3DResource cannot be registered then + is returned. If Flags is not one of the above specified values then + is returned. + + Returned graphics resource handle + Direct3D resource to register + Parameters for resource registration + CUDA Error Codes: , , , + , , , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Returns in ppD3DDevice the Direct3D device against which this CUDA context + was created in . + + Returned Direct3D device corresponding to CUDA context + CUDA Error Codes: , , , + . + Note that this function may also return error codes from previous, asynchronous launches. + + + + C# wrapper for the NVIDIA CUDA Driver API (--> cuda.h) + + + + + Gives the version of the wrapped api + + + + + Initializes the driver API and must be called before any other function from the driver API. Currently, + the Flags parameter must be . If has not been called, any function from the driver API will return + . + + Before any call to the CUDA Driver API can be done, the API must be initialized with cuInit(0). + Currently, Flags must always be . + CUDA Error Codes: , , .Note that this function may also return error codes from previous, asynchronous launches. + + + + Returns in driverVersion the version number of the installed CUDA driver. This function automatically returns + if the driverVersion argument is NULL. + + Returns the CUDA driver version + CUDA Error Codes: , .Note that this function may also return error codes from previous, asynchronous launches. + + + + Combines all API calls for device management + + + + + Returns in device a device handle given an ordinal in the range [0, cuDeviceGetCount()-1]. + + Returned device handle + Device number to get handle for + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Returns in count the number of devices with compute capability greater than or equal to 2.0 that are available for + execution. If there is no such device, returns 0. + + Returned number of compute-capable devices + CUDA Error Codes: , , , + , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Returns an ASCII string identifying the device dev in the NULL-terminated string pointed to by name. len specifies + the maximum length of the string that may be returned. + + Returned identifier string for the device + Maximum length of string to store in name + Device to get identifier string for + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Return an UUID for the device + Returns 16-octets identifying the device \p dev in the structure pointed by the \p uuid.
+ + Returned UUID + Device to get identifier string for + + + + + Return an UUID for the device (11.4+) + Returns 16-octets identifying the device \p dev in the structure + pointed by the \p uuid.If the device is in MIG mode, returns its + MIG UUID which uniquely identifies the subscribed MIG compute instance. + Returns 16-octets identifying the device \p dev in the structure pointed by the \p uuid. + + Returned UUID + Device to get identifier string for + + + + + Return an LUID and device node mask for the device. + Return identifying information (\p luid and \p deviceNodeMask) to allow + matching device with graphics APIs. + + Returned LUID + Returned device node mask + Device to get identifier string for + + + + + Returns in bytes the total amount of memory available on the device dev in bytes. + + Returned memory available on device in bytes + Device handle + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. + Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture + for given \p format and \p numChannels. + + Returned maximum number of texture elements allocatable for given \p format and \p numChannels. + Texture format. + Number of channels per texture element. + Device handle. + + + + + Returns in pi the integer value of the attribute attrib on device dev. See . + + Returned device attribute value + Device attribute to query + Device handle + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. + + + + Return NvSciSync attributes that this device can support. + Returns in \p nvSciSyncAttrList, the properties of NvSciSync that + this CUDA device, \p dev can support.The returned \p nvSciSyncAttrList + can be used to create an NvSciSync object that matches this device’s capabilities. - The \p advice parameter can take the following values: - - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read - from and only occasionally written to. This allows the driver to create read-only - copies of the data in a processor's memory when that processor accesses it. Similarly, - if cuMemPrefetchAsync is called on this region, it will create a read-only copy of - the data on the destination processor. When a processor writes to this data, all copies - of the corresponding page are invalidated except for the one where the write occurred. - The \p device argument is ignored for this advice. - - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read - duplicated copies of the data will be freed no later than the next write access to that data. - - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the - data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the - preferred location as CPU memory. Setting the preferred location does not cause data to - migrate to that location immediately. Instead, it guides the migration policy when a fault - occurs on that memory region. If the data is already in its preferred location and the - faulting processor can establish a mapping without requiring the data to be migrated, then - the migration will be avoided.
On the other hand, if the data is not in its preferred location - or if a direct mapping cannot be established, then it will be migrated to the processor accessing - it. It is important to note that setting the preferred location does not prevent data prefetching - done using ::cuMemPrefetchAsync. - Having a preferred location can override the thrash detection and resolution logic in the Unified - Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU - memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But - if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. - When the Unified Memory driver has to evict pages from a certain location on account of that - memory being oversubscribed, the preferred location will be used to decide the destination to which - a page should be evicted to. - If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred - location will be ignored for that subset. - - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION - and changes the preferred location to none. - - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. - This does not cause data migration and has no impact on the location of the data per se. Instead, - it causes the data to always be mapped in the specified processor's page tables, as long as the - location of the data permits a mapping to be established. If the data gets migrated for any reason, - the mappings are updated accordingly. - This advice is useful in scenarios where data locality is not important, but avoiding faults is. - Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the - data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data - over to the other GPUs is not as important because the accesses are infrequent and the overhead of - migration may be too high. But preventing faults can still help improve performance, and so having - a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated - to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the - ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the - page in CPU memory. - - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of - mappings may be removed at any time causing accesses to result in page faults. + If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is + already set this API will return ::CUDA_ERROR_INVALID_VALUE. - Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + The application should set \p nvSciSyncAttrList to a valid + NvSciSyncAttrList failing which this API will return + ::CUDA_ERROR_INVALID_HANDLE. - Note that this function is asynchronous with respect to the host and all work - on other devices. + The \p flags controls how the application intends to use + the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: + - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the application intends to + signal an NvSciSync on this CUDA device. + - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the application intends to + wait on an NvSciSync on this CUDA device.
+ + At least one of these flags must be set, failing which the API + returns::CUDA_ERROR_INVALID_VALUE.Both the flags are orthogonal + to one another: a developer may set both these flags that allows to + set both wait and signal specific attributes in the same \p nvSciSyncAttrList. - Pointer to memory to set the advice for - Size in bytes of the memory range - Advice to be applied for the specified memory range - Device to apply the advice for + Return NvSciSync attributes supported + Valid Cuda Device to get NvSciSync attributes for. + flags describing NvSciSync usage. + + + + Sets the current memory pool of a device + The memory pool must be local to the specified device. + ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. + By default, a device's current memory pool is its default memory pool. + + note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different than the one the stream runs on. + + + - + - Query an attribute of a given memory range + Gets the current mempool for a device + Returns the last pool provided to ::cuDeviceSetMemPool for this device + or the device's default memory pool if ::cuDeviceSetMemPool has never been called. + By default the current mempool is the default mempool for a device. + Otherwise the returned pool must have been set with::cuDeviceSetMemPool. - A pointers to a memory location where the result of each attribute query will be written to. - Array containing the size of data - The attribute to query - Start of the range to query - Size of the range to query + + + - + - Query attributes of a given memory range. + Returns the default mempool of a device + The default mempool of a device contains device memory from that device. - A two-dimensional array containing pointers to memory locations where the result of each attribute query will be written to. - Array containing the sizes of each result - An array of attributes to query (numAttributes and the number of attributes in this array should match) - Number of attributes to query - Start of the range to query - Size of the range to query + + + - + - Allocates memory that will be automatically managed by the Unified Memory system + Returns in device a device handle given a PCI bus ID string. + + Returned device handle + String in one of the following forms: + [domain]:[bus]:[device].[function] + [domain]:[bus]:[device] + [bus]:[device].[function] + where domain, bus, device, and function are all hexadecimal values + CUDA Error Codes: , , , + , . + + + + Returns an ASCII string identifying the device dev in the NULL-terminated + string pointed to by pciBusId. len specifies the maximum length of the + string that may be returned. + + Returned identifier string for the device in the following format + [domain]:[bus]:[device].[function] + where domain, bus, device, and function are all hexadecimal values. + pciBusId should be large enough to store 13 characters including the NULL-terminator. + Maximum length of string to store in name + Device to get identifier string for + CUDA Error Codes: , , , + , . + + + + Takes as input a previously allocated event. This event must have been + created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING + flags set. This opaque handle may be copied into other processes and + opened with ::cuIpcOpenEventHandle to allow efficient hardware + synchronization between GPU work in different processes. 
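The IPC-event entries in this hunk (cuIpcGetEventHandle above, cuIpcOpenEventHandle below) describe an export/open handshake between two processes. A minimal C# sketch of that flow follows; it assumes ManagedCuda-style raw bindings, where the nested Events class, the cuIpcGetEventHandle/cuIpcOpenEventHandle entry points, and the exact signatures are assumptions rather than verbatim API, and the cross-process transport of the opaque handle is elided.

using ManagedCuda;
using ManagedCuda.BasicTypes;

static class IpcEventSketch
{
    // Exporting process: the event must be created with both the
    // InterProcess and DisableTiming flags, per the entry above.
    public static CUipcEventHandle ExportEvent(out CUevent ev)
    {
        ev = new CUevent();
        var res = DriverAPINativeMethods.Events.cuEventCreate(ref ev,
            CUEventFlags.InterProcess | CUEventFlags.DisableTiming); // assumed flag names
        if (res != CUResult.Success) throw new CudaException(res);

        var handle = new CUipcEventHandle();
        res = DriverAPINativeMethods.cuIpcGetEventHandle(ref handle, ev); // assumed binding location
        if (res != CUResult.Success) throw new CudaException(res);
        return handle; // copy these opaque bytes to the importing process (transport elided)
    }

    // Importing process: the opened event behaves like a locally created
    // event with DisableTiming set, and must be freed with cuEventDestroy.
    public static CUevent ImportEvent(CUipcEventHandle handle)
    {
        var ev = new CUevent();
        var res = DriverAPINativeMethods.cuIpcOpenEventHandle(ref ev, handle); // assumed binding location
        if (res != CUResult.Success) throw new CudaException(res);
        return ev;
    }
}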
- Allocates bytesize bytes of managed memory on the device and returns in - dptr a pointer to the allocated memory. If the device doesn't support - allocating managed memory, is returned. Support - for managed memory can be queried using the device attribute - . The allocated memory is suitably - aligned for any kind of variable. The memory is not cleared. If bytesize - is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer - is valid on the CPU and on all GPUs in the system that support managed memory. - All accesses to this pointer must obey the Unified Memory programming model. + After the event has been opened in the importing process, + ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and + ::cuEventQuery may be used in either process. Performing operations + on the imported event after the exported event has been freed + with ::cuEventDestroy will result in undefined behavior. - flags specifies the default stream association for this allocation. - flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If - ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from - any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the - allocation is created with initial visibility restricted to host access only; - an explicit call to ::cuStreamAttachMemAsync will be required to enable access - on the device. + IPC functionality is restricted to devices with support for unified + addressing on Linux operating systems. + + Pointer to a user allocated CUipcEventHandle in which to return the opaque event handle + Event allocated with ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING flags. + CUDA Error Codes: , , , + + + + Opens an interprocess event handle exported from another process with + ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like + a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. + This event must be freed with ::cuEventDestroy. - If the association is later changed via ::cuStreamAttachMemAsync to - a single stream, the default association as specifed during ::cuMemAllocManaged - is restored when that stream is destroyed. For __managed__ variables, the - default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a - stream is an asynchronous operation, and as a result, the change to default - association won't happen until all work in the stream has completed. + Performing operations on the imported event after the exported event has + been freed with ::cuEventDestroy will result in undefined behavior. - Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. + IPC functionality is restricted to devices with support for unified + addressing on Linux operating systems. + + Returns the imported event + Interprocess handle to open + CUDA Error Codes: , , , + + + + Takes a pointer to the base of an existing device memory allocation created + with ::cuMemAlloc and exports it for use in another process. This is a + lightweight operation and may be called multiple times on an allocation + without adverse effects. - On a multi-GPU system with peer-to-peer support, where multiple GPUs support - managed memory, the physical storage is created on the GPU which is active - at the time ::cuMemAllocManaged is called. All other GPUs will reference the - data at reduced bandwidth via peer mappings over the PCIe bus. The Unified - Memory management system does not migrate memory between GPUs.
+ If a region of memory is freed with ::cuMemFree and a subsequent call + to ::cuMemAlloc returns memory with the same device address, + ::cuIpcGetMemHandle will return a unique handle for the + new memory. - On a multi-GPU system where multiple GPUs support managed memory, but not - all pairs of such GPUs have peer-to-peer support between them, the physical - storage is created in 'zero-copy' or system memory. All GPUs will reference - the data at reduced bandwidth over the PCIe bus. In these circumstances, - use of the environment variable, CUDA_VISIBLE_DEVICES, is recommended to - restrict CUDA to only use those GPUs that have peer-to-peer support. This - environment variable is described in the CUDA programming guide under the - "CUDA environment variables" section. + IPC functionality is restricted to devices with support for unified + addressing on Linux operating systems. - Returned device pointer - Requested allocation size in bytes - Must be one of or - CUDA Error Codes: , , , - , , , , . - Note that this function may also return error codes from previous, asynchronous launches. + Pointer to user allocated ::CUipcMemHandle to return the handle in. + Base pointer to previously allocated device memory + CUDA Error Codes: , , , + + + + Maps memory exported from another process with ::cuIpcGetMemHandle into + the current device address space. For contexts on different devices + ::cuIpcOpenMemHandle can attempt to enable peer access between the + devices as if the user called ::cuCtxEnablePeerAccess. This behavior is + controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. + ::cuDeviceCanAccessPeer can determine if a mapping is possible. + + Contexts that may open ::CUipcMemHandles are restricted in the following way. + ::CUipcMemHandles from each ::CUdevice in a given process may only be opened + by one ::CUcontext per ::CUdevice per other process. + + Memory returned from ::cuIpcOpenMemHandle must be freed with + ::cuIpcCloseMemHandle. + + Calling ::cuMemFree on an exported memory region before calling + ::cuIpcCloseMemHandle in the importing context will result in undefined + behavior. + + IPC functionality is restricted to devices with support for unified + addressing on Linux operating systems. + + Returned device pointer + ::CUipcMemHandle to open + Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS + CUDA Error Codes: , , + , , + + + + Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation + in the exporting process as well as imported mappings in other processes + will be unaffected. + + Any resources used to enable peer access will be freed if this is the + last mapping using them.
+ + IPC functionality is restricted to devices with support for unified + addressing on Linux operating systems. + + Device pointer returned by ::cuIpcOpenMemHandle + CUDA Error Codes: , , + , + + + + Returns information about the execution affinity support of the device. + Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. + The supported types are: + - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, + or 0 if not; + + 1 if the execution affinity type \p type is supported by the device, or 0 if not + Execution affinity type to query + Device handle + + + + Combines all API calls for context management + + + + + Creates a new CUDA context and associates it with the calling thread. The flags parameter is described in . The + context is created with a usage count of 1 and the caller of must call or + when done using the context. If a context is already current to the thread, it is supplanted by the newly created context + and may be restored by a subsequent call to . + + Returned context handle of the new context + Context creation flags. See + Device to create context on CUDA Error Codes: , , , - , , . + , , , , . + Note that this function may also return error codes from previous, asynchronous launches. - + - Returns information about a pointer. - The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + Create a CUDA context with execution affinity + Creates a new CUDA context with execution affinity and associates it with + the calling thread.The \p paramsArray and \p flags parameter are described below. + The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must + call::cuCtxDestroy() or when done using the context.If a context is already + current to the thread, it is supplanted by the newly created context and may + be restored by a subsequent call to ::cuCtxPopCurrent(). + The type and the amount of execution resource the context can use is limited by \p paramsArray + and \p numParams.The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams + describes the size of the array. If two \p CUexecAffinityParam in the array have the same type, + the latter execution affinity parameter overrides the former execution affinity parameter. - - ::CU_POINTER_ATTRIBUTE_CONTEXT - - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE - - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER - - ::CU_POINTER_ATTRIBUTE_HOST_POINTER - - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS - - ::CU_POINTER_ATTRIBUTE_BUFFER_ID - - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + The supported execution affinity types are: + ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use.The portion + of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally + rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution + affinity of the context via \p cuCtxGetExecAffinity after context creation.Currently, this attribute + is only supported under Volta+ MPS. - Number of attributes to query - An array of attributes to query (numAttributes and the number of attributes in this array should match) - A two-dimensional array containing pointers to memory - locations where the result of each attribute query will be written to. - Pointer to query + Returned context handle of the new context + + + Context creation flags. 
See + Device to create context on - + + + Create a CUDA context + Creates a new CUDA context and associates it with the calling thread.The + \p flags parameter is described below.The context is created with a usage + count of 1 and the caller of ::cuCtxCreate() must call::cuCtxDestroy() + when done using the context.If a context is already current to the thread, + it is supplanted by the newly created context and may be restored by a subsequent + call to::cuCtxPopCurrent(). + + CUDA context can be created with execution affinity.The type and the amount of + execution resource the context can use is limited by \p paramsArray and \p numExecAffinityParams + in \p execAffinity.The \p paramsArray is an array of \p CUexecAffinityParam and the \p numExecAffinityParams + describes the size of the paramsArray. If two \p CUexecAffinityParam in the array have the same type, + the latter execution affinity parameter overrides the former execution affinity parameter. + The supported execution affinity types are: + - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use.The portion + of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally + rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution + affinity of the context via \p cuCtxGetExecAffinity after context creation.Currently, this attribute + is only supported under Volta+ MPS. + + CUDA context can be created in CIG(CUDA in Graphics) mode by setting /p cigParams.Hardware support + and software support for graphics clients can be determined using ::cuDeviceGetAttribute() with + ::CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED.Data from graphics client is shared with CUDA via + the /p sharedData in /pcigParams.For D3D12, /p sharedData is a ID3D12CommandQueue handle. + + Either /p execAffinityParams or /p cigParams can be set to a non-null value.Setting both to a + non-null value will result in an undefined behavior. + + The three LSBs of the \p flags parameter can be used to control how the OS + thread, which owns the CUDA context at the time of an API call, interacts + with the OS scheduler when waiting for results from the GPU.Only one of + the scheduling flags can be set when creating a context. + + - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + results from the GPU. This can decrease latency when waiting for the GPU, + but may lower the performance of CPU threads if they are performing work in + parallel with the CUDA thread. + + - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + results from the GPU. This can increase latency when waiting for the GPU, + but can increase the performance of CPU threads performing work in parallel + with the GPU. + + - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + synchronization primitive when waiting for the GPU to finish work. + + - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + synchronization primitive when waiting for the GPU to finish work. + Deprecated: This flag was deprecated as of CUDA 4.0 and was + replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + + - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + uses a heuristic based on the number of active CUDA contexts in the + process \e C and the number of logical processors in the system \e P. 
If + \e C > \e P, then CUDA will yield to other OS threads when waiting for + the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + the power profile of the platform and may choose::CU_CTX_SCHED_BLOCKING_SYNC + for low-powered devices. + + - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + This flag must be set in order to allocate pinned host memory that is + accessible to the GPU. + + - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + after resizing local memory for a kernel. This can prevent thrashing by + local memory allocations when launching many kernels with high local + memory usage at the cost of potentially increased memory usage. + Deprecated: This flag is deprecated and the behavior enabled + by this flag is now the default and cannot be disabled. + Instead, the per-thread stack size can be controlled with::cuCtxSetLimit(). + + - ::CU_CTX_COREDUMP_ENABLE: If GPU coredumps have not been enabled globally + with::cuCoredumpSetAttributeGlobal or environment variables, this flag can + be set during context creation to instruct CUDA to create a coredump if + this context raises an exception during execution. These environment variables + are described in the CUDA-GDB user guide under the "GPU core dump support" + section. + The initial attributes will be taken from the global attributes at the time of + context creation.The other attributes that control coredump output can be + modified by calling ::cuCoredumpSetAttribute from the created context after + it becomes current.This flag is not supported when CUDA context is created in + CIG(CUDA in Graphics) mode. + + - ::CU_CTX_USER_COREDUMP_ENABLE: If user-triggered GPU coredumps have not + been enabled globally with::cuCoredumpSetAttributeGlobal or environment + variables, this flag can be set during context creation to instruct CUDA to + create a coredump if data is written to a certain pipe that is present in the + OS space.These environment variables are described in the CUDA-GDB user + guide under the "GPU core dump support" section. + It is important to note that the pipe name* must* be set with + ::cuCoredumpSetAttributeGlobal before creating the context if this flag is + used.Setting this flag implies that::CU_CTX_COREDUMP_ENABLE is set. + The initial attributes will be taken from the global attributes at the time of + context creation.The other attributes that control coredump output can be + modified by calling ::cuCoredumpSetAttribute from the created context after + it becomes current. + Setting this flag on any context creation is equivalent to setting the + ::CU_COREDUMP_ENABLE_USER_TRIGGER attribute to \p true globally. + This flag is not supported when CUDA context is created in + CIG(CUDA in Graphics) mode. + + - ::CU_CTX_SYNC_MEMOPS: Ensures that synchronous memory operations initiated + on this context will always synchronize. See further documentation in the + section titled "API Synchronization behavior" to learn more about cases when + synchronous memory operations can exhibit asynchronous behavior. + + Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the + compute mode of the device. 
The nvidia-smi tool can be used to set + the compute mode for *devices. + Documentation for nvidia - smi can be obtained by passing a + -h option to it. + + Context creation will fail with::CUDA_ERROR_INVALID_VALUE if invalid parameter was + passed by client to create the CUDA context. + + Context creation in CIG mode will fail with::CUDA_ERROR_NOT_SUPPORTED if CIG is not supported + by the device or the driver. + + Returned context handle of the new context + Context creation parameters + Context creation flags + Device to create context on + + + - Intra-device memcpy's done with these functions may execute in parallel with the CPU, - but if host memory is involved, they wait until the copy is done before returning. + Destroys the CUDA context specified by ctx. The context ctx will be destroyed regardless of how many threads it is current to. + It is the responsibility of the calling function to ensure that no API call is issued to ctx while cuCtxDestroy_v2() is executing. + If ctx is current to the calling thread then ctx will also be + popped from the current thread's context stack (as though cuCtxPopCurrent() + were called). If ctx is current to other threads, then ctx will + remain current to those threads, and attempting to access ctx from + those threads will result in the error . + Context to destroy + CUDA Error Codes: , , , + , , . + Note that this function may also return error codes from previous, asynchronous launches. - + - Copies data between two pointers. - dst and src are base pointers of the destination and source, respectively. - ByteCount specifies the number of bytes to copy. - Note that this function infers the type of the transfer (host to host, host to - device, device to device, or device to host) from the pointer values. This - function is only allowed in contexts which support unified addressing. - Note that this function is synchronous. + Increments the usage count of the context and passes back a context handle in pctx that must be passed to + when the application is done with the context. fails if there is no context current to the + thread. Currently, the flags parameter must be . - Destination unified virtual address space pointer - Source unified virtual address space pointer - Size of memory copy in bytes + Returned context handle of the current context + Context attach flags (must be ) CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device memory in one context to device memory in another - context. dstDevice is the base device pointer of the destination memory - and dstContext is the destination context. srcDevice is the base - device pointer of the source memory and srcContext is the source pointer. - ByteCount specifies the number of bytes to copy. - - Note that this function is asynchronous with respect to the host, but - serialized with respect all pending and future asynchronous work in to the - current context, srcContext, and dstContext (use - to avoid this synchronization). + Decrements the usage count of the context ctx, and destroys the context if the usage count goes to 0. The context + must be a handle that was passed back by or , and must be current to the calling thread. - Destination device pointer - Destination context - Source device pointer - Source context - Size of memory copy in bytes + Context to destroy CUDA Error Codes: , , , - , . + . Note that this function may also return error codes from previous, asynchronous launches. 
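Taken together, the cuInit/cuCtxCreate/cuCtxDestroy entries above form a simple lifecycle. Here is a minimal sketch using ManagedCuda's high-level CudaContext wrapper, on the assumption that construction performs the init-and-create sequence for a device ordinal and Dispose() performs the destroy; the DeviceId property name is likewise an assumption.

using System;
using ManagedCuda;

static class ContextLifecycle
{
    static void Main()
    {
        // Construction initializes the driver API and creates a context on
        // device ordinal 0; the context becomes current to this thread.
        using (var ctx = new CudaContext(0))
        {
            // Work targeting the context goes here, while it is current.
            Console.WriteLine("Context bound to device " + ctx.DeviceId);
        } // Dispose() destroys the context; per the notes above, no further
          // API call may be issued against it afterwards.
    }
}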
- + - Perform a 3D memory copy according to the parameters specified in - pCopy. See the definition of the structure - for documentation of its parameters. - Note that this function is synchronous with respect to the host only if - the source or destination memory is of type ::CU_MEMORYTYPE_HOST. - Note also that this copy is serialized with respect all pending and future - asynchronous work in to the current context, the copy's source context, - and the copy's destination context (use to avoid - this synchronization). + Pushes the given context ctx onto the CPU thread’s stack of current contexts. The specified context becomes the + CPU thread’s current context, so all CUDA functions that operate on the current context are affected. + The previous current context may be made current again by calling or . + The context must be "floating," i.e. not attached to any thread. Contexts are made to float by calling . - Parameters for the memory copy + Floating context to attach CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Pops the current CUDA context from the CPU thread. The CUDA context must have a usage count of 1. CUDA contexts + have a usage count of 1 upon creation; the usage count may be incremented with and decremented + with . + If successful, passes back the old context handle in pctx. That context may then be made current + to a different CPU thread by calling . + Floating contexts may be destroyed by calling . + If a context was current to the CPU thread before or was called, this function makes + that context current to the CPU thread again. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned new context handle CUDA Error Codes: , , , - , . + . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Binds the specified CUDA context to the calling CPU thread. + If ctx is NULL then the CUDA context previously bound to the + calling CPU thread is unbound and is returned. + + If there exists a CUDA context stack on the calling CPU thread, this + will replace the top of that stack with ctx. + If ctx is NULL then this will be equivalent to popping the top + of the calling CPU thread's CUDA context stack (or a no-op if the + calling CPU thread's CUDA context stack is empty). - Destination device pointer - Source host pointer - Size of memory copy in bytes + Context to bind to the calling CPU thread CUDA Error Codes: , , , - , . + . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns in ctx the CUDA context bound to the calling CPU thread. + If no context is bound to the calling CPU thread then ctx is + set to NULL and is returned. 
- Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned context handle CUDA Error Codes: , , , - , . + . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns in device the ordinal of the current context’s device. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned device ID for the current context CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Blocks until the current context has completed all preceding requested tasks. + If the current context is the primary context, green contexts that have been created will also be synchronized. + returns an error if one of the + preceding tasks failed. If the context was created with the flag, the CPU thread will + block until the GPU context has finished its work. - Destination device pointer - Source host pointer - Size of memory copy in bytes CUDA Error Codes: , , , - , . + . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns the API version used to create ctx in version. If ctx + is NULL, returns the API version used to create the currently bound + context. + This wil return the API version used to create a context (for example, + 3010 or 3020), which library developers can use to direct callers to a + specific API version. Note that this API version may not be the same as + returned by . - Destination device pointer - Source host pointer - Size of memory copy in bytes CUDA Error Codes: , , , - , . + , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + On devices where the L1 cache and shared memory use the same hardware + resources, this function returns through pconfig the preferred cache configuration + for the current context. This is only a preference. The driver will use + the requested configuration if possible, but it is free to choose a different + configuration if required to execute functions. + This will return a pconfig of on devices + where the size of the L1 cache and shared memory are fixed. - Destination device pointer - Source host pointer - Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. 
+ On devices where the L1 cache and shared memory use the same hardware + resources, this sets through config the preferred cache configuration for + the current context. This is only a preference. The driver will use + the requested configuration if possible, but it is free to choose a different + configuration if required to execute the function. Any function preference + set via will be preferred over this context-wide + setting. Setting the context-wide cache configuration to + will cause subsequent kernel launches to prefer + to not change the cache configuration unless required to launch the kernel. + This setting does nothing on devices where the size of the L1 cache and + shared memory are fixed. + Launching a kernel with a different preference than the most recent + preference setting may insert a device-side synchronization point. - Destination device pointer - Source host pointer - Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + + + Returns the current shared memory configuration for the current context. + + This function will return in \p pConfig the current size of shared memory banks + in the current context. On devices with configurable shared memory banks, + can be used to change this setting, so that all + subsequent kernel launches will by default use the new bank size. When + is called on devices without configurable shared + memory, it will return the fixed bank size of the hardware. + + The returned bank configurations can be either: + - : set shared memory bank width to + be natively four bytes. + - : set shared memory bank width to + be natively eight bytes. + + returned shared memory configuration + CUDA Error Codes: , , , + , . + + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Sets the shared memory configuration for the current context. + On devices with configurable shared memory banks, this function will set + the context's shared memory bank size which is used for subsequent kernel + launches. + Changed the shared memory configuration between launches may insert a device + side synchronization point between those launches. + Changing the shared memory bank size will not increase shared memory usage + or affect occupancy of kernels, but may have major effects on performance. + Larger bank sizes will allow for greater potential bandwidth to shared memory, + but will change what kinds of accesses to shared memory will result in bank + conflicts. + This function will do nothing on devices with fixed shared memory bank size. + + The supported bank configurations are: + - : set bank width to the default initial + setting (currently, four bytes). + - : set shared memory bank width to + be natively four bytes. + - : set shared memory bank width to + be natively eight bytes. - Destination device pointer - Source host pointer - Size of memory copy in bytes + requested shared memory configuration CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. 
+ Returns numerical values that correspond to the least and greatest stream priorities. + Returns in leastPriority and greatestPriority the numerical values that correspond + to the least and greatest stream priorities respectively. Stream priorities + follow a convention where lower numbers imply greater priorities. The range of + meaningful stream priorities is given by [greatestPriority, leastPriority]. + If the user attempts to create a stream with a priority value that is + outside the meaningful range as specified by this API, the priority is + automatically clamped down or up to either leastPriority or greatestPriority + respectively. See ::cuStreamCreateWithPriority for details on creating a + priority stream. + A NULL may be passed in for leastPriority or greatestPriority if the value + is not desired. + This function will return '0' in both leastPriority and greatestPriority if + the current context's device does not support stream priorities + (see ::cuDeviceGetAttribute). - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Pointer to an int in which the numerical value for least + stream priority is returned + Pointer to an int in which the numerical value for greatest stream priority is returned + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Resets all persisting lines in cache to normal status. + ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal + status. Takes effect on function return. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns the execution affinity setting for the current context. + Returns in \p *pExecAffinity the current value of \p type. The supported ::CUexecAffinityType values are: + - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + + + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Records an event. + Captures in \p hEvent all the activities of the context \p hCtx + at the time of this call. \p hEvent and \p hCtx must be from the same + CUDA context, otherwise ::CUDA_ERROR_INVALID_HANDLE will be returned. + Calls such as ::cuEventQuery() or ::cuCtxWaitEvent() will then examine + or wait for completion of the work that was captured. + Uses of \p hCtx after this call do not modify \p hEvent. + If the context passed to \p hCtx is the primary context, \p hEvent will + capture all the activities of the primary context and its green contexts.
+ If the context passed to \p hCtx is a context converted from green context + via ::cuCtxFromGreenCtx(), \p hEvent will capture only the activities of the green context. + + \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED if the + specified context \p hCtx has a stream in the capture mode. In such a case, + the call will invalidate all the conflicting captures. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Context to record event for + Event to record - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Make a context wait on an event + Makes all future work submitted to context \p hCtx wait for all work + captured in \p hEvent. The synchronization will be performed on the device + and will not block the calling CPU thread. See ::cuCtxRecordEvent() + for details on what is captured by an event. + If the context passed to \p hCtx is the primary context, the primary context + and its green contexts will wait for \p hEvent. + If the context passed to \p hCtx is a context converted from green context + via ::cuCtxFromGreenCtx(), the green context will wait for \p hEvent. + + \note \p hEvent may be from a different context or device than \p hCtx. + + \note The API will return ::CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED and + invalidate the capture if the specified event \p hEvent is part of an ongoing + capture sequence or if the specified context \p hCtx has a stream in the capture mode. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Context to wait + Event to wait on - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns the flags for the current context + Returns in \p *flags the flags of the current context. See ::cuCtxCreate for flag values. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Pointer to store flags of current context + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Sets the flags for the current context. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Flags to set on the current context - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns the unique Id associated with the context supplied + Returns in \p ctxId the unique Id which is associated with a given context.
+ The Id is unique for the life of the program for this instance of CUDA. + If context is supplied as NULL and there is one current, the Id of the + current context is returned. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Context for which to obtain the Id + Pointer to store the Id of the context + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Retain the primary context on the GPU. + Retains the primary context on the device, creating it if necessary, + increasing its usage count. The caller must call + ::cuDevicePrimaryCtxRelease() when done using the context. + Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack. + + Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + the device is ::CU_COMPUTEMODE_PROHIBITED. Similarly, context creation will + also fail with ::CUDA_ERROR_UNKNOWN if the compute mode for the device is + set to ::CU_COMPUTEMODE_EXCLUSIVE and there is already an active, non-primary, + context on the device. The function ::cuDeviceGetAttribute() can be used with + ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode of the + device. The nvidia-smi tool can be used to set the compute mode for + devices. Documentation for nvidia-smi can be obtained by passing a + -h option to it. + + Please note that the primary context always supports pinned allocations. Other + flags can be specified by ::cuDevicePrimaryCtxSetFlags(). - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned context handle of the new context + Device for which primary context is requested + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Release the primary context on the GPU + Releases the primary context on the device by decreasing the usage + count by 1. If the usage drops to 0 the primary context of device \p dev + will be destroyed regardless of how many threads it is current to. + + Please note that unlike ::cuCtxDestroy() this method does not pop the context + from the stack in any circumstances. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Device whose primary context is released + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Set flags for the primary context + Sets the flags for the primary context on the device overwriting previously + set ones. If the primary context is already created + ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned.
+ + The three LSBs of the \p flags parameter can be used to control how the OS + thread, which owns the CUDA context at the time of an API call, interacts + with the OS scheduler when waiting for results from the GPU. Only one of + the scheduling flags can be set when creating a context. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Device for which the primary context flags are set + New flags for the device + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Get the state of the primary context + Returns in \p *flags the flags for the primary context of \p dev, and in + \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag + values. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Device to get primary context flags for + Pointer to store flags + Pointer to store context state; 0 = inactive, 1 = active + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Destroy all allocations and reset all state on the primary context + + Explicitly destroys and cleans up all resources associated with the current + device in the current process. + + Note that it is the responsibility of the calling function to ensure that no + other module in the process is using the device any more. For that reason + it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. + However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() + even after resetting the device. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Device for which primary context is destroyed + + + + + Combines all API calls for module management + + + + + Takes a filename fname and loads the corresponding module module into the current context. The CUDA driver API + does not attempt to lazily allocate the resources needed by a module; if the memory for functions and data (constant + and global) needed by the module cannot be allocated, fails. The file should be a cubin file as output + by nvcc or a PTX file, either as output by nvcc or handwritten. + + Returned module + Filename of module to load CUDA Error Codes: , , , - , . + , , , + , , , + . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Takes a byte[] as image and loads the corresponding module module into the current context. The byte array may be obtained + by mapping a cubin or PTX file, passing a cubin or PTX file as a null-terminated text string. + The byte[] is a replacement for the original pointer. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned module + Module data to load CUDA Error Codes: , , , - , . + , , + , , + .
Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Takes a byte[] as image and loads the corresponding module module into the current context. The byte array may be obtained + by mapping a cubin or PTX file, passing a cubin or PTX file as a null-terminated text string. + Options are passed as an array via options and any corresponding parameters are passed + in optionValues. The number of total options is supplied via numOptions. Any outputs will be returned via + optionValues. Supported options are defined in . + The option values are currently passed as IntPtr and should then be cast into their real type. This might change in the future. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned module + Module data to load + Number of options + Options for JIT + Option values for JIT CUDA Error Codes: , , , - , . + , , + , , + . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Takes a byte[] as fatCubin and loads the corresponding module module into the current context. The byte[] + represents a fat binary object, which is a collection of different cubin files, all representing the same device code, but + compiled and optimized for different architectures. Prior to CUDA 4.0, there was no documented API for constructing and using + fat binary objects by programmers. Starting with CUDA 4.0, fat binary objects can be constructed by providing the -fatbin option to nvcc. + More information can be found in the nvcc documentation. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned module + Fat binary to load CUDA Error Codes: , , , - , . + , , , + , , + . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Unloads a module hmod from the current context. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Module to unload CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns in hfunc the handle of the function of name name located in module hmod. If no function of that name + exists, returns . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned function handle + Module to retrieve function from + Name of function to retrieve CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory.
dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns the number of functions within a module + Returns in \p count the number of functions in \p mod. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Number of functions found within the module + Module to query - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns the function handles within a module. + Returns in \p functions a maximum number of \p numFunctions function handles within \p mod. When + function loading mode is set to LAZY the function retrieved may be partially loaded. The loading + state of a function can be queried using ::cuFunctionIsLoaded. CUDA APIs may load the function + automatically when called with partially loaded function handle which may incur additional + latency. Alternatively, ::cuFunctionLoad can be used to explicitly load a function. The returned + function handles become invalid when the module is unloaded. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Buffer where the function handles are returned to + Maximum number of function handles that may be returned to the buffer + Module to query from - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns in dptr and bytes the base pointer and size of the global of name name located in module hmod. If no + variable of that name exists, returns . Both parameters dptr + and bytes are optional. If one of them is null, it is ignored. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned global device pointer + Returned global size in bytes + Module to retrieve global from + Name of global to retrieve CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns in pTexRef the handle of the texture reference of name name in the module hmod. If no texture reference + of that name exists, returns . This texture reference handle + should not be destroyed, since it will be destroyed when the module is unloaded. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned texture reference + Module to retrieve texture reference from + Name of texture reference to retrieve CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy.
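The module-management calls documented above combine into a short load/resolve/unload pattern. A minimal C sketch; "kernel.ptx", "myKernel" and "gScale" are placeholder names and error checking is omitted:

#include <cuda.h>

/* Sketch: load a PTX/cubin module and resolve its entry points. */
static void load_module_example(void)
{
    CUmodule mod;
    CUfunction fn;
    CUdeviceptr global;
    size_t globalBytes;

    cuModuleLoad(&mod, "kernel.ptx");          /* cubin or PTX file */
    cuModuleGetFunction(&fn, mod, "myKernel"); /* fails if the name is absent */
    cuModuleGetGlobal(&global, &globalBytes, mod, "gScale");

    /* ... launch fn via cuLaunchKernel, read/write the global ... */

    cuModuleUnload(mod);
}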
Note that this function is synchronous. + Returns in pSurfRef the handle of the surface reference of name name in the module hmod. If no surface reference + of that name exists, returns . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned surface reference + Module to retrieve surface reference from + Name of surface reference to retrieve CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Creates a pending JIT linker invocation. + If the call is successful, the caller owns the returned CUlinkState, which should eventually be destroyed with ::cuLinkDestroy. + The device code machine size (32 or 64 bit) will match the calling application. + Both linker and compiler options may be specified. Compiler options will be applied to inputs to this linker action which must + be compiled from PTX. The options ::CU_JIT_WALL_TIME, + ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES will accumulate data until the CUlinkState is destroyed. + optionValues must remain valid for the life of the CUlinkState if output options are used. No other references to inputs are maintained after this call returns. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Size of options arrays + Array of linker and compiler options + Array of option values, each cast to void * + On success, this will contain a CUlinkState to specify and complete this action + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Add an input to a pending linker invocation. + Ownership of data data is retained by the caller. No reference is retained to any inputs after this call returns. + This method accepts only compiler options, which are used if the data must be compiled from PTX, and does not accept any of + ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + A pending linker action. + The type of the input data. + The input data. PTX must be NULL-terminated. + The length of the input data. + An optional name for this input in log messages. + Size of options. + Options to be applied only for this input (overrides options from ::cuLinkCreate). + Array of option values, each cast to void *. + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Add a file input to a pending linker invocation. + No reference is retained to any inputs after this call returns. 
+ This method accepts only compiler options, which are used if the data must be compiled from PTX, and does not accept any of + ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + This method is equivalent to invoking ::cuLinkAddData on the contents of the file. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + A pending linker action. + The type of the input data. + Path to the input file. + Size of options. + Options to be applied only for this input (overrides options from ::cuLinkCreate). + Array of option values, each cast to void *. + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Complete a pending linker invocation. + Completes the pending linker action and returns the cubin image for the linked + device code, which can be used with ::cuModuleLoadData. The cubin is owned by + state, so it should be loaded before state is destroyed via ::cuLinkDestroy. + This call does not destroy state. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + A pending linker invocation + On success, this will point to the output image + Optional parameter to receive the size of the generated image + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Destroys state for a JIT linker invocation. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + State object for the linker invocation + - + + + Query lazy loading mode + Returns lazy loading mode. Module loading mode is controlled by the CUDA_MODULE_LOADING environment variable. + + Returns the lazy loading mode + + + + + Combines all API calls for library management + + + + + Load a library with specified code and options + Takes a pointer code and loads the corresponding library into + all contexts existent at the time of the call and future contexts at the time + of creation until the library is unloaded with ::cuLibraryUnload(). + The pointer may be obtained by mapping a cubin or PTX or fatbin file, + passing a cubin or PTX or fatbin file as a NULL-terminated text string, or + incorporating a cubin or fatbin object into the executable resources and + using operating system calls such as Windows FindResource() to obtain the pointer. + Options are passed as an array via jitOptions and any corresponding parameters are passed in + jitOptionsValues. The number of total JIT options is supplied via numJitOptions. + Any outputs will be returned via jitOptionsValues. + + Returned library + Code to load + Options for JIT + Option values for JIT + Number of options + Options for loading + Option values for loading + Number of options for loading + CUDA Error Codes + + - Copies from host memory to device memory.
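The cuLinkCreate/cuLinkAddData/cuLinkComplete/cuLinkDestroy sequence documented above forms a small pipeline: create a link state, feed inputs, retrieve the cubin, load it, destroy the state. A minimal C sketch, assuming ptxSource is a placeholder NULL-terminated PTX string and omitting JIT options and error checks:

#include <cuda.h>
#include <string.h>

/* Sketch of the JIT linker flow: link one PTX input and load the result. */
static void link_ptx_example(const char *ptxSource)
{
    CUlinkState state;
    void *cubin;
    size_t cubinSize;
    CUmodule mod;

    cuLinkCreate(0, NULL, NULL, &state);
    cuLinkAddData(state, CU_JIT_INPUT_PTX, (void *)ptxSource,
                  strlen(ptxSource) + 1, "my_ptx", 0, NULL, NULL);
    cuLinkComplete(state, &cubin, &cubinSize);

    /* The cubin is owned by the link state: load it before destroying. */
    cuModuleLoadData(&mod, cubin);
    cuLinkDestroy(state);
}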
dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Load a library with specified code and options + Takes a pointer code and loads the corresponding library into + all contexts existent at the time of the call and future contexts at the time + of creation until the library is unloaded with ::cuLibraryUnload(). + The pointer may be obtained by mapping a cubin or PTX or fatbin file, + passing a cubin or PTX or fatbin file as a NULL-terminated text string, or + incorporating a cubin or fatbin object into the executable resources and + using operating system calls such as Windows FindResource() to obtain the pointer. + Options are passed as an array via jitOptions and any corresponding parameters are passed in + jitOptionsValues. The number of total JIT options is supplied via numJitOptions. + Any outputs will be returned via jitOptionsValues. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned library + Code to load + Options for JIT + Option values for JIT + Number of options + Options for loading + Option values for loading + Number of options for loading + CUDA Error Codes - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Load a library with specified file and options + Takes a filename fileName and loads the corresponding library library into + all contexts existent at the time of the call and future contexts at the time of + creation until the library is unloaded with ::cuLibraryUnload(). + The file should be a cubin file as output by nvcc, or a PTX file either + as output by nvcc or handwritten, or a fatbin file as output by nvcc + from toolchain 4.0 or later. + Options are passed as an array via jitOptions and any corresponding parameters are + passed in jitOptionsValues. The number of total options is supplied via numJitOptions. + Any outputs will be returned via jitOptionsValues. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned library + File to load + Options for JIT + Option values for JIT + Number of options + Options for loading + Option values for loading + Number of options for loading + CUDA Error Codes - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Unloads a library + Unloads the library specified with library - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Library to unload + CUDA Error Codes - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
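The library-management calls documented above (CUDA 12 and later) follow the same load/lookup/unload pattern as modules, but a library is shared across contexts. A minimal C sketch; "module.fatbin" and "myKernel" are placeholder names and error checking is omitted:

#include <cuda.h>
#include <stddef.h>

/* Sketch: load a library, look up a kernel handle, then unload. */
static void library_example(void)
{
    CUlibrary lib;
    CUkernel kernel;

    cuLibraryLoadFromFile(&lib, "module.fatbin",
                          NULL, NULL, 0,   /* no JIT options */
                          NULL, NULL, 0);  /* no library options */
    cuLibraryGetKernel(&kernel, lib, "myKernel");

    /* ... obtain a CUfunction via cuKernelGetFunction and launch it ... */

    cuLibraryUnload(lib);
}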
+ Returns a kernel handle + Returns in \p pKernel the handle of the kernel with name \p name located in library \p library. + If kernel handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned kernel handle + Library to retrieve kernel from + Name of kernel to retrieve + CUDA Error Codes + + + + Returns the number of kernels within a library + Returns in \p count the number of kernels in \p lib. + + Number of kernels found within the library + Library to query + + + + Retrieve the kernel handles within a library. + Returns in \p kernels a maximum number of \p numKernels kernel handles within \p lib. + The returned kernel handle becomes invalid when the library is unloaded. + + Buffer where the kernel handles are returned to + Maximum number of kernel handles that may be returned to the buffer + Library to query from + + + + Returns a module handle + Returns in \p pMod the module handle associated with the current context located in library \p library. + If module handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND. + + Returned module handle + Library to retrieve module from + CUDA Error Codes + + + + Returns a function handle + Returns in \p pFunc the handle of the function for the requested kernel \p kernel and the current context. + If function handle is not found, the call returns ::CUDA_ERROR_NOT_FOUND. + + Returned function handle + Kernel to retrieve function for the requested context + CUDA Error Codes + + + + Returns a library handle + Returns in \p pLib the handle of the library for the requested kernel \p kernel + + Returned library handle + Kernel to retrieve library handle - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns a global device pointer + Returns in \p *dptr and \p *bytes the base pointer and size of the global with + name \p name for the requested library \p library and the current context. + If no global for the requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND. + One of the parameters \p dptr or \p bytes (not both) can be NULL in which case it is ignored. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned global device pointer for the requested context + Returned global size in bytes + Library to retrieve global from + Name of global to retrieve + CUDA Error Codes + + + + Returns a pointer to managed memory + Returns in \p *dptr and \p *bytes the base pointer and size of the managed memory with + name \p name for the requested library \p library. If no managed memory with the + requested name \p name exists, the call returns ::CUDA_ERROR_NOT_FOUND. One of the parameters + \p dptr or \p bytes (not both) can be NULL in which case it is ignored. + Note that managed memory for library \p library is shared across devices and is registered + when the library is loaded into at least one context. + The API requires a CUDA context to be present and initialized on at least one device. + If no context is present, the call returns ::CUDA_ERROR_NOT_FOUND.
+ + Returned pointer to the managed memory + Returned memory size in bytes + Library to retrieve global from + Name of managed memory to retrieve + CUDA Error Codes + + + + Returns a pointer to a universal function + Returns in \p *fptr the function pointer to a global function denoted by \p symbol. + If no universal function with name \p symbol exists, the call returns ::CUDA_ERROR_NOT_FOUND. + If there is no device with attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS present in the system, + the call may return ::CUDA_ERROR_NOT_FOUND. + + Returned pointer to a universal function + Library to retrieve function pointer memory from + Name of function pointer to retrieve + CUDA Error Codes + + + + Returns information about a kernel + Returns in \p *pi the integer value of the attribute \p attrib for the kernel + \p kernel for the requested device \p dev. The supported attributes are: + - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + per block, beyond which a launch of the kernel would fail. This number + depends on both the kernel and the requested device. + - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + statically-allocated shared memory per block required by this kernel. + This does not include dynamically-allocated shared memory requested by + the user at runtime. + - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated + constant memory required by this kernel. + - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory + used by each thread of this kernel. + - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread + of this kernel. + - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for + which the kernel was compiled. This value is the major PTX version * 10 + + the minor PTX version, so a PTX version 1.3 function would return the + value 13. Note that this may return the undefined value of 0 for cubins + compiled prior to CUDA 3.0. + - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for + which the kernel was compiled. This value is the major binary + version * 10 + the minor binary version, so a binary version 1.3 function + would return the value 13. Note that this will return a value of 10 for + legacy cubins that do not have a properly-encoded binary architecture + version. + - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the kernel has + been compiled with user specified option "-Xptxas --dlcm=ca" set. + - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of + dynamically-allocated shared memory. + - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 + cache split ratio in percent of total shared memory. + - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the + kernel must launch with a valid cluster size specified. + - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + blocks. + - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + blocks. + - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + blocks. + - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether + the function can be launched with non-portable cluster size. 1 is allowed, + 0 is disallowed. A non-portable cluster size may only function on the + specific SKUs the program is tested on. The launch might fail if the + program is run on a different hardware platform.
CUDA API provides + cudaOccupancyMaxActiveClusters to assist with checking whether the desired + size can be launched on the current device. A portable cluster size is + guaranteed to be functional on all compute capabilities higher than the + target compute capability. The portable cluster size for sm_90 is 8 blocks + per cluster. This value may increase for future compute capabilities. The + specific hardware unit may support higher cluster sizes that are not + guaranteed to be portable. + - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + \note If another thread is trying to set the same attribute on the same device using + ::cuKernelSetAttribute() simultaneously, the attribute query will give the old or new + value depending on the interleavings chosen by the OS scheduler and memory consistency. + + Returned attribute value + Attribute requested + Kernel to query attribute of + Device to query attribute of + CUDA Error Codes - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Sets information about a kernel + This call sets the value of a specified attribute \p attrib on the kernel \p kernel + for the requested device \p dev to an integer value specified by \p val. + This function returns CUDA_SUCCESS if the new value of the attribute could be + successfully set. If the set fails, this call will return an error. + Not all attributes can have values set. Attempting to set a value on a read-only + attribute will result in an error (CUDA_ERROR_INVALID_VALUE). + Note that attributes set using ::cuFuncSetAttribute() will override the attribute + set by this API irrespective of whether the call to ::cuFuncSetAttribute() is made + before or after this API call. However, ::cuKernelGetAttribute() will always + return the attribute value set by this API. + Supported attributes are: + - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This is the maximum size in bytes of + dynamically-allocated shared memory. The value should contain the requested + maximum size of dynamically-allocated shared memory. The sum of this value and + the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the + device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. + The maximal size of requestable dynamic shared memory may differ by GPU + architecture. + - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 + cache and shared memory use the same hardware resources, this sets the shared memory + carveout preference, in percent of the total shared memory. + See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR + This is only a hint, and the driver can choose a different ratio if required to execute the function. + - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + blocks. The width, height, and depth values must either all be 0 or all be + positive. The validity of the cluster dimensions is checked at launch time. + If the value is set during compile time, it cannot be set at runtime. + Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + blocks. The width, height, and depth values must either all be 0 or all be + positive.
The validity of the cluster dimensions is checked at launch time. + If the value is set during compile time, it cannot be set at runtime. + Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + blocks. The width, height, and depth values must either all be 0 or all be + positive. The validity of the cluster dimensions is checked at launch time. + If the value is set during compile time, it cannot be set at runtime. + Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + \note The API has stricter locking requirements in comparison to its legacy counterpart + ::cuFuncSetAttribute() due to device-wide semantics. If multiple threads are trying to + set the same attribute on the same device simultaneously, the attribute setting will depend + on the interleavings chosen by the OS scheduler and memory consistency. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Attribute requested + Value to set + Kernel to set attribute of + Device to set attribute of + CUDA Error Codes + + + + Sets the preferred cache configuration for a device kernel. + On devices where the L1 cache and shared memory use the same hardware + resources, this sets through \p config the preferred cache configuration for + the device kernel \p kernel on the requested device \p dev. This is only a preference. + The driver will use the requested configuration if possible, but it is free to choose a different + configuration if required to execute \p kernel. Any context-wide preference + set via ::cuCtxSetCacheConfig() will be overridden by this per-kernel + setting. + Note that attributes set using ::cuFuncSetCacheConfig() will override the attribute + set by this API irrespective of whether the call to ::cuFuncSetCacheConfig() is made + before or after this API call. + This setting does nothing on devices where the size of the L1 cache and + shared memory are fixed. + Launching a kernel with a different preference than the most recent + preference setting may insert a device-side synchronization point. + The supported cache configurations are: + - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + \note The API has stricter locking requirements in comparison to its legacy counterpart + ::cuFuncSetCacheConfig() due to device-wide semantics. If multiple threads are trying to + set a config on the same device simultaneously, the cache config setting will depend + on the interleavings chosen by the OS scheduler and memory consistency. + + Kernel to configure cache for + Requested cache configuration + Device to set attribute of + CUDA Error Codes + + + + Returns the function name for a ::CUkernel handle + Returns in \p** name the function name associated with the kernel handle \p hfunc. + The function name is returned as a null-terminated string. 
The returned name is only + valid when the kernel handle is valid. If the library is unloaded or reloaded, one + must call the API again to get the updated name. This API may return a mangled name if + the function is not declared as having C linkage. If either \p** name or \p hfunc + is NULL, ::CUDA_ERROR_INVALID_VALUE is returned. + + The returned name of the function + The function handle to retrieve the name for + CUDA Error Codes + + + + Returns the offset and size of a kernel parameter in the device-side parameter layout + Queries the kernel parameter at \p paramIndex into \p kernel's list of parameters, and returns + in \p paramOffset and \p paramSize the offset and size, respectively, where the parameter + will reside in the device-side parameter layout. This information can be used to update kernel + node parameters from the device via ::cudaGraphKernelNodeSetParam() and + ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters + that \p kernel takes. \p paramSize can be set to NULL if only the parameter offset is desired. + + The kernel to query + The parameter index to query + Returns the offset into the device-side parameter layout at which the parameter resides + Optionally returns the size of the parameter in the device-side parameter layout + + + + Combines all API calls for memory management + + + + + Returns in free and total, respectively, the free and total amount of memory available for allocation by the + CUDA context, in bytes. + + Returned free memory in bytes + Returned total memory in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Allocates bytesize bytes of linear memory on the device and returns in dptr a pointer to the allocated memory. + The allocated memory is suitably aligned for any kind of variable. The memory is not cleared. If bytesize is 0, + returns . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned device pointer + Requested allocation size in bytes CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Allocates at least WidthInBytes * Height bytes of linear memory on the device and returns in dptr a pointer + to the allocated memory. The function may pad the allocation to ensure that corresponding pointers in any given + row will continue to meet the alignment requirements for coalescing as the address is updated from row to row. + ElementSizeBytes specifies the size of the largest reads and writes that will be performed on the memory range. + ElementSizeBytes may be 4, 8 or 16 (since coalesced memory transactions are not possible on other data sizes). If + ElementSizeBytes is smaller than the actual read/write size of a kernel, the kernel will run correctly, but possibly + at reduced speed. The pitch returned in pPitch by is the width in bytes of the allocation.
The + intended usage of pitch is as a separate parameter of the allocation, used to compute addresses within the 2D array. + Given the row and column of an array element of type T, the address is computed as: + T * pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; + The pitch returned by is guaranteed to work with under all circumstances. For + allocations of 2D arrays, it is recommended that programmers consider performing pitch allocations using . + Due to alignment restrictions in the hardware, this is especially true if the application will be performing + 2D memory copies between different regions of device memory (whether linear memory or CUDA arrays). + The byte alignment of the pitch returned by is guaranteed to match or exceed the alignment + requirement for texture binding with . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned device pointer + Returned pitch of allocation in bytes + Requested allocation width in bytes + Requested allocation height in rows + Size of largest reads/writes for range CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Frees the memory space pointed to by dptr, which must have been returned by a previous call to or + . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Pointer to memory to free CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns the base address in pbase and size in psize of the allocation by or + that contains the input pointer dptr. Both parameters pbase and psize are optional. If one of them is null, it is + ignored. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned base address + Returned size of device memory allocation + Device pointer to query CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Allocates bytesize bytes of host memory that is page-locked and accessible to the device. The driver tracks the virtual + memory ranges allocated with this function and automatically accelerates calls to functions such as . + Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than + pageable memory obtained with functions such as malloc(). Allocating excessive amounts of memory with + may degrade system performance, since it reduces the amount of memory available to the system for paging. + As a result, this function is best used sparingly to allocate staging areas for data exchange between host and device. 
- Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned host pointer to page-locked memory + Requested allocation size in bytes CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Frees the memory space pointed to by p, which must have been returned by a previous call to . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Pointer to memory to free CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Allocates bytesize bytes of host memory that is page-locked and accessible to the device. The driver tracks the virtual + memory ranges allocated with this function and automatically accelerates calls to functions such as . + Since the memory can be accessed directly by the device, it can be read or written with much higher bandwidth than + pageable memory obtained with functions such as malloc(). Allocating excessive amounts of pinned + memory may degrade system performance, since it reduces the amount of memory available to the system for paging. + As a result, this function is best used sparingly to allocate staging areas for data exchange between host and device. + For the Flags parameter see . + The CUDA context must have been created with the flag in order for the + flag to have any effect. + The flag may be specified on CUDA contexts for devices that do not support + mapped pinned memory. The failure is deferred to because the memory may be + mapped into other CUDA contexts via the flag. + The memory allocated by this function must be freed with . + Note all host memory allocated using will automatically + be immediately accessible to all contexts on all devices which support unified + addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + that may be used to access this host memory from those contexts is always equal + to the returned host pointer pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED + is specified, then the function must be used + to query the device pointer, even if the context supports unified addressing. + See \ref CUDA_UNIFIED for additional details. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned host pointer to page-locked memory + Requested allocation size in bytes + Flags for allocation request CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Passes back the device pointer pdptr corresponding to the mapped, pinned host buffer p allocated by . 
+ will fail if the flag was not specified at the + time the memory was allocated, or if the function is called on a GPU that does not support mapped pinned memory. + Flags provides for future releases. For now, it must be set to 0. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned device pointer + Host pointer + Options (must be 0) CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Passes back the flags pFlags that were specified when allocating the pinned host buffer p allocated by + . + will fail if the pointer does not reside in an allocation performed by or + . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned flags + Host pointer CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Page-locks the memory range specified by p and bytesize and maps it + for the device(s) as specified by Flags. This memory range also is added + to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + calls to functions such as . Since the memory can be accessed + directly by the device, it can be read or written with much higher bandwidth + than pageable memory that has not been registered. Page-locking excessive + amounts of memory may degrade system performance, since it reduces the amount + of memory available to the system for paging. As a result, this function is + best used sparingly to register staging areas for data exchange between + host and device. + The pointer p and size bytesize must be aligned to the host page size (4 KB). + The memory page-locked by this function must be unregistered with - Destination device pointer - Source host pointer - Size of memory copy in bytes + Host pointer to memory to page-lock + Size in bytes of the address range to page-lock + Flags for allocation request CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Unmaps the memory range whose base address is specified by p, and makes it pageable again. + The base address must be the same one specified to . - Destination device pointer - Source host pointer - Size of memory copy in bytes + Host pointer to memory to page-lock CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. 
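The page-locked host memory calls documented above combine into a common mapped-pinned-memory pattern: allocate with the device-map flag, obtain the aliasing device pointer, free when done. A minimal C sketch, assuming a current context and omitting error checks:

#include <cuda.h>

/* Sketch: mapped, page-locked host memory visible to the device. */
static void mapped_pinned_example(size_t bytes)
{
    void *hostPtr;
    CUdeviceptr devPtr;

    cuMemHostAlloc(&hostPtr, bytes, CU_MEMHOSTALLOC_DEVICEMAP);
    cuMemHostGetDevicePointer(&devPtr, hostPtr, 0); /* Flags must be 0 */

    /* ... kernels read/write devPtr while the host uses hostPtr ... */

    cuMemFreeHost(hostPtr);
}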
+ Returns information about a pointer - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned pointer attribute value + Pointer attribute to query + Pointer CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns information about a pointer - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned pointer attribute value + Pointer attribute to query + Pointer CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns information about a pointer - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned pointer attribute value + Pointer attribute to query + Pointer CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns information about a pointer - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned pointer attribute value + Pointer attribute to query + Pointer CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns information about a pointer - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned pointer attribute value + Pointer attribute to query + Pointer CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns information about a pointer - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned pointer attribute value + Pointer attribute to query + Pointer CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns information about a pointer - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned pointer attribute value + Pointer attribute to query + Pointer CUDA Error Codes: , , , - , . + , , . 
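The repeated "Returns information about a pointer" entries above are overloads of the same cuPointerGetAttribute query for different result types. A hedged fragment showing two typical queries; dptr is an assumed, already-valid device allocation and error checking is omitted:

    CUdeviceptr dptr;
    cuMemAlloc(&dptr, 4096);

    /* CU_POINTER_ATTRIBUTE_MEMORY_TYPE writes a CUmemorytype (unsigned int). */
    unsigned int memType = 0;
    cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dptr);
    /* memType == CU_MEMORYTYPE_DEVICE for pointers from cuMemAlloc */

    /* CU_POINTER_ATTRIBUTE_BUFFER_ID writes a 64-bit id, unique per allocation. */
    unsigned long long bufferId = 0;
    cuPointerGetAttribute(&bufferId, CU_POINTER_ATTRIBUTE_BUFFER_ID, dptr);

    cuMemFree(dptr);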
Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Prefetches memory to the specified destination device + Prefetches memory to the specified destination device. devPtr is the + base device pointer of the memory to be prefetched and dstDevice is the + destination device. count specifies the number of bytes to copy. hStream + is the stream in which the operation is enqueued. + + Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory. + + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device. If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages + belonging to other memory regions to make room. If there's no memory that can be + evicted, then the Unified Memory driver will prefetch less than what was requested. + + In the normal case, any mappings to the previous location of the migrated pages are + removed and mappings for the new location are only set up on the dstDevice. + The application can exercise finer control on these mappings using ::cuMemAdvise. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Pointer to be prefetched + Size in bytes + Destination device to prefetch to + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. - + + + Prefetches memory to the specified destination location + Prefetches memory to the specified destination location. \p devPtr is the + base device pointer of the memory to be prefetched and \p location specifies the + destination location. \p count specifies the number of bytes to copy. \p hStream + is the stream in which the operation is enqueued. The memory range must refer + to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + + Specifying ::CU_MEM_LOCATION_TYPE_DEVICE for ::CUmemLocation::type will prefetch memory to the GPU + specified by device ordinal ::CUmemLocation::id, which must have a non-zero value for the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Additionally, \p hStream must be associated with a device + that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + + Specifying ::CU_MEM_LOCATION_TYPE_HOST as ::CUmemLocation::type will prefetch data to host memory. + Applications can request prefetching memory to a specific host NUMA node by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA for ::CUmemLocation::type and a valid host NUMA node id in ::CUmemLocation::id. + Users can also request prefetching memory to the host NUMA node closest to the current thread's CPU by specifying + ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT for ::CUmemLocation::type. Note that when ::CUmemLocation::type is either + ::CU_MEM_LOCATION_TYPE_HOST or ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, ::CUmemLocation::id will be ignored. + The start address and end address of the memory range will be rounded down and rounded up + respectively to be aligned to CPU page size before the prefetch operation is enqueued + in the stream.
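The cuMemPrefetchAsync entries above are easiest to see in code. A short sketch, assuming dev is a CUdevice whose CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS attribute is non-zero and a context is current; error checking omitted:

    size_t bytes = 1 << 20;
    CUdeviceptr managed;
    CUstream stream;
    cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
    cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL);

    /* Migrate the pages to the GPU ahead of the kernel that touches them. */
    cuMemPrefetchAsync(managed, bytes, dev, stream);
    /* ... launch kernels on stream that read or write managed ... */

    /* Bring the pages back so host reads do not fault one page at a time. */
    cuMemPrefetchAsync(managed, bytes, CU_DEVICE_CPU, stream);
    cuStreamSynchronize(stream);

    cuMemFree(managed);
    cuStreamDestroy(stream);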
+ + If no physical memory has been allocated for this region, then this memory region + will be populated and mapped on the destination device.If there's insufficient + memory to prefetch the desired region, the Unified Memory driver may evict pages from other + ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + allocated using ::cuMemAlloc or::cuArrayCreate will not be evicted. + + By default, any mappings to the previous location of the migrated pages are removed and + mappings for the new location are only setup on the destination location.The exact behavior however + also depends on the settings applied to this memory range via::cuMemAdvise as described + below: + + If::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + then that subset will create a read-only copy of the pages on destination location. + If however the destination location is a host NUMA node, then any pages of that subset + that are already in another host NUMA node will be transferred to the destination. + + If::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + range, then the pages will be migrated to \p location even if \p location is not the + preferred location of any pages in the memory range. + + If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + then mappings to those pages from all the appropriate processors are updated to + refer to the new location if establishing such a mapping is possible.Otherwise, + those mappings are cleared. + + Note that this API is not required for functionality and only serves to improve performance + by allowing the application to migrate data to a suitable location before it is accessed. + + Memory accesses to this range are always coherent and are allowed even when the data is + actively being migrated. + + + Pointer to be prefetched + Size in bytes + Destination device to prefetch to + flags for future use, must be zero now. + Stream to enqueue prefetch operation + Note that this function is asynchronous with respect to the host and all work on other devices. + + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to. This allows the driver to create read-only + copies of the data in a processor's memory when that processor accesses it. Similarly, + if cuMemPrefetchAsync is called on this region, it will create a read-only copy of + the data on the destination processor. When a processor writes to this data, all copies + of the corresponding page are invalidated except for the one where the write occurred. + The \p device argument is ignored for this advice. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read + duplicated copies of the data will be freed no later than the next write access to that data. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p device. 
Passing in CU_DEVICE_CPU for \p device sets the + preferred location as CPU memory. Setting the preferred location does not cause data to + migrate to that location immediately. Instead, it guides the migration policy when a fault + occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + the migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU + memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But + if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely. + When the Unified Memory driver has to evict pages from a certain location on account of that + memory being oversubscribed, the preferred location will be used to decide the destination to which + a page should be evicted to. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred + location will be ignored for that subset. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + This does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is useful in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high. But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in CPU memory. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of + mappings may be removed at any time causing accesses to result in page faults. + + Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. 
+ Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + Device to apply the advice for + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Advise about the usage of a given memory range + Advise the Unified Memory subsystem about the usage pattern for the memory range + starting at \p devPtr with a size of \p count bytes.The start address and end address of the memory + range will be rounded down and rounded up respectively to be aligned to CPU page size before the + advice is applied.The memory range must refer to managed memory allocated via ::cuMemAllocManaged + or declared via __managed__ variables.The memory range could also refer to system-allocated pageable + memory provided it represents a valid, host-accessible region of memory and all additional constraints + imposed by \p advice as outlined below are also satisfied.Specifying an invalid system-allocated pageable + memory range results in an error being returned. + + The \p advice parameter can take the following values: + - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + from and only occasionally written to.Any read accesses from any processor to this region will create a + read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + or::cuMemPrefetchAsync_v2 is called on this region, it will create a read-only copy of the data on the destination processor. + If the target location for ::cuMemPrefetchAsync_v2 is a host NUMA node and a read-only copy already exists on + another host NUMA node, that copy will be migrated to the targeted host NUMA node. + If any processor writes to this region, all copies of the corresponding page will be invalidated + except for the one where the write occurred. If the writing processor is the CPU and the preferred location of + the page is a host NUMA node, then the page will also be migrated to that host NUMA node. The \p location argument is ignored for this advice. + Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + that has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Also, if a context is created on a device that does not have the device attribute + ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + all such contexts are destroyed. + If the memory region refers to valid system-allocated pageable memory, then the accessing device must + have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + will not create a read-only copy when that device accesses this memory region. + - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + Unified Memory driver from attempting heuristic read-duplication on the memory range.Any read-duplicated + copies of the data will be collapsed into a single copy. 
The location for the collapsed + copy will be the preferred location if the page has a preferred location and one of the read-duplicated + copies was resident at that location. Otherwise, the location chosen is arbitrary. + Note: The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + data to be the memory belonging to \p location. When ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_HOST, + ::CUmemLocation::id is ignored and the preferred location is set to be host memory. To set the preferred location + to a specific host NUMA node, applications must set ::CUmemLocation::type to ::CU_MEM_LOCATION_TYPE_HOST_NUMA and + ::CUmemLocation::id must specify the NUMA ID of the host NUMA node. If ::CUmemLocation::type is set to ::CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + ::CUmemLocation::id will be ignored and the host NUMA node closest to the calling thread's CPU will be used as the preferred location. + If ::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE, then ::CUmemLocation::id must be a valid device ordinal + and the device must have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + Setting the preferred location does not cause data to migrate to that location immediately. Instead, it guides the migration policy + when a fault occurs on that memory region. If the data is already in its preferred location and the + faulting processor can establish a mapping without requiring the data to be migrated, then + data migration will be avoided. On the other hand, if the data is not in its preferred location + or if a direct mapping cannot be established, then it will be migrated to the processor accessing + it. It is important to note that setting the preferred location does not prevent data prefetching + done using ::cuMemPrefetchAsync. + Having a preferred location can override the page thrash detection and resolution logic in the Unified + Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice, unless read accesses from + \p location will not result in a read-only copy being created on that processor as outlined in the description for + the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + If the memory region refers to valid system-allocated pageable memory, and ::CUmemLocation::type is CU_MEM_LOCATION_TYPE_DEVICE + then ::CUmemLocation::id must be a valid device that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + and changes the preferred location to none. The \p location argument is ignored for this advice. + - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by processor \p location. + The ::CUmemLocation::type must be either ::CU_MEM_LOCATION_TYPE_DEVICE with ::CUmemLocation::id representing a valid device + ordinal or ::CU_MEM_LOCATION_TYPE_HOST and ::CUmemLocation::id will be ignored. All other location types are invalid.
+ If::CUmemLocation::id is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + This advice does not cause data migration and has no impact on the location of the data per se. Instead, + it causes the data to always be mapped in the specified processor's page tables, as long as the + location of the data permits a mapping to be established. If the data gets migrated for any reason, + the mappings are updated accordingly. + This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + data located on one GPU is occasionally accessed by peer GPUs.In such scenarios, migrating data + over to the other GPUs is not as important because the accesses are infrequent and the overhead of + migration may be too high.But preventing faults can still help improve performance, and so having + a mapping set up in advance is useful.Note that on CPU access of this data, the data may be migrated + to host memory because the CPU typically cannot access device memory directly.Any GPU that had the + ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + page in host memory. + If::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + policies associated with that advice will override the policies of this advice.Additionally, if the + preferred location of this memory region or any subset of it is also \p location, then the policies + associated with::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY.Any mappings to + the data from \p location may be removed at any time causing accesses to result in non-fatal page faults. + If the memory region refers to valid system-allocated pageable memory, and::CUmemLocation::type is ::CU_MEM_LOCATION_TYPE_DEVICE + then device in ::CUmemLocation::id must have a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + Additionally, if ::CUmemLocation::id has a non-zero value for the device attribute::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + then this call has no effect. + + Note that this function is asynchronous with respect to the host and all work + on other devices. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Pointer to memory to set the advice for + Size in bytes of the memory range + Advice to be applied for the specified memory range + location to apply the advice for + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. 
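Before the next entry, a compact sketch of the advice values just described, applied through the original CUdevice-based cuMemAdvise; dev is an assumed device with concurrent managed access, and the advice combination is only illustrative:

    CUdeviceptr p;
    size_t n = 1 << 20;
    cuMemAllocManaged(&p, n, CU_MEM_ATTACH_GLOBAL);

    /* Read-duplicate pages on every processor that reads them. */
    cuMemAdvise(p, n, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);
    /* Keep the home copy in CPU memory when pages must be evicted. */
    cuMemAdvise(p, n, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, CU_DEVICE_CPU);
    /* Keep a mapping for dev so its accesses do not fault. */
    cuMemAdvise(p, n, CU_MEM_ADVISE_SET_ACCESSED_BY, dev);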
+ Query an attribute of a given memory range - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + A pointer to a memory location where the result of each attribute query will be written to. + Array containing the size of data + The attribute to query + Start of the range to query + Size of the range to query - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Query attributes of a given memory range. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + A two-dimensional array containing pointers to memory locations where the result of each attribute query will be written to. + Array containing the sizes of each result + An array of attributes to query (numAttributes and the number of attributes in this array should match) + Number of attributes to query + Start of the range to query + Size of the range to query - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Allocates memory that will be automatically managed by the Unified Memory system + + Allocates bytesize bytes of managed memory on the device and returns in + dptr a pointer to the allocated memory. If the device doesn't support + allocating managed memory, is returned. Support + for managed memory can be queried using the device attribute + . The allocated memory is suitably + aligned for any kind of variable. The memory is not cleared. If bytesize + is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer + is valid on the CPU and on all GPUs in the system that support managed memory. + All accesses to this pointer must obey the Unified Memory programming model. + + flags specifies the default stream association for this allocation. + flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If + ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from + any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the + allocation is created with initial visibility restricted to host access only; + an explicit call to ::cuStreamAttachMemAsync will be required to enable access + on the device. + + If the association is later changed via ::cuStreamAttachMemAsync to + a single stream, the default association as specified during ::cuMemAllocManaged + is restored when that stream is destroyed. For __managed__ variables, the + default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a + stream is an asynchronous operation, and as a result, the change to default + association won't happen until all work in the stream has completed. + + Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. + + On a multi-GPU system with peer-to-peer support, where multiple GPUs support + managed memory, the physical storage is created on the GPU which is active + at the time ::cuMemAllocManaged is called. All other GPUs will reference the + data at reduced bandwidth via peer mappings over the PCIe bus.
The Unified + Memory management system does not migrate memory between GPUs. + + On a multi-GPU system where multiple GPUs support managed memory, but not + all pairs of such GPUs have peer-to-peer support between them, the physical + storage is created in 'zero-copy' or system memory. All GPUs will reference + the data at reduced bandwidth over the PCIe bus. In these circumstances, + use of the environment variable, CUDA_VISIBLE_DEVICES, is recommended to + restrict CUDA to only use those GPUs that have peer-to-peer support. This + environment variable is described in the CUDA programming guide under the + "CUDA environment variables" section. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Returned device pointer + Requested allocation size in bytes + Must be one of or CUDA Error Codes: , , , - , . + , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Registers a callback function to receive async notifications + Registers \p callbackFunc to receive async notifications. + The \p userData parameter is passed to the callback function at async notification time. + Likewise, \p callback is also passed to the callback function to distinguish between + multiple registered callbacks. + The callback function being registered should be designed to return quickly (~10ms). + Any long running tasks should be queued for execution on an application thread. + Callbacks may not call cuDeviceRegisterAsyncNotification or cuDeviceUnregisterAsyncNotification. + Doing so will result in ::CUDA_ERROR_NOT_PERMITTED.Async notification callbacks execute + in an undefined order and may be serialized. + Returns in \p* callback a handle representing the registered callback instance. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + The device on which to register the callback + The function to register as a callback + A generic pointer to user data. This is passed into the callback function. + A handle representing the registered callback instance - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Unregisters an async notification callback + Unregisters \p callback so that the corresponding callback function will stop receiving + async notifications. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + The device from which to remove \p callback. + The callback instance to unregister from receiving async notifications. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Set attributes on a previously allocated memory region + The supported attributes are: + : A boolean attribute that can either be set (1) or unset (0). 
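Tying the managed-memory and range-attribute entries above together, a hedged fragment that checks whether the read-mostly advice is in effect on a range (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY is documented to return a 32-bit integer); dev is again an assumed device handle:

    CUdeviceptr m;
    cuMemAllocManaged(&m, 4096, CU_MEM_ATTACH_GLOBAL);
    cuMemAdvise(m, 4096, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);

    int readMostly = 0;
    cuMemRangeGetAttribute(&readMostly, sizeof(readMostly),
                           CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, m, 4096);
    /* readMostly == 1 once the advice covers the whole range */
    cuMemFree(m);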
When set, + the region of memory that ptr points to is guaranteed to always synchronize + memory operations that are synchronous. If there are some previously initiated + synchronous memory operations that are pending when this attribute is set, the + function does not return until those memory operations are complete. + See further documentation in the section titled "API synchronization behavior" + to learn more about cases when synchronous memory operations can + exhibit asynchronous behavior. + value will be considered as a pointer to an unsigned integer to which this attribute is to be set. - Destination device pointer - Source host pointer - Size of memory copy in bytes + Pointer to memory containing the value to be set + Pointer attribute to set + Pointer to a memory region allocated using CUDA memory allocation APIs CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , , . - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Returns information about a pointer. + The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + + - ::CU_POINTER_ATTRIBUTE_CONTEXT + - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE + - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER + - ::CU_POINTER_ATTRIBUTE_HOST_POINTER + - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS + - ::CU_POINTER_ATTRIBUTE_BUFFER_ID + - ::CU_POINTER_ATTRIBUTE_IS_MANAGED - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Number of attributes to query + An array of attributes to query (numAttributes and the number of attributes in this array should match) + A two-dimensional array containing pointers to memory + locations where the result of each attribute query will be written to. + Pointer to query + - + + + Allocate an address range reservation. + Reserves a virtual address range based on the given parameters, giving + the starting address of the range in \p ptr. This API requires a system that + supports UVA. The size and address parameters must be a multiple of the + host page size and the alignment must be a power of two or zero for default + alignment. + + Resulting pointer to start of virtual address range allocated + Size of the reserved virtual address range requested + Alignment of the reserved virtual address range requested + Fixed starting address range requested + Currently unused, must be zero + + + + Free an address range reservation. + Frees a virtual address range reserved by cuMemAddressReserve. The size + must match what was given to cuMemAddressReserve and the ptr given must + match what was returned from cuMemAddressReserve. + + Starting address of the virtual address range to free + Size of the virtual address region to free + + + + Create a shareable memory handle representing a memory allocation of a given size described by the given properties + This creates a memory allocation on the target device specified through the + \p prop structure. The created allocation will not have any device or host + mappings. The generic memory \p handle for the allocation can be + mapped to the address space of the calling process via ::cuMemMap. This handle + cannot be transmitted directly to other processes (see + ::cuMemExportToShareableHandle).
On Windows, the caller must also pass + an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which + limits or allows access to this handle for a recipient process (see + ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this + allocation must be a multiple of the value given via + ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM + flag. + + Value of handle returned. All operations on this allocation are to be performed using this handle. + Size of the allocation requested + Properties of the allocation to create. + flags for future use, must be zero now. + + + + Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate. + Frees the memory that was allocated on a device through cuMemCreate. + + The memory allocation will be freed when all outstanding mappings to the memory + are unmapped and when all outstanding references to the handle (including its + shareable counterparts) are also released. The generic memory handle can be + freed when there are still outstanding mappings made with this handle. Each + time a recipient process imports a shareable handle, it needs to pair it with + ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle + the behavior is undefined. + + handle Value of handle which was returned previously by cuMemCreate. + + + + Maps an allocation handle to a reserved virtual address range. + Maps bytes of memory represented by \p handle starting from byte \p offset to + \p size to address range [\p addr, \p addr + \p size]. This range must be an + address reservation previously reserved with ::cuMemAddressReserve, and + \p offset + \p size must be less than the size of the memory allocation. + \p ptr, \p size, and \p offset must all be a multiple of the value given via + ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag. + Please note that calling ::cuMemMap does not make the address accessible; + the caller needs to update accessibility of a contiguous mapped VA + range by calling ::cuMemSetAccess. + Once a recipient process obtains a shareable memory handle + from ::cuMemImportFromShareableHandle, the process must + use ::cuMemMap to map the memory into its address ranges before + setting accessibility with ::cuMemSetAccess. + ::cuMemMap can only create mappings on VA range reservations + that are not currently mapped. + + Address where memory will be mapped. + Size of the memory mapping. + Offset into the memory represented by \p handle from which to start mapping - Note: currently must be zero. + Handle to a shareable memory + flags for future use, must be zero now. + + + + Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays + + List of ::CUarrayMapInfo + Count of ::CUarrayMapInfo in \p mapInfoList + Stream identifier for the stream to use for map or unmap operations + + + + + Unmap the backing memory of a given address range. + The range must be the entire contiguous address range that was mapped to. In + other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped + by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed + if there are no existing mappings and there are no unreleased memory handles.
+ When ::cuMemUnmap returns successfully the address range is converted to an + address reservation and can be used for future calls to ::cuMemMap. Any new + mapping to this virtual address will need to have access granted through + ::cuMemSetAccess, as all mappings start with no accessibility setup. + + Starting address for the virtual address range to unmap + Size of the virtual address range to unmap + + + + Set the access flags for each location specified in \p desc for the given virtual address range + Given the virtual address range via \p ptr and \p size, and the locations + in the array given by \p desc and \p count, set the access flags for the + target locations. The range must be a fully mapped address range + containing all allocations created by ::cuMemMap / ::cuMemCreate. + + Starting address for the virtual address range + Length of the virtual address range + Array of ::CUmemAccessDesc that describe how to change the mapping for each location specified + Number of ::CUmemAccessDesc in \p desc + + + + Get the access \p flags set for the given \p location and \p ptr + + Flags set for this location + Location in which to check the flags for + Address in which to check the access flags for + + + + Exports an allocation to a requested shareable handle type + Given a CUDA memory handle, create a shareable memory + allocation handle that can be used to share the memory with other + processes. The recipient process can convert the shareable handle back into a + CUDA memory handle using ::cuMemImportFromShareableHandle and map + it with ::cuMemMap. The implementation of what this handle is and how it + can be transferred is defined by the requested handle type in \p handleType. + Once all shareable handles are closed and the allocation is released, the allocated + memory referenced will be released back to the OS and uses of the CUDA handle afterward + will lead to undefined behavior. + This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL) + that support importing memory from the shareable type + + Pointer to the location in which to store the requested handle type + CUDA handle for the memory allocation + Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter) + Reserved, must be zero + + - Copies from host memory to device memory. + Imports an allocation from a requested shareable handle type. + If the current process cannot support the memory described by this shareable + handle, this API will error as CUDA_ERROR_NOT_SUPPORTED. + \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.) + created on devices under an SLI group may not be supported, and thus this API will + return CUDA_ERROR_NOT_SUPPORTED. + There is no guarantee that the contents of \p handle will be the same CUDA memory handle + for the same given OS shareable handle, or the same underlying allocation. - CUDA Memory handle for the memory allocation. + Shareable Handle representing the memory allocation that is to be imported. + handle type of the exported handle ::CUmemAllocationHandleType. - + - Copies from host memory to device memory.
dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Calculates either the minimal or recommended granularity + Calculates either the minimal or recommended granularity + for a given allocation specification and returns it in granularity.This + granularity can be used as a multiple for alignment, size, or address mapping. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + granularity Returned granularity. + prop Property for which to determine the granularity for + option Determines which granularity to return - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Retrieve the contents of the property structure defining properties for this handle - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Pointer to a properties structure which will hold the information about this handle + Handle which to perform the query on + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Given an address \p addr, returns the allocation handle of the backing memory allocation. + The handle is guaranteed to be the same handle value used to map the memory. If the address + requested is not mapped, the function will fail.The returned handle must be released with + corresponding number of calls to::cuMemRelease. + + The address \p addr, can be any address in a range previously mapped + by::cuMemMap, and not necessarily the start address. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + CUDA Memory handle for the backing memory allocation. + Memory address to query, that has been mapped previously. - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Frees memory with stream ordered semantics + Inserts a free operation into \p hStream. + The allocation must not be accessed after stream execution reaches the free. + After this API returns, accessing the memory from any subsequent work launched on the GPU + or querying its pointer attributes results in undefined behavior. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + memory to free + The stream establishing the stream ordering contract. + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. 
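The cuMemAddressReserve / cuMemCreate / cuMemMap / cuMemSetAccess / cuMemGetAllocationGranularity entries above are pieces of one virtual memory management workflow. A condensed sketch of the usual sequence on a single device; dev is an assumed CUdevice, and all calls are shown unchecked:

    /* 1. Describe the physical allocation and query its granularity. */
    CUmemAllocationProp prop = {0};
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = dev;

    size_t gran = 0;
    cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    size_t size = gran;  /* sizes and offsets must be multiples of this */

    /* 2. Create physical memory and reserve a virtual address range. */
    CUmemGenericAllocationHandle handle;
    cuMemCreate(&handle, size, &prop, 0);
    CUdeviceptr va = 0;
    cuMemAddressReserve(&va, size, 0, 0, 0);

    /* 3. Map, then grant access: mappings start with no accessibility. */
    cuMemMap(va, size, 0, handle, 0);
    CUmemAccessDesc access = {0};
    access.location = prop.location;
    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(va, size, &access, 1);

    /* ... use va like any device pointer ... */

    /* 4. Tear down in reverse order. */
    cuMemUnmap(va, size);
    cuMemRelease(handle);
    cuMemAddressFree(va, size);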
+ Allocates memory with stream ordered semantics + Inserts an allocation operation into \p hStream. + A pointer to the allocated memory is returned immediately in *dptr. + The allocation must not be accessed until the allocation operation completes. + The allocation comes from the memory pool current to the stream's device. + + note The default memory pool of a device contains device memory from that device. + note Basic stream ordering allows future work submitted into the same stream to use the allocation. + Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + operation completes before work submitted in a separate stream runs. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned device pointer + Number of bytes to allocate + The stream establishing the stream ordering contract and the memory pool to allocate from + + + + + Tries to release memory back to the OS + Releases memory back to the OS until the pool contains fewer than minBytesToKeep + reserved bytes, or there is no more memory that the allocator can safely release. + The allocator cannot release OS allocations that back outstanding asynchronous allocations. + The OS allocations may happen at different granularity from the user allocations. + + note: Allocations that have not been freed count as outstanding. + note: Allocations that have been asynchronously freed but whose completion has + not been observed on the host (e.g. by a synchronize) can count as outstanding. + + The memory pool to trim + If the pool has less than minBytesToKeep reserved, + the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have at least minBytesToKeep bytes reserved after the operation. + + + + + Sets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release + threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next + call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow ::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. + CUDA events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency + between the free and allocation. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow ::cuMemAllocAsync to insert new stream dependencies + in order to establish the stream ordering required to reuse + a piece of memory released by ::cuMemFreeAsync (default enabled).
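A sketch of the stream-ordered allocator and the release-threshold attribute just listed; dev and the 64 MB threshold are illustrative, and the attribute takes a cuuint64_t as documented:

    CUstream s;
    cuStreamCreate(&s, CU_STREAM_NON_BLOCKING);

    /* Let the default pool keep up to 64 MB cached between synchronizations. */
    CUmemoryPool defaultPool;
    cuDeviceGetDefaultMemPool(&defaultPool, dev);
    cuuint64_t threshold = 64ULL << 20;
    cuMemPoolSetAttribute(defaultPool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);

    CUdeviceptr buf;
    cuMemAllocAsync(&buf, 1 << 20, s);  /* usable by later work in s */
    /* ... kernels on s that use buf ... */
    cuMemFreeAsync(buf, s);             /* buf must not be touched after this point in s */
    cuStreamSynchronize(s);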
+ + The memory pool to modify + The attribute to modify + Pointer to the value to assign + + + + + Sets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS.When more than the release + threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next + call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. + Cuda events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency + between the free and allocation. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to insert new stream dependencies + in order to establish the stream ordering required to reuse + a piece of memory released by::cuMemFreeAsync(default enabled). + + The memory pool to modify + The attribute to modify + Pointer to the value to assign + + + + + Gets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying + to release memory back to the OS.When more than the release + threshold bytes of memory are held by the memory pool, the + allocator will try to release memory back to the OS on the + next call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. + Cuda events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency between the free and allocation. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to insert new stream dependencies in order to establish the stream ordering + required to reuse a piece of memory released by::cuMemFreeAsync(default enabled). + + The memory pool to get attributes of + The attribute to get + Retrieved value + + + + + Gets attributes of a memory pool + Supported attributes are: + - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + Amount of reserved memory in bytes to hold onto before trying + to release memory back to the OS.When more than the release + threshold bytes of memory are held by the memory pool, the + allocator will try to release memory back to the OS on the + next call to stream, event or context synchronize. (default 0) + - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + Allow::cuMemAllocAsync to use memory asynchronously freed + in another stream as long as a stream ordering dependency + of the allocating stream on the free action exists. 
+ CUDA events and null stream interactions can create the required + stream ordered dependencies. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + Allow reuse of already completed frees when there is no dependency between the free and allocation. (default enabled) + - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + Allow ::cuMemAllocAsync to insert new stream dependencies in order to establish the stream ordering + required to reuse a piece of memory released by ::cuMemFreeAsync (default enabled). + + The memory pool to get attributes of + The attribute to get + Retrieved value + + + + + Controls visibility of pools between devices + + The pool being modified + Array of access descriptors. Each descriptor instructs the access to enable for a single GPU. + Number of descriptors in the map array. + + + + + Returns the accessibility of a pool from a device + Returns the accessibility of the pool's memory from the specified location. + + the accessibility of the pool from the specified location + the pool being queried + the location accessing the pool + + + + + Creates a memory pool + Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines + the properties of the pool such as the backing device and IPC capabilities. + By default, the pool's memory will be accessible from the device it is allocated on. + note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. + + + + + + + + Destroys the specified memory pool + If any pointers obtained from this pool haven't been freed or + the pool has free operations that haven't completed + when ::cuMemPoolDestroy is invoked, the function will return immediately and the + resources associated with the pool will be released automatically + once there are no more outstanding allocations. + Destroying the current mempool of a device sets the default mempool of + that device as the current mempool for that device. + note A device's default memory pool cannot be destroyed. + + + + + + + Allocates memory from a specified pool with stream ordered semantics. + Inserts an allocation operation into \p hStream. + A pointer to the allocated memory is returned immediately in *dptr. + The allocation must not be accessed until the allocation operation completes. + The allocation comes from the specified memory pool. + note + - The specified memory pool may be from a device different than that of the specified \p hStream. + - Basic stream ordering allows future work submitted into the same stream to use the allocation. + Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + operation completes before work submitted in a separate stream runs. + + Returned device pointer + Number of bytes to allocate + The pool to allocate from + The stream establishing the stream ordering semantic + - + - Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination - and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Exports a memory pool to the requested handle type. + Given an IPC capable mempool, create an OS handle to share the pool with another process. + A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. + Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs.
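For the cuMemPoolCreate / cuMemAllocFromPoolAsync entries above, a hedged sketch of an explicit (non-default) pool; leaving handleTypes at CU_MEM_HANDLE_TYPE_NONE makes the pool non-IPC-capable, and dev and s are assumed from the earlier fragments:

    CUmemPoolProps props = {0};
    props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
    props.handleTypes = CU_MEM_HANDLE_TYPE_NONE;   /* no IPC for this pool */
    props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    props.location.id = dev;

    CUmemoryPool pool;
    cuMemPoolCreate(&pool, &props);

    CUdeviceptr d;
    cuMemAllocFromPoolAsync(&d, 1 << 20, pool, s);
    /* ... work on s ... */
    cuMemFreeAsync(d, s);
    cuStreamSynchronize(s);
    cuMemPoolDestroy(pool);  /* resources linger until outstanding allocations are freed */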
+ The implementation of what the shareable handle is and how it can be transferred is defined by the requested + handle type. + note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned OS handle + pool to export + the type of handle to create + must be 0 + + + + Imports a memory pool from a shared handle. + Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. + note Imported memory pools do not support creating new allocations. As such imported memory pools + may not be used in cuDeviceSetMemPool or ::cuMemAllocFromPoolAsync calls. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned memory pool + OS handle of the pool to open + The type of handle being imported + must be 0 + + + + Export data to share a memory pool allocation between processes. + Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. + The recipient process can import the allocation with the ::cuMemPoolImportPointer API. + The data is not a handle and may be shared through any IPC mechanism. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned export data + pointer to memory being exported + + + + Import a memory pool allocation from another process. + Returns in \p ptr_out a pointer to the imported memory. + The imported memory must not be accessed before the allocation operation completes + in the exporting process. The imported memory must be freed from all importing processes before + being freed in the exporting process. The pointer may be freed with cuMemFree + or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed + on the importing process before the free operation on the exporting process. + note The cuMemFreeAsync API may be used in the exporting process before + the cuMemFreeAsync operation completes in its stream as long as the + cuMemFreeAsync in the exporting process specifies a stream with + a stream dependency on the importing process's cuMemFreeAsync. - Destination device pointer - Source host pointer - Size of memory copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches.
+
-
+
-
- Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
- and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
+ Retrieve handle for an address range
+ Get a handle of the specified type to an address range. The address range
+ must have been obtained by a prior call to either ::cuMemAlloc or ::cuMemAddressReserve.
+ If the address range was obtained via ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap.
+ Users must ensure the \p dptr and \p size are aligned to the host page size.
+ When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+ users are expected to query for dma_buf support for the platform
+ by using the ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before calling
+ this API. The \p handle will be interpreted as a pointer to an integer to store the dma_buf file descriptor.
+ Users must ensure the entire address range is backed and mapped when
+ the address range is allocated by ::cuMemAddressReserve. All the physical
+ allocations backing the address range must be resident on the same device and
+ have identical allocation properties. Users are also expected to retrieve a
+ new handle every time the underlying physical allocation(s) corresponding
+ to a previously queried VA range are changed.
- Destination device pointer
- Source host pointer
- Size of memory copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Pointer to the location where the returned handle will be stored.
+ Pointer to a valid CUDA device allocation. Must be aligned to host page size.
+ Length of the address range. Must be aligned to host page size.
+ Type of handle requested (defines type and size of the \p handle output parameter)
+ Reserved, must be zero
+
-
+
-
- Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
- and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
+ Intra-device memcpys done with these functions may execute in parallel with the CPU,
+ but if host memory is involved, they wait until the copy is done before returning.
-
- Destination device pointer
- Source host pointer
+
+
+ Copies data between two pointers.
+ dst and src are base pointers of the destination and source, respectively.
+ ByteCount specifies the number of bytes to copy.
+ Note that this function infers the type of the transfer (host to host, host to
+ device, device to device, or device to host) from the pointer values. This
+ function is only allowed in contexts which support unified addressing.
+ Note that this function is synchronous.
+
+ Destination unified virtual address space pointer
+ Source unified virtual address space pointer
Size of memory copy in bytes
CUDA Error Codes: , , ,
, .
Note that this function may also return error codes from previous, asynchronous launches.
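Under unified addressing, the cuMemcpy documented above can replace both directional copies, since the transfer type is inferred from the pointer values. A small sketch; buffer sizes are arbitrary and byteCount is assumed to fit.

#include <cuda.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define CHECK(c) do { if ((c) != CUDA_SUCCESS) abort(); } while (0)

/* One entry point for both directions: cuMemcpy infers host-to-device or
   device-to-host from the pointer values (unified addressing required). */
void roundtrip(CUdeviceptr dbuf, size_t byteCount) {
    unsigned char src[256], dst[256];   /* byteCount assumed <= 256 here */
    memset(src, 0xAB, sizeof src);

    CHECK(cuMemcpy(dbuf, (CUdeviceptr)(uintptr_t)src, byteCount)); /* host -> device */
    CHECK(cuMemcpy((CUdeviceptr)(uintptr_t)dst, dbuf, byteCount)); /* device -> host */
}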
-
+
-
- Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
- and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
+ Copies from device memory in one context to device memory in another
+ context. dstDevice is the base device pointer of the destination memory
+ and dstContext is the destination context. srcDevice is the base
+ device pointer of the source memory and srcContext is the source context.
+ ByteCount specifies the number of bytes to copy.
+
+ Note that this function is asynchronous with respect to the host, but
+ serialized with respect to all pending and future asynchronous work in the
+ current context, srcContext, and dstContext (use
+ to avoid this synchronization).
Destination device pointer
- Source host pointer
+ Destination context
+ Source device pointer
+ Source context
Size of memory copy in bytes
CUDA Error Codes: , , ,
, .
Note that this function may also return error codes from previous, asynchronous launches.
-
+
-
- Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
- and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
+ Perform a 3D memory copy according to the parameters specified in
+ pCopy. See the definition of the structure
+ for documentation of its parameters.
+ Note that this function is synchronous with respect to the host only if
+ the source or destination memory is of type ::CU_MEMORYTYPE_HOST.
+ Note also that this copy is serialized with respect to all pending and future
+ asynchronous work in the current context, the copy's source context,
+ and the copy's destination context (use to avoid
+ this synchronization).
- Destination device pointer
- Source host pointer
- Size of memory copy in bytes
+ Parameters for the memory copy
CUDA Error Codes: , , ,
, .
Note that this function may also return error codes from previous, asynchronous launches.
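A sketch of the peer copy just documented; both contexts and both device pointers are assumed to already exist.

#include <cuda.h>
#include <stdlib.h>

#define CHECK(c) do { if ((c) != CUDA_SUCCESS) abort(); } while (0)

/* Copies byteCount bytes between device allocations owned by two different
   contexts (typically two devices). Asynchronous with respect to the host,
   but serialized against pending work in the current, source, and destination
   contexts; the documentation points at the Async variant to avoid that. */
void peer_copy(CUcontext dstCtx, CUdeviceptr dstDevice,
               CUcontext srcCtx, CUdeviceptr srcDevice, size_t byteCount) {
    CHECK(cuMemcpyPeer(dstDevice, dstCtx, srcDevice, srcCtx, byteCount));
}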
-
+
Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
@@ -90604,7 +112198,7 @@
, .
Note that this function may also return error codes from previous, asynchronous launches.
-
+
Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
@@ -90856,1159 +112450,1027 @@
, .
Note that this function may also return error codes from previous, asynchronous launches.
-
- Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and
- source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
-
- Destination host pointer
- Source device pointer
- Size of memory copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
-
+
- Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and
- source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
+ Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
+ and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.
-
- Destination host pointer
- Source device pointer
+ Destination device pointer
+ Source host pointer
Size of memory copy in bytes
CUDA Error Codes: , , ,
, .
Note that this function may also return error codes from previous, asynchronous launches.
dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. 
dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. 
dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and - source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. + Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination + and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination host pointer - Source device pointer + Destination device pointer + Source host pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + Copies from device to host memory. 
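For readers of this diff, the corrected doc text maps onto the two synchronous copy directions exposed by ManagedCuda, the CUDA wrapper this repo already depends on. Below is a minimal sketch of the round trip, assuming ManagedCuda's CudaContext and CudaDeviceVariable wrappers and a CUDA-capable device 0; sizes and values are illustrative only:

using ManagedCuda;

// Sketch: CudaDeviceVariable<T> wraps cuMemAlloc/cuMemcpyHtoD/cuMemcpyDtoH/cuMemFree.
using (var ctx = new CudaContext(0))                  // create and bind a context on device 0
using (var dev = new CudaDeviceVariable<float>(1024)) // device allocation of 1024 floats
{
    var host = new float[1024];
    for (var i = 0; i < host.Length; i++) host[i] = i;

    dev.CopyToDevice(host); // host to device (synchronous, cuMemcpyHtoD underneath)

    var back = new float[1024];
    dev.CopyToHost(back);   // device to host (synchronous, cuMemcpyDtoH underneath)
}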
[A long run of renumbered hunks follows (from @@ -92020,7 +113482,7 @@ onward) in which the visible device-to-host doc text, "Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous.", appears only as unchanged context; the lines these hunks actually modify were XML tags lost in extraction.]
@@ -92140,3971 +113602,3976 @@
[In this large hunk the doc blocks are rearranged: the old positions held the device-to-device, device-to-array, array-to-device, and (many overloads of) host-to-array copy docs, and the new positions hold further copies of the device-to-host block quoted above. Representative removed blocks:]
-            Copies from device memory to device memory. dstDevice and srcDevice are the base pointers of the destination
-            and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is asynchronous.
-            Copies from device memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
-            starting index of the destination data. srcDevice specifies the base pointer of the source. ByteCount specifies
-            the number of bytes to copy.
-            Copies from one 1D CUDA array to device memory. dstDevice specifies the base pointer of the destination and
-            must be naturally aligned with the CUDA array elements. srcArray and srcOffset specify the CUDA array
-            handle and the offset in bytes into the array where the copy is to begin. ByteCount specifies the number of bytes to
-            copy and must be evenly divisible by the array element size.
-            Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
-            starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
-            the number of bytes to copy.
[The host-to-array block above, with its dstArray/dstOffset/pSrc parameter docs, is removed once per host-to-array overload, and a device-to-host block appears in its place each time.]
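Because the array-copy docs are the ones most easily misread, here is a companion sketch of the host-to-array and array-to-host round trip under the constraints stated above (offsets are in bytes; for array-to-device copies ByteCount must divide evenly by the element size). This assumes ManagedCuda's CudaArray1D helper; the copy method names are from recent ManagedCuda versions and may differ in older ones:

using ManagedCuda;
using ManagedCuda.BasicTypes;

// Sketch: a 256-element float CUDA array with one channel.
using (var ctx = new CudaContext(0))
using (var arr = new CudaArray1D(CUArrayFormat.Float, 256, CudaArray1DNumChannels.One))
{
    var host = new float[256]; // 256 * sizeof(float) bytes, evenly divisible by the element size

    arr.CopyFromHostToThis(host); // host memory into the array, starting at byte offset 0
    arr.CopyFromThisToHost(host); // array contents back into host memory
}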
ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination array - Offset in bytes of destination array - Source host pointer + Destination host pointer + Source device pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and - starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies - the number of bytes to copy. + Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and + source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination array - Offset in bytes of destination array - Source host pointer + Destination host pointer + Source device pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and - starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies - the number of bytes to copy. + Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and + source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination array - Offset in bytes of destination array - Source host pointer + Destination host pointer + Source device pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and - starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies - the number of bytes to copy. + Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and + source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination array - Offset in bytes of destination array - Source host pointer + Destination host pointer + Source device pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and - starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies - the number of bytes to copy. + Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and + source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is synchronous. - Destination array - Offset in bytes of destination array - Source host pointer + Destination host pointer + Source device pointer Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. 
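These hunks only retitle generated doc text, but the synchronous device-to-host path they now describe is easy to exercise from C#. A minimal sketch, assuming the high-level ManagedCuda-style wrappers (CudaContext, CudaDeviceVariable) referenced elsewhere in this repository; names may differ by wrapper version:

// Synchronous host -> device -> host round trip (cuMemcpyHtoD / cuMemcpyDtoH).
using System;
using ManagedCuda;

class CopySample
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))                  // first CUDA device
        {
            var host = new float[1024];
            for (var i = 0; i < host.Length; i++) host[i] = i;

            using (var device = new CudaDeviceVariable<float>(host.Length))
            {
                device.CopyToDevice(host);                    // host -> device
                var roundTrip = new float[host.Length];
                device.CopyToHost(roundTrip);                 // device -> host, synchronous
                Console.WriteLine(roundTrip[1023]);           // prints 1023
            }
        }
    }
}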
- Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
- and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies
- the number of bytes to copy.
+ Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and
+ source, respectively. ByteCount specifies the number of bytes to copy.
  Note that this function is synchronous.
- Destination device pointer
- Source array
- Offset in bytes of source array
+ Destination host pointer
+ Source device pointer
  Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
- and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies
- the number of bytes to copy.
+ Copies from device memory to device memory. dstDevice and srcDevice are the base pointers of the destination
+ and source, respectively. ByteCount specifies the number of bytes to copy.
  Note that this function is asynchronous.
- Destination device pointer
- Source array
- Offset in bytes of source array
+ Destination device pointer
+ Source device pointer
  Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.
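The device-to-device summary above now notes that the copy is asynchronous with respect to the host. A sketch of what that means in practice, again assuming ManagedCuda-style wrapper names:

// Device -> device copy (cuMemcpyDtoD); synchronize before relying on the result.
using ManagedCuda;

class DtoDSample
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var src = new CudaDeviceVariable<float>(256))
        using (var dst = new CudaDeviceVariable<float>(256))
        {
            src.CopyToDevice(new float[256]);  // initialize source on the device
            dst.CopyToDevice(src);             // device -> device, async w.r.t. the host
            ctx.Synchronize();                 // ensure the DtoD copy has completed
            var result = new float[256];
            dst.CopyToHost(result);
        }
    }
}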
- Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
- and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies
+ Copies from device memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting index of the destination data. srcDevice specifies the base pointer of the source. ByteCount specifies
  the number of bytes to copy.
- Destination device pointer
- Source array
- Offset in bytes of source array
+ Destination array
+ Offset in bytes of destination array
+ Source device pointer
  Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
- and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies
- the number of bytes to copy.
+ Copies from one 1D CUDA array to device memory. dstDevice specifies the base pointer of the destination and
+ must be naturally aligned with the CUDA array elements. srcArray and srcOffset specify the CUDA array
+ handle and the offset in bytes into the array where the copy is to begin. ByteCount specifies the number of bytes to
+ copy and must be evenly divisible by the array element size.
- Destination device pointer
+ Destination device pointer
  Source array
  Offset in bytes of source array
- Size of memory copy in bytes
+ Size of memory copy in bytes. Must be evenly divisible by the array element size.
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
- and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
  the number of bytes to copy.
- Destination device pointer
- Source array
- Offset in bytes of source array
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
  Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from one 1D CUDA array to another. dstArray and srcArray specify the handles of the destination and
- source CUDA arrays for the copy, respectively. dstOffset and srcOffset specify the destination and source
- offsets in bytes into the CUDA arrays. ByteCount is the number of bytes to be copied. The size of the elements
- in the CUDA arrays need not be the same format, but the elements must be the same size; and count must be evenly
- divisible by that size.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
  Destination array
  Offset in bytes of destination array
- Source array
- Offset in bytes of source array
+ Source host pointer
  Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.
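The CUDA-array hunks above describe cuMemcpyHtoA/cuMemcpyAtoH semantics. A sketch under the assumption that a CudaArray1D wrapper with CopyFromHostToThis/CopyFromThisToHost helpers is available; these names are an assumption and may differ between wrapper versions:

// Host <-> 1D CUDA array transfers (cuMemcpyHtoA / cuMemcpyAtoH).
using ManagedCuda;
using ManagedCuda.BasicTypes;

class ArraySample
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var array = new CudaArray1D(CUArrayFormat.Float, 512, CudaArray1DNumChannels.One))
        {
            var host = new float[512];
            array.CopyFromHostToThis(host);   // host -> CUDA array (assumed helper)
            array.CopyFromThisToHost(host);   // CUDA array -> host (assumed helper)
        }
    }
}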
- Perform a 2D memory copy according to the parameters specified in pCopy. See .
- returns an error if any pitch is greater than the maximum allowed ().
- passes back pitches that always work with . On intra-device
- memory copies (device to device, CUDA array to device, CUDA array to CUDA array), may fail
- for pitches not computed by . does not have this restriction, but
- may run significantly slower in the cases where would have returned an error code.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Parameters for the memory copy
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
+ Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Perform a 2D memory copy according to the parameters specified in pCopy. See .
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Parameters for the memory copy
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
+ Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Perform a 3D memory copy according to the parameters specified in pCopy. See .
- The srcLOD and dstLOD members of the CUDAMemCpy3D structure must be set to 0.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Parameters for the memory copy
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
+ Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
- memcpy's done with these functions execute in parallel with the CPU and, if
- the hardware is available, may execute in parallel with the GPU.
- Asynchronous memcpy must be accompanied by appropriate stream synchronization.

- Copies data between two pointers.
- dst and src are base pointers of the destination and source, respectively.
- ByteCount specifies the number of bytes to copy.
- Note that this function infers the type of the transfer (host to host, host to
- device, device to device, or device to host) from the pointer values. This
- function is only allowed in contexts which support unified addressing.
- Note that this function is asynchronous and can optionally be associated to
- a stream by passing a non-zero hStream argument
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Destination unified virtual address space pointer
- Source unified virtual address space pointer
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
  Size of memory copy in bytes
- Stream identifier
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from device memory in one context to device memory in another
- context. dstDevice is the base device pointer of the destination memory
- and dstContext is the destination context. srcDevice is the base
- device pointer of the source memory and srcContext is the source context.
- ByteCount specifies the number of bytes to copy. Note that this function
- is asynchronous with respect to the host and all work in other streams in
- other devices.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Destination device pointer
- Destination context
- Source device pointer
- Source context
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
  Size of memory copy in bytes
- Stream identifier
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Perform a 3D memory copy according to the parameters specified in
- pCopy. See the definition of the structure
- for documentation of its parameters.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Parameters for the memory copy
- Stream identifier
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
+ Size of memory copy in bytes
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
- and source, respectively. ByteCount specifies the number of bytes to copy.
- is asynchronous and can optionally be associated to a stream by passing a non-zero hStream
- argument. It only works on page-locked memory and returns an error if a pointer to pageable memory is passed as
- input.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Destination device pointer
+ Destination array
+ Offset in bytes of destination array
  Source host pointer
  Size of memory copy in bytes
- Stream identifier
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and
- source, respectively. ByteCount specifies the number of bytes to copy.
- is asynchronous and can optionally be associated to a stream by passing a non-zero
- hStream argument. It only works on page-locked memory and returns an error if a pointer to pageable memory
- is passed as input.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Destination host pointer
- Source device pointer
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
  Size of memory copy in bytes
- Stream identifier
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from device memory to device memory. dstDevice and srcDevice are the base pointers of the destination
- and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is asynchronous
- and can optionally be associated to a stream by passing a non-zero hStream argument.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Destination device pointer
- Source device pointer
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
  Size of memory copy in bytes
- Stream identifier
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

  Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
- starting offset in bytes of the destination data. srcHost specifies the base address of the source. ByteCount
- specifies the number of bytes to copy.
- is asynchronous and can optionally be associated to a stream by passing a non-zero
- hStream argument. It only works on page-locked memory and returns an error if a pointer to pageable memory
- is passed as input.
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
  Destination array
  Offset in bytes of destination array
  Source host pointer
  Size of memory copy in bytes
- Stream identifier
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.

- Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
- and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies
- the number of bytes to copy.
- is asynchronous and can optionally be associated to a stream by passing a non-zero stream hStream
- argument. It only works on page-locked host memory and returns an error if a pointer to pageable memory is passed
- as input.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies
+ the number of bytes to copy.
- Destination pointer
- Source array
- Offset in bytes of source array
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
  Size of memory copy in bytes
- Stream identifier
  CUDA Error Codes: , , , , .
  Note that this function may also return error codes from previous, asynchronous launches.
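The removed asynchronous-copy docs above all share two requirements: page-locked ("DMA-able") host memory and explicit stream synchronization. A sketch, again assuming ManagedCuda-style types (CudaStream, CudaPageLockedHostMemory) whose exact names are an assumption:

// Async host <-> device copies on a stream; requires page-locked host memory.
using ManagedCuda;

class AsyncSample
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        using (var pinned = new CudaPageLockedHostMemory<float>(4096)) // DMA-able host buffer
        using (var device = new CudaDeviceVariable<float>(4096))
        {
            pinned.AsyncCopyToDevice(device, stream.Stream);   // host -> device, returns immediately
            pinned.AsyncCopyFromDevice(device, stream.Stream); // device -> host, queued on same stream
            stream.Synchronize();                              // wait before touching 'pinned' again
        }
    }
}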
ByteCount specifies - the number of bytes to copy. - is asynchronous and can optionally be associated to a stream by passing a non-zero stream hStream - argument. It only works on page-locked host memory and returns an error if a pointer to pageable memory is passed - as input. + Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and + starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies + the number of bytes to copy. - Destination pointer - Source array - Offset in bytes of source array + Destination array + Offset in bytes of destination array + Source host pointer Size of memory copy in bytes - Stream identifier CUDA Error Codes: , , , , . - Note that this function may also return error codes from previous, asynchronous launches. + Note that this function may also return error codes from previous, asynchronous launches. - + - Perform a 2D memory copy according to the parameters specified in pCopy. See . - returns an error if any pitch is greater than the maximum allowed (). - passes back pitches that always work with . On intra-device - memory copies (device ]]> device, CUDA array ]]> device, CUDA array ]]> CUDA array), may fail - for pitches not computed by . (not async!) does not have this restriction, but - may run significantly slower in the cases where would have returned an error code. + Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and + starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies + the number of bytes to copy. - Parameters for the memory copy - Stream identifier + Destination array + Offset in bytes of destination array + Source host pointer + Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Perform a 3D memory copy according to the parameters specified in pCopy. See . - returns an error if any pitch is greater than the maximum allowed (). - is asynchronous and can optionally be associated to a stream by passing a non-zero hStream - argument. It only works on page-locked host memory and returns an error if a pointer to pageable memory is passed - as input. - The srcLOD and dstLOD members of the CUDAMemCpy3D structure must be set to 0. + Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and + starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies + the number of bytes to copy. - Parameters for the memory copy - Stream indetifier + Destination array + Offset in bytes of destination array + Source host pointer + Size of memory copy in bytes CUDA Error Codes: , , , , . - Note that this function may also return error codes from previous, asynchronous launches. - - - - Combines all memset API calls - + Note that this function may also return error codes from previous, asynchronous launches. - + - Sets the memory range of N 8-bit values to the specified value b. + Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and + starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount specifies + the number of bytes to copy. 
- Sets the memory range of N 8-bit values to the specified value b.
- Parameters: destination device pointer; value to set; number of elements.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount
+ specifies the number of bytes to copy.
+ Parameters: destination array; offset in bytes of destination array; source host pointer; size of memory copy in bytes.

- Sets the memory range of N 16-bit values to the specified value us.
- Parameters: destination device pointer; value to set; number of elements.
+ (same host-to-1D-CUDA-array entry as above)

- Sets the memory range of N 32-bit values to the specified value ui.
- Parameters: destination device pointer; value to set; number of elements.
+ (same host-to-1D-CUDA-array entry as above)

- Sets the 2D memory range of Width 8-bit values to the specified value b. Height specifies the number of rows
- to set, and dstPitch specifies the number of bytes between each row. This function performs fastest when the
- pitch is one that has been passed back by the pitched allocator.
- Parameters: destination device pointer; pitch of destination device pointer; value to set; width of row;
- number of rows.
+ (same host-to-1D-CUDA-array entry as above)

- Sets the 2D memory range of Width 16-bit values to the specified value us. (otherwise as the 8-bit 2D entry)
+ (same host-to-1D-CUDA-array entry as above)

- Sets the 2D memory range of Width 32-bit values to the specified value us. (otherwise as the 8-bit 2D entry)
+ (same host-to-1D-CUDA-array entry as above)

- Combines all async memset API calls

- Sets the memory range of N 8-bit values to the specified value b. (asynchronous)
- Parameters: destination device pointer; value to set; number of elements; stream identifier.
+ (same host-to-1D-CUDA-array entry as above)

- Sets the memory range of N 16-bit values to the specified value us. (asynchronous)
+ (same host-to-1D-CUDA-array entry as above)

- Sets the memory range of N 32-bit values to the specified value ui. (asynchronous)
+ (same host-to-1D-CUDA-array entry as above)

- Sets the 2D memory range of Width 8-bit values to the specified value b. (asynchronous; otherwise as the
- 8-bit 2D entry, plus a stream identifier parameter)
+ (same host-to-1D-CUDA-array entry as above)
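The removed entries above cover the synchronous and asynchronous memset family. A minimal driver-API sketch of
those calls, assuming a current context and a pitched allocation (names and sizes are illustrative):

/* Sketch: the driver-API memset family described by the removed entries. */
#include <cuda.h>

void clear_buffers(CUdeviceptr dLinear, size_t nBytes,
                   CUdeviceptr dPitched, size_t pitch,
                   size_t width, size_t height, CUstream stream)
{
    /* nBytes 8-bit values set to 0xFF (synchronous). */
    cuMemsetD8(dLinear, 0xFF, nBytes);

    /* 32-bit values set to a bit pattern (synchronous). */
    cuMemsetD32(dLinear, 0x3F800000u /* 1.0f */, nBytes / 4);

    /* 2D range: width 8-bit values per row, height rows, pitch bytes apart.
       Fastest when the pitch came from cuMemAllocPitch. */
    cuMemsetD2D8(dPitched, pitch, 0, width, height);

    /* Async variant, ordered into the given stream. */
    cuMemsetD8Async(dLinear, 0, nBytes, stream);
}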
- Sets the 2D memory range of Width 16-bit values to the specified value us. Height specifies the number of
- rows to set, and dstPitch specifies the number of bytes between each row. This function performs fastest when
- the pitch is one that has been passed back by the pitched allocator. (asynchronous)
- Parameters: destination device pointer; pitch of destination device pointer; value to set; width of row;
- number of rows; stream identifier.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount
+ specifies the number of bytes to copy.
+ Parameters: destination array; offset in bytes of destination array; source host pointer; size of memory copy in bytes.

- Sets the 2D memory range of Width 32-bit values to the specified value us. (asynchronous; otherwise as the
- 16-bit 2D entry)
+ (same host-to-1D-CUDA-array entry as above)

- Combines all function / kernel API calls

- Specifies the x, y, and z dimensions of the thread blocks that are created when the kernel given by hfunc is
- launched.
- Parameters: kernel to specify dimensions of; X dimension; Y dimension; Z dimension.
+ (same host-to-1D-CUDA-array entry as above)

- Sets through bytes the amount of dynamic shared memory that will be available to each thread block when the
- kernel given by hfunc is launched.
- Parameters: kernel to specify dynamic shared-memory size for; dynamic shared-memory size per thread in bytes.
+ (same host-to-1D-CUDA-array entry as above)

- Returns in pi the integer value of the attribute attrib on the kernel given by hfunc.
- Parameters: returned attribute value; attribute requested; function to query attribute of.
+ (same host-to-1D-CUDA-array entry as above)

- Sets information about a function. This call sets the value of a specified attribute attrib on the kernel
- given by hfunc to an integer value specified by val, and returns CUDA_SUCCESS if the new value of the
- attribute could be successfully set. If the set fails, this call will return an error. Not all attributes can
- have values set; attempting to set a value on a read-only attribute results in CUDA_ERROR_INVALID_VALUE.
- Supported attributes:
- CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: the requested maximum size in bytes of dynamically
- allocated shared memory. The sum of this value and the function attribute CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
- cannot exceed the device attribute CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. The maximal size of
- requestable dynamic shared memory may differ by GPU architecture.
- CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: on devices where the L1 cache and shared memory use the
- same hardware resources, this sets the shared memory carveout preference, in percent of the total resources.
- This is only a hint, and the driver can choose a different ratio if required to execute the function.
- Parameters: function to set attribute of; attribute requested; the value to set.
+ (same host-to-1D-CUDA-array entry as above)

- On devices where the L1 cache and shared memory use the same hardware resources, this sets through config the
- preferred cache configuration for the device function hfunc. This is only a preference. The driver will use
- the requested configuration if possible, but it is free to choose a different configuration if required to
- execute hfunc. This setting does nothing on devices where the size of the L1 cache and shared memory are
- fixed. Switching between configuration modes may insert a device-side synchronization point for streamed
- kernel launches. The supported cache modes are defined by the cache-configuration enumeration.
- Parameters: kernel to configure cache for; requested cache configuration.
+ (same host-to-1D-CUDA-array entry as above)
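The removed entries describe querying and setting kernel attributes and the cache-configuration preference. A
rough sketch against the raw driver API; hfunc is assumed to come from cuModuleGetFunction, and the 64 KB
opt-in value is illustrative:

/* Sketch: querying and configuring a kernel as in the removed entries. */
#include <cuda.h>
#include <stdio.h>

void tune_kernel(CUfunction hfunc)
{
    int regs = 0, maxThreads = 0;

    /* cuFuncGetAttribute: read compiled-kernel properties. */
    cuFuncGetAttribute(&regs, CU_FUNC_ATTRIBUTE_NUM_REGS, hfunc);
    cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
    printf("regs=%d maxThreadsPerBlock=%d\n", regs, maxThreads);

    /* cuFuncSetAttribute: opt in to more dynamic shared memory
       (a writable attribute; read-only ones reject writes). */
    cuFuncSetAttribute(hfunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 64 * 1024);

    /* cuFuncSetCacheConfig: a preference only; the driver may ignore it. */
    cuFuncSetCacheConfig(hfunc, CU_FUNC_CACHE_PREFER_SHARED);
}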
- Sets the shared memory configuration for a device function. On devices with configurable shared memory
- banks, this function will force all subsequent launches of the specified device function to have the given
- shared memory bank size configuration. On any given launch of the function, the shared memory configuration
- of the device will be temporarily changed if needed to suit the function's preferred configuration. Changes
- in shared memory configuration between subsequent launches of functions may introduce a device-side
- synchronization point. Any per-function setting of shared memory bank size will override the context-wide
- setting. Changing the shared memory bank size will not increase shared memory usage or affect occupancy of
- kernels, but may have major effects on performance. Larger bank sizes will allow for greater potential
- bandwidth to shared memory, but will change what kinds of accesses to shared memory will result in bank
- conflicts. This function will do nothing on devices with fixed shared memory bank size. The supported bank
- configurations are: the default initial setting (currently, four bytes); natively four bytes; natively eight
- bytes.
- Parameters: kernel to be given a shared memory config; requested shared memory configuration.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount
+ specifies the number of bytes to copy.
+ Parameters: destination array; offset in bytes of destination array; source host pointer; size of memory copy in bytes.

- Combines all array management API calls
+ (same host-to-1D-CUDA-array entry as above)

- Creates a CUDA array according to the structure pAllocateArray and returns a handle to the new CUDA array in
- pHandle.
- Parameters: returned array; array descriptor.
+ (same host-to-1D-CUDA-array entry as above)

- Returns in pArrayDescriptor a descriptor containing information on the format and dimensions of the CUDA
- array hArray. It is useful for subroutines that have been passed a CUDA array, but need to know the CUDA
- array parameters for validation or other purposes.
- Parameters: returned array descriptor; array to get descriptor of.
+ (same host-to-1D-CUDA-array entry as above)

- Destroys the CUDA array hArray.
- Parameters: array to destroy.
+ (same host-to-1D-CUDA-array entry as above)

- Creates a CUDA array according to the structure pAllocateArray and returns a handle to the new CUDA array in
- pHandle.
- Parameters: returned array; 3D array descriptor.
+ (same host-to-1D-CUDA-array entry as above)

- Returns in pArrayDescriptor a descriptor containing information on the format and dimensions of the CUDA
- array hArray. It is useful for subroutines that have been passed a CUDA array, but need to know the CUDA
- array parameters for validation or other purposes. This function may be called on 1D and 2D arrays, in which
- case the Height and/or Depth members of the descriptor struct will be set to 0.
- Parameters: returned 3D array descriptor; 3D array to get descriptor of.
+ (same host-to-1D-CUDA-array entry as above)

- Creates a CUDA mipmapped array according to the CUDA_ARRAY3D_DESCRIPTOR structure pMipmappedArrayDesc and
- returns a handle to the new CUDA mipmapped array in pHandle. numMipmapLevels specifies the number of mipmap
- levels to be allocated; this value is clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
- Parameters: returned mipmapped array; mipmapped array descriptor; number of mipmap levels.
+ (same host-to-1D-CUDA-array entry as above)
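The replacement entries describe the synchronous host-to-array and array-to-host copies. A minimal sketch that
creates a 1D CUDA array and round-trips data through it, assuming a current context (the helper name is
illustrative):

/* Sketch: create a 1D CUDA array, then cuMemcpyHtoA / cuMemcpyAtoH as in the
   added doc entries. */
#include <cuda.h>
#include <string.h>

int roundtrip(const float *hSrc, float *hDst, size_t count)
{
    CUDA_ARRAY_DESCRIPTOR desc;
    memset(&desc, 0, sizeof(desc));
    desc.Width       = count;
    desc.Height      = 0;                 /* Height = 0 makes the array 1D */
    desc.Format      = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;

    CUarray arr;
    if (cuArrayCreate(&arr, &desc) != CUDA_SUCCESS) return -1;

    size_t bytes = count * sizeof(float);
    cuMemcpyHtoA(arr, 0, hSrc, bytes);    /* host -> array at offset 0 */
    cuMemcpyAtoH(hDst, arr, 0, bytes);    /* array -> host from offset 0 */

    cuArrayDestroy(arr);
    return 0;
}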
- Returns in pLevelArray a CUDA array that represents a single mipmap level of the CUDA mipmapped array
- hMipmappedArray.
- Parameters: returned mipmap level CUDA array; CUDA mipmapped array; mipmap level.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. pSrc specifies the base address of the source. ByteCount
+ specifies the number of bytes to copy.
+ Parameters: destination array; offset in bytes of destination array; source host pointer; size of memory copy in bytes.

- Destroys the CUDA mipmapped array hMipmappedArray.
- Parameters: mipmapped array to destroy.
+ (same host-to-1D-CUDA-array entry as above)

- Groups all texture reference management API calls
+ (same host-to-1D-CUDA-array entry as above)

- Creates a texture reference and returns its handle in pTexRef. Once created, the application must call the
- array- or address-binding functions to associate the reference with allocated memory. Other texture reference
- functions are used to specify the format and interpretation (addressing, filtering, etc.) to be used when the
- memory is read through this texture reference.
- Parameters: returned texture reference.
+ (same host-to-1D-CUDA-array entry as above)

- Destroys the texture reference specified by hTexRef.
- Parameters: texture reference to destroy.
+ (same host-to-1D-CUDA-array entry as above)

- Binds the CUDA array hArray to the texture reference hTexRef. Any previous address or CUDA array state
- associated with the texture reference is superseded by this function. Flags must be set to the
- override-format flag. Any CUDA array previously bound to hTexRef is unbound.
- Parameters: texture reference to bind; array to bind; options (must be the override-format flag).
+ (same host-to-1D-CUDA-array entry as above)

- Binds the CUDA mipmapped array hMipmappedArray to the texture reference hTexRef. Any previous address or CUDA
- array state associated with the texture reference is superseded by this function. Flags must be set to the
- override-format flag. Any CUDA array previously bound to hTexRef is unbound.
- Parameters: texture reference to bind; mipmapped array to bind; options (must be the override-format flag).
+ (same host-to-1D-CUDA-array entry as above)

- Binds a linear address range to the texture reference hTexRef. Any previous address or CUDA array state
- associated with the texture reference is superseded by this function. Any memory previously bound to hTexRef
- is unbound. Since the hardware enforces an alignment requirement on texture base addresses, this call passes
- back a byte offset in ByteOffset that must be applied to texture fetches in order to read from the desired
- memory. This offset must be divided by the texel size and passed to kernels that read from the texture so it
- can be applied to the tex1Dfetch() function. If the device memory pointer was returned from the allocator,
- the offset is guaranteed to be 0 and null may be passed as the ByteOffset parameter.
- Parameters: returned byte offset; texture reference to bind; device pointer to bind; size of memory to bind
- in bytes.
+ (same host-to-1D-CUDA-array entry as above)
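The removed entries walk through the legacy texture-reference flow. A rough sketch of that flow against the
raw driver API; the texref is assumed to come from cuModuleGetTexRef, and this API family is deprecated in
recent CUDA releases in favor of texture objects:

/* Sketch: binding and configuring a texture reference as the removed
   entries describe. */
#include <cuda.h>

void bind_texture(CUtexref texRef, CUarray arr)
{
    /* Bind the array; the flags argument must be CU_TRSA_OVERRIDE_FORMAT. */
    cuTexRefSetArray(texRef, arr, CU_TRSA_OVERRIDE_FORMAT);

    /* Format and per-dimension addressing, per cuTexRefSetFormat and
       cuTexRefSetAddressMode (dim 0 = first fetch coordinate). */
    cuTexRefSetFormat(texRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetAddressMode(texRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
}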
- Binds a linear address range to the texture reference hTexRef, to be read with tex2D(). Any previous address
- or CUDA array state associated with the texture reference is superseded by this function, and any memory
- previously bound to hTexRef is unbound. Calls binding a 2D address range cannot follow calls binding a 1D
- address range for the same texture reference. It is required that dptr be aligned to the appropriate
- hardware-specific texture alignment; this value can be queried through the texture-alignment device
- attribute. If an unaligned dptr is supplied, an invalid-value error is returned.
- Parameters: texture reference to bind; descriptor of CUDA array; device pointer to bind; line pitch in bytes.
+ (same host-to-1D-CUDA-array entry as in the preceding hunks)

- Specifies the format of the data to be read by the texture reference hTexRef. fmt and NumPackedComponents are
- exactly analogous to the Format and NumChannels members of the array descriptor structure: they specify the
- format of each component and the number of components per array element.
- Parameters: texture reference; format to set; number of components per array element.
+ (same host-to-1D-CUDA-array entry as in the preceding hunks)

- Specifies the addressing mode am for the given dimension dim of the texture reference hTexRef. If dim is
- zero, the addressing mode is applied to the first parameter of the functions used to fetch from the texture;
- if dim is 1, the second, and so on. This call has no effect if hTexRef is bound to linear memory.
- Parameters: texture reference; dimension; addressing mode to set.
+ Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
+ and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount
+ specifies the number of bytes to copy.
+ Parameters: destination device pointer; source array; offset in bytes of source array; size of memory copy in bytes.

- Specifies the filtering mode fm to be used when reading memory through the texture reference hTexRef. This
- call has no effect if hTexRef is bound to linear memory.
- Parameters: texture reference; filtering mode to set.
+ (same 1D-CUDA-array-to-host entry as above)

- Specifies optional flags via Flags to specify the behavior of data returned through the texture reference
- hTexRef.
- Parameters: texture reference; optional flags to set.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns in pdptr the base address bound to the texture reference hTexRef, or an error if the texture
- reference is not bound to any device memory range.
- Parameters: returned device address; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns in phArray the CUDA array bound to the texture reference hTexRef, or an error if the texture
- reference is not bound to any CUDA array.
- Parameters: returned array; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns in phMipmappedArray the CUDA mipmapped array bound to the texture reference hTexRef, or an error if
- the texture reference is not bound to any CUDA mipmapped array.
- Parameters: returned mipmapped array; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns in pam the addressing mode corresponding to the dimension dim of the texture reference hTexRef.
- Currently, the only valid values for dim are 0 and 1.
- Parameters: returned addressing mode; texture reference; dimension.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns in pfm the filtering mode of the texture reference hTexRef.
- Parameters: returned filtering mode; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)
- Returns in pFormat and pNumChannels the format and number of components of the CUDA array bound to the
- texture reference hTexRef. If pFormat or pNumChannels is null, it will be ignored.
- Parameters: returned format; returned number of components; texture reference.
+ Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
+ and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount
+ specifies the number of bytes to copy.
+ Parameters: destination device pointer; source array; offset in bytes of source array; size of memory copy in bytes.

- Returns in pFlags the flags of the texture reference hTexRef.
- Parameters: returned flags; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns the mipmap filtering mode in pfm that is used when reading memory through the texture reference
- hTexRef.
- Parameters: returned mipmap filtering mode; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns the mipmap level bias in pBias that is added to the specified mipmap level when reading memory
- through the texture reference hTexRef.
- Parameters: returned mipmap level bias; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns the min/max mipmap level clamps in pminMipmapLevelClamp and pmaxMipmapLevelClamp that are used when
- reading memory through the texture reference hTexRef.
- Parameters: returned mipmap min level clamp; returned mipmap max level clamp; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Returns the maximum anisotropy in pmaxAniso that is used when reading memory through the texture reference.
- Parameters: returned maximum anisotropy; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Specifies the mipmap filtering mode fm to be used when reading memory through the texture reference hTexRef.
- This call has no effect if hTexRef is not bound to a mipmapped array.
- Parameters: texture reference; filtering mode to set.
+ (same 1D-CUDA-array-to-host entry as above)

- Specifies the mipmap level bias to be added to the specified mipmap level when reading memory through the
- texture reference hTexRef. This call has no effect if hTexRef is not bound to a mipmapped array.
- Parameters: texture reference; mipmap level bias.
+ (same 1D-CUDA-array-to-host entry as above)

- Specifies the min/max mipmap level clamps, minMipmapLevelClamp and maxMipmapLevelClamp respectively, to be
- used when reading memory through the texture reference hTexRef. This call has no effect if hTexRef is not
- bound to a mipmapped array.
- Parameters: texture reference; mipmap min level clamp; mipmap max level clamp.
+ (same 1D-CUDA-array-to-host entry as above)

- Specifies the maximum anisotropy maxAniso to be used when reading memory through the texture reference
- hTexRef. This call has no effect if hTexRef is not bound to a mipmapped array.
- Parameters: texture reference; maximum anisotropy.
+ (same 1D-CUDA-array-to-host entry as above)

- Sets the border color for a texture reference. Specifies the value of the RGBA color via pBorderColor for the
- texture reference hTexRef. The color value supports only float type and holds color components in the
- following sequence: pBorderColor[0] holds the 'R' component, pBorderColor[1] holds 'G', pBorderColor[2] holds
- 'B', and pBorderColor[3] holds 'A'. Note that the color values can be set only when the address mode is set
- to CU_TR_ADDRESS_MODE_BORDER using cuTexRefSetAddressMode. Applications using integer border color values
- have to reinterpret_cast their values to float.
- Parameters: texture reference; RGBA color.
+ (same 1D-CUDA-array-to-host entry as above)

- Gets the border color used by a texture reference. Returns in pBorderColor the values of the RGBA color used
- by the texture reference hTexRef, in the same component sequence as above.
- Parameters: returned type and value of RGBA color; texture reference.
+ (same 1D-CUDA-array-to-host entry as above)

- Combines all surface management API calls
+ (same 1D-CUDA-array-to-host entry as above)

- Sets the CUDA array hArray to be read and written by the surface reference hSurfRef. Any previous CUDA array
- state associated with the surface reference is superseded by this function. Flags must be set to the
- documented constant, and the surface load/store flag must have been set for the CUDA array. Any CUDA array
- previously bound to hSurfRef is unbound.
- Parameters: surface reference handle; CUDA array handle; flags.
+ (same 1D-CUDA-array-to-host entry as above)
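The removed surface entries pair with the texture flow above. A minimal sketch, assuming the array was created
with the surface load/store flag (CUDA_ARRAY3D_SURFACE_LDST) and that the flags argument is 0, per the current
driver-API documentation:

/* Sketch: binding an array to a surface reference for read/write access. */
#include <cuda.h>

CUresult bind_surface(CUsurfref surfRef, CUarray arr)
{
    /* arr must have been allocated with CUDA_ARRAY3D_SURFACE_LDST. */
    return cuSurfRefSetArray(surfRef, arr, 0 /* flags */);
}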
- Returns in phArray the CUDA array bound to the surface reference hSurfRef, or an error if the surface
- reference is not bound to any CUDA array.
- Parameters: returned array; surface reference handle.
+ Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
+ and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount
+ specifies the number of bytes to copy.
+ Parameters: destination device pointer; source array; offset in bytes of source array; size of memory copy in bytes.

- Combines all kernel / function parameter management API calls
+ (same 1D-CUDA-array-to-host entry as above)

- Sets through numbytes the total size in bytes needed by the function parameters of the kernel corresponding
- to hfunc.
- Parameters: kernel to set parameter size for; size of parameter list in bytes.
+ (same 1D-CUDA-array-to-host entry as above)

- Sets an integer parameter that will be specified the next time the kernel corresponding to hfunc is invoked.
- offset is a byte offset.
- Parameters: kernel to add parameter to; offset to add parameter to argument list; value of parameter.
+ (same 1D-CUDA-array-to-host entry as above)

- Sets a floating-point parameter that will be specified the next time the kernel corresponding to hfunc is
- invoked. offset is a byte offset.
- Parameters: kernel to add parameter to; offset to add parameter to argument list; value of parameter.
+ (same 1D-CUDA-array-to-host entry as above)

- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel
- corresponding to hfunc. offset is a byte offset.
- Parameters: kernel to add data to; offset to add data to argument list; pointer to arbitrary data; size of
- data to copy in bytes.
+ (same 1D-CUDA-array-to-host entry as above)

(The last removed entry and its replacement repeat verbatim for each of the remaining overloads in this hunk;
the duplicates are elided here.)
- Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Destination device pointer + Source array + Offset in bytes of source array + Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray + and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies + the number of bytes to copy. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Destination device pointer + Source array + Offset in bytes of source array + Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray + and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies + the number of bytes to copy. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Destination device pointer + Source array + Offset in bytes of source array + Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray + and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies + the number of bytes to copy. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Destination device pointer + Source array + Offset in bytes of source array + Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray + and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies + the number of bytes to copy. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Destination device pointer + Source array + Offset in bytes of source array + Size of memory copy in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. 
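For orientation, here is a minimal C# sketch of the synchronous copy pattern the hunks above document, using ManagedCuda's high-level CudaContext and CudaDeviceVariable<T> wrappers rather than the raw driver entry points; the 1D-array overloads follow the same shape. This is an illustrative sketch, not the wrappers changed in this diff.

using ManagedCuda;

class CopySketch
{
    static void Main()
    {
        // Create a context on device 0 (made current on this thread).
        using (var ctx = new CudaContext(0))
        {
            var host = new float[1024];
            for (int i = 0; i < host.Length; i++) host[i] = i;

            // Allocate 1024 floats of device memory (cuMemAlloc under the hood).
            using (var dev = new CudaDeviceVariable<float>(host.Length))
            {
                dev.CopyToDevice(host);   // synchronous host -> device copy
                var back = new float[host.Length];
                dev.CopyToHost(back);     // synchronous device -> host copy
            }
        }
    }
}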
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies from one 1D CUDA array to another. dstArray and srcArray specify the handles of the destination and
+ source CUDA arrays for the copy, respectively. dstOffset and srcOffset specify the destination and source
+ offsets in bytes into the CUDA arrays. ByteCount is the number of bytes to be copied. The elements of the two
+ CUDA arrays need not have the same format, but they must have the same size, and ByteCount must be evenly
+ divisible by that size.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination array
+ Offset in bytes of destination array
+ Source array
+ Offset in bytes of source array
+ Size of memory copy in bytes
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Perform a 2D memory copy according to the parameters specified in pCopy. See .
+ returns an error if any pitch is greater than the maximum allowed ().
+ passes back pitches that always work with . On intra-device
+ memory copies (device to device, CUDA array to device, CUDA array to CUDA array), may fail
+ for pitches not computed by . does not have this restriction, but
+ may run significantly slower in the cases where would have returned an error code.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Parameters for the memory copy
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Perform a 2D memory copy according to the parameters specified in pCopy. See .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Parameters for the memory copy
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Perform a 3D memory copy according to the parameters specified in pCopy. See .
+ The srcLOD and dstLOD members of the CUDAMemCpy3D structure must be set to 0.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Parameters for the memory copy
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
+ Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
+ Memory copies done with these functions execute in parallel with the CPU and, if
+ the hardware is available, may execute in parallel with the GPU.
+ Asynchronous memcpy must be accompanied by appropriate stream synchronization.
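The remarks above explain the contract of the whole asynchronous family: pinned (page-locked) host memory plus a stream, followed by explicit synchronization. The sketch below illustrates that pattern with ManagedCuda's CudaPageLockedHostMemory<T> and CudaStream; the AsyncCopyToDevice/AsyncCopyFromDevice overload names and the indexer are assumptions about the wrapper surface to verify against the ManagedCuda version in use.

using ManagedCuda;
using ManagedCuda.BasicTypes;

class AsyncCopySketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var stream = new CudaStream())
        using (var dev = new CudaDeviceVariable<float>(1 << 20))
        // Pinned host buffer: async copies require DMA-able memory, as the
        // remarks state; a pageable pointer would make the driver call fail.
        using (var pinned = new CudaPageLockedHostMemory<float>(1 << 20))
        {
            for (int i = 0; i < (1 << 20); i++) pinned[i] = 1.0f;

            // Assumed overloads: enqueue H->D and D->H copies on the stream.
            pinned.AsyncCopyToDevice(dev.DevicePointer, stream.Stream);
            pinned.AsyncCopyFromDevice(dev.DevicePointer, stream.Stream);

            // The copies run asynchronously w.r.t. the CPU; synchronize the
            // stream before touching the pinned buffer again.
            stream.Synchronize();
        }
    }
}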
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies data between two pointers.
+ dst and src are base pointers of the destination and source, respectively.
+ ByteCount specifies the number of bytes to copy.
+ Note that this function infers the type of the transfer (host to host, host to
+ device, device to device, or device to host) from the pointer values. This
+ function is only allowed in contexts which support unified addressing.
+ Note that this function is asynchronous and can optionally be associated with
+ a stream by passing a non-zero hStream argument.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination unified virtual address space pointer
+ Source unified virtual address space pointer
+ Size of memory copy in bytes
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies from device memory in one context to device memory in another
+ context. dstDevice is the base device pointer of the destination memory
+ and dstContext is the destination context. srcDevice is the base
+ device pointer of the source memory and srcContext is the source context.
+ ByteCount specifies the number of bytes to copy. Note that this function
+ is asynchronous with respect to the host and all work in other streams on
+ other devices.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Destination context
+ Source device pointer
+ Source context
+ Size of memory copy in bytes
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Perform a 3D memory copy according to the parameters specified in
+ pCopy. See the definition of the structure
+ for documentation of its parameters.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Parameters for the memory copy
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
+ Performs a batch of memory copies asynchronously.
+ The batch as a whole executes in stream order, but copies within a
+ batch are not guaranteed to execute in any specific order. This API only supports pointer-to-pointer copies.
+ For copies involving CUDA arrays, please see ::cuMemcpy3DBatchAsync.
+ Performs memory copies from source buffers specified in \p srcs to destination buffers specified in \p dsts.
+ The size of each copy is specified in \p sizes. All three arrays must be of the same length as specified
+ by \p count.
+ Since there are no ordering guarantees for copies within a batch, specifying any dependent copies
+ within a batch will result in undefined behavior.
+ Every copy in the batch has to be associated with a set of attributes specified in the \p attrs array.
+ Each entry in this array can apply to more than one copy. This can be done by specifying in the \p attrsIdxs array
+ the index of the first copy that the corresponding entry in the \p attrs array applies to. Both \p attrs and
+ \p attrsIdxs must be of the same length as specified by \p numAttrs. For example, if a batch has 10 copies listed
+ in dst/src/sizes, the first 6 of which have one set of attributes and the remaining 4 another, then \p numAttrs
+ will be 2, \p attrsIdxs will be {0, 6} and \p attrs will contain the two sets of attributes. Note that the first entry
+ in \p attrsIdxs must always be 0. Also, each entry must be greater than the previous entry and the last entry should be
+ less than \p count. Furthermore, \p numAttrs must be less than or equal to \p count.
+
+ The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ the API call returns. This flag is suited for ephemeral sources (e.g., stack variables) when it's known that no prior
+ operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ need for the user to synchronize the stream after the API call. If the source access order is set to
+ ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ outside CUDA (e.g., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in the batch must
+ have a valid ::CUmemcpyAttributes corresponding to it, including the appropriate srcAccessOrder setting; otherwise the API
+ will return ::CUDA_ERROR_INVALID_VALUE.
+ The ::CUmemcpyAttributes::srcLocHint and ::CUmemcpyAttributes::dstLocHint allow applications to specify hint locations
+ for operands of a copy when the operand doesn't have a fixed location. That is, these hints are
+ only applicable for managed memory pointers on devices where ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or
+ system-allocated pageable memory on devices where ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true.
+ For other cases, these hints are ignored.
+ The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+ If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ will be returned in \p failIdx.
+
+ Array of destination pointers.
+ Array of memcpy source pointers.
+ Array of sizes for memcpy operations.
+ Size of \p dsts, \p srcs and \p sizes arrays
+ Array of memcpy attributes.
+ Array of indices to specify which copies each entry in the \p attrs array applies to.
+ The attributes specified in attrs[k] will be applied to copies starting from attrsIdxs[k]
+ through attrsIdxs[k + 1] - 1. Also attrs[numAttrs - 1] will apply to copies starting from
+ attrsIdxs[numAttrs - 1] through count - 1.
+ Size of \p attrs and \p attrsIdxs arrays.
+ Pointer to a location to return the index of the copy where a failure was encountered. The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ The stream to enqueue the operations in. Must not be the legacy NULL stream.
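The parameter list above maps onto a driver-API signature roughly as follows. The P/Invoke declaration below is an illustrative sketch inferred from that list; the exact marshalling, the real layout of CUmemcpyAttributes, and the driver library name are assumptions to verify against cuda.h, and this is not the wrapper this diff ships.

using System;
using System.Runtime.InteropServices;

static class DriverBatchCopySketch
{
    // Opaque stand-in: the real field layout (srcAccessOrder, srcLocHint,
    // dstLocHint, flags, ...) comes from cuda.h, not from this sketch.
    [StructLayout(LayoutKind.Sequential)]
    struct CUmemcpyAttributes { }

    // Sketch of the entry point implied by the documentation above.
    [DllImport("nvcuda", EntryPoint = "cuMemcpyBatchAsync")]
    static extern int cuMemcpyBatchAsync(
        IntPtr[] dsts,              // array of destination pointers
        IntPtr[] srcs,              // array of source pointers
        UIntPtr[] sizes,            // size in bytes of each copy
        UIntPtr count,              // length of dsts/srcs/sizes
        CUmemcpyAttributes[] attrs, // attribute sets
        UIntPtr[] attrsIdxs,        // first copy index each attribute set applies to
        UIntPtr numAttrs,           // length of attrs/attrsIdxs
        out UIntPtr failIdx,        // index of the failing copy, or SIZE_MAX
        IntPtr hStream);            // must not be the legacy NULL stream
}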
+ Performs a batch of 3D memory copies asynchronously.
+ The batch as a whole executes in stream order, but copies within a
+ batch are not guaranteed to execute in any specific order. Note that this means specifying any dependent
+ copies within a batch will result in undefined behavior.
+
+ Performs memory copies as specified in the \p opList array. The length of this array is specified in \p numOps.
+ Each entry in this array describes a copy operation. This includes, among other things, the source and destination
+ operands for the copy as specified in ::CUDA_MEMCPY3D_BATCH_OP::src and ::CUDA_MEMCPY3D_BATCH_OP::dst respectively.
+ The source and destination operands of a copy can either be a pointer or a CUDA array. The width, height and depth
+ of a copy are specified in ::CUDA_MEMCPY3D_BATCH_OP::extent. The width, height and depth of a copy are specified in
+ elements and must not be zero. For pointer-to-pointer copies, the element size is considered to be 1. For pointer
+ to CUDA array or vice versa copies, the element size is determined by the CUDA array. For CUDA array to CUDA array copies,
+ the element size of the two CUDA arrays must match.
+
+ For a given operand, if ::CUmemcpy3DOperand::type is specified as ::CU_MEMCPY_OPERAND_TYPE_POINTER, then
+ ::CUmemcpy3DOperand::op::ptr will be used. The ::CUmemcpy3DOperand::op::ptr::ptr field must contain the pointer where
+ the copy should begin. The ::CUmemcpy3DOperand::op::ptr::rowLength field specifies the length of each row in elements and
+ must either be zero or be greater than or equal to the width of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::width.
+ The ::CUmemcpy3DOperand::op::ptr::layerHeight field specifies the height of each layer and must either be zero or be greater than
+ or equal to the height of the copy specified in ::CUDA_MEMCPY3D_BATCH_OP::extent::height. When either of these values is zero,
+ that aspect of the operand is considered to be tightly packed according to the copy extent. For managed memory pointers on devices where
+ ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS is true or system-allocated pageable memory on devices where
+ ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS is true, the ::CUmemcpy3DOperand::op::ptr::locHint field can be used to hint
+ the location of the operand.
+
+ If an operand's type is specified as ::CU_MEMCPY_OPERAND_TYPE_ARRAY, then ::CUmemcpy3DOperand::op::array will be used.
+ The ::CUmemcpy3DOperand::op::array::array field specifies the CUDA array and ::CUmemcpy3DOperand::op::array::offset specifies
+ the 3D offset into that array where the copy begins.
+
+ The ::CUmemcpyAttributes::srcAccessOrder indicates the source access ordering to be observed for copies associated
+ with the attribute. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_STREAM, then the source will
+ be accessed in stream order. If the source access order is set to ::CU_MEMCPY_SRC_ACCESS_ORDER_DURING_API_CALL then
+ it indicates that access to the source pointer can be out of stream order and all accesses must be complete before
+ the API call returns. This flag is suited for ephemeral sources (e.g., stack variables) when it's known that no prior
+ operations in the stream can be accessing the memory and also that the lifetime of the memory is limited to the scope
+ that the source variable was declared in. Specifying this flag allows the driver to optimize the copy and removes the
+ need for the user to synchronize the stream after the API call. If the source access order is set to
+ ::CU_MEMCPY_SRC_ACCESS_ORDER_ANY then it indicates that access to the source pointer can be out of stream order and the
+ accesses can happen even after the API call returns. This flag is suited for host pointers allocated
+ outside CUDA (e.g., via malloc) when it's known that no prior operations in the stream can be accessing the memory.
+ Specifying this flag allows the driver to optimize the copy on certain platforms. Each memcpy operation in \p opList must
+ have a valid srcAccessOrder setting, otherwise this API will return ::CUDA_ERROR_INVALID_VALUE.
+
+ The ::CUmemcpyAttributes::flags field can be used to specify certain flags for copies. Setting the
+ ::CU_MEMCPY_FLAG_PREFER_OVERLAP_WITH_COMPUTE flag indicates that the associated copies should preferably overlap with
+ any compute work. Note that this flag is a hint and can be ignored depending on the platform and other parameters of the copy.
+
+ If any error is encountered while parsing the batch, the index within the batch where the error was encountered
+ will be returned in \p failIdx.
+
+ Total number of memcpy operations.
+ Array of size \p numOps containing the actual memcpy operations.
+ Pointer to a location to return the index of the copy where a failure was encountered. The value will be SIZE_MAX if the error doesn't pertain to any specific copy.
+ Flags for future use, must be zero now.
+ The stream to enqueue the operations in. Must not be the default NULL stream.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies from host memory to device memory. dstDevice and srcHost are the base addresses of the destination
+ and source, respectively. ByteCount specifies the number of bytes to copy.
+ is asynchronous and can optionally be associated with a stream by passing a non-zero hStream
+ argument. It only works on page-locked memory and returns an error if a pointer to pageable memory is passed as
+ input.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Source host pointer
+ Size of memory copy in bytes
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies from device to host memory. dstHost and srcDevice specify the base pointers of the destination and
+ source, respectively. ByteCount specifies the number of bytes to copy.
+ is asynchronous and can optionally be associated with a stream by passing a non-zero
+ hStream argument. It only works on page-locked memory and returns an error if a pointer to pageable memory
+ is passed as input.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination host pointer
+ Source device pointer
+ Size of memory copy in bytes
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies from device memory to device memory. dstDevice and srcDevice are the base pointers of the destination
+ and source, respectively. ByteCount specifies the number of bytes to copy. Note that this function is asynchronous
+ and can optionally be associated with a stream by passing a non-zero hStream argument.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Source device pointer
+ Size of memory copy in bytes
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies from host memory to a 1D CUDA array. dstArray and dstOffset specify the CUDA array handle and
+ starting offset in bytes of the destination data. srcHost specifies the base address of the source. ByteCount
+ specifies the number of bytes to copy.
+ is asynchronous and can optionally be associated with a stream by passing a non-zero
+ hStream argument. It only works on page-locked memory and returns an error if a pointer to pageable memory
+ is passed as input.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination array
+ Offset in bytes of destination array
+ Source host pointer
+ Size of memory copy in bytes
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Copies from one 1D CUDA array to host memory. dstHost specifies the base pointer of the destination. srcArray
+ and srcOffset specify the CUDA array handle and starting offset in bytes of the source data. ByteCount specifies
+ the number of bytes to copy.
+ is asynchronous and can optionally be associated with a stream by passing a non-zero hStream
+ argument. It only works on page-locked host memory and returns an error if a pointer to pageable memory is passed
+ as input.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination pointer
+ Source array
+ Offset in bytes of source array
+ Size of memory copy in bytes
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Perform a 2D memory copy according to the parameters specified in pCopy. See .
+ returns an error if any pitch is greater than the maximum allowed ().
+ passes back pitches that always work with . On intra-device
+ memory copies (device to device, CUDA array to device, CUDA array to CUDA array), may fail
+ for pitches not computed by . (not async!) does not have this restriction, but
+ may run significantly slower in the cases where would have returned an error code.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Parameters for the memory copy
+ Stream identifier
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Perform a 3D memory copy according to the parameters specified in pCopy. See .
+ returns an error if any pitch is greater than the maximum allowed ().
+ is asynchronous and can optionally be associated with a stream by passing a non-zero hStream
+ argument. It only works on page-locked host memory and returns an error if a pointer to pageable memory is passed
+ as input.
+ The srcLOD and dstLOD members of the CUDAMemCpy3D structure must be set to 0.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Parameters for the memory copy
+ Stream identifier
CUDA Error Codes: , , , , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Note that this function may also return error codes from previous, asynchronous launches.
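The 2D copies above are driven by a descriptor struct rather than individual arguments. The sketch below shows which fields a pitched host-to-device copy needs; the struct is declared locally here with the driver-API field names, since the real descriptor lives in ManagedCuda.BasicTypes and its exact member casing is an assumption to verify.

using System;

// Local stand-in for the driver's 2D copy descriptor (illustrative only).
enum CUMemoryType { Host = 1, Device = 2, Array = 3 }

struct CudaMemcpy2DSketch
{
    public CUMemoryType srcMemoryType; public IntPtr srcHost;   public ulong srcPitch;
    public CUMemoryType dstMemoryType; public IntPtr dstDevice; public ulong dstPitch;
    public ulong WidthInBytes; public ulong Height;
}

static class Memcpy2DExample
{
    static CudaMemcpy2DSketch Describe(IntPtr hostBase, IntPtr devBase,
        ulong widthBytes, ulong height, ulong hostPitch, ulong devPitch)
    {
        // WidthInBytes/Height describe the copied region; the pitches are the
        // full row strides, which is what lets the copy walk rows on both sides.
        return new CudaMemcpy2DSketch
        {
            srcMemoryType = CUMemoryType.Host,   srcHost = hostBase,  srcPitch = hostPitch,
            dstMemoryType = CUMemoryType.Device, dstDevice = devBase, dstPitch = devPitch,
            WidthInBytes = widthBytes, Height = height,
        };
    }
}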
+ Submit a batch of \p count independent decompression operations.
+ Each of the \p count decompression operations is described by a
+ single entry in the \p paramsArray array. Once the batch has been
+ submitted, the function will return, and decompression will happen
+ asynchronously w.r.t. the CPU. To the work completion tracking
+ mechanisms in the CUDA driver, the batch will be considered a single
+ unit of work and processed according to stream semantics, i.e., it
+ is not possible to query the completion of individual decompression
+ operations within a batch.
+ The memory pointed to by each of ::CUmemDecompressParams.src,
+ ::CUmemDecompressParams.dst, and ::CUmemDecompressParams.dstActBytes
+ must be capable of usage with the hardware decompress feature. That
+ is, for each of said pointers, the pointer attribute
+ ::CU_POINTER_ATTRIBUTE_IS_MEM_DECOMPRESS_CAPABLE should give a
+ non-zero value. To ensure this, the memory backing the pointers
+ should have been allocated using one of the following CUDA memory
+ allocators:
+ ::cuMemAlloc()
+ ::cuMemCreate() with the usage flag ::CU_MEM_CREATE_USAGE_HW_DECOMPRESS
+ ::cuMemAllocFromPoolAsync() from a pool that was created with
+ the usage flag ::CU_MEM_POOL_CREATE_USAGE_HW_DECOMPRESS
+ Additionally, ::CUmemDecompressParams.src, ::CUmemDecompressParams.dst,
+ and ::CUmemDecompressParams.dstActBytes must all be accessible from
+ the device associated with the context where \p stream was created.
+ For information on how to ensure this, see the documentation for the
+ allocator of interest.
+
+ The array of structures describing the independent decompression operations.
+ The number of entries in the \p paramsArray array.
+ Must be 0.
+ The index into \p paramsArray of the decompression operation to which the error returned by this
+ function pertains. If \p index is SIZE_MAX and the value returned is not ::CUDA_SUCCESS, then the
+ error returned by this function should be considered a general error that does not pertain to a
+ particular decompression operation. May be \p NULL, in which case no index will be recorded in the
+ event of error.
+ The stream where the work will be enqueued.
+ Combines all memset API calls
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the memory range of N 8-bit values to the specified value b.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Value to set
+ Number of elements
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the memory range of N 16-bit values to the specified value us.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Value to set
+ Number of elements
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the memory range of N 32-bit values to the specified value ui.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Value to set
+ Number of elements
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the 2D memory range of Width 8-bit values to the specified value b. Height specifies the number of rows to
+ set, and dstPitch specifies the number of bytes between each row. This function performs fastest when the pitch is
+ one that has been passed back by .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Pitch of destination device pointer
+ Value to set
+ Width of row
+ Number of rows
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the 2D memory range of Width 16-bit values to the specified value us. Height specifies the number of rows to
+ set, and dstPitch specifies the number of bytes between each row. This function performs fastest when the pitch is
+ one that has been passed back by .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Pitch of destination device pointer
+ Value to set
+ Width of row
+ Number of rows
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the 2D memory range of Width 32-bit values to the specified value ui. Height specifies the number of rows to
+ set, and dstPitch specifies the number of bytes between each row. This function performs fastest when the pitch is
+ one that has been passed back by .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Pitch of destination device pointer
+ Value to set
+ Width of row
+ Number of rows
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
+ Combines all async memset API calls
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the memory range of N 8-bit values to the specified value b.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Value to set
+ Number of elements
+ Stream identifier
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the memory range of N 16-bit values to the specified value us.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Value to set
+ Number of elements
+ Stream identifier
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the memory range of N 32-bit values to the specified value ui.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Value to set
+ Number of elements
+ Stream identifier
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the 2D memory range of Width 8-bit values to the specified value b. Height specifies the number of rows to
+ set, and dstPitch specifies the number of bytes between each row. This function performs fastest when the pitch is
+ one that has been passed back by .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Pitch of destination device pointer
+ Value to set
+ Width of row
+ Number of rows
+ Stream identifier
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the 2D memory range of Width 16-bit values to the specified value us. Height specifies the number of rows to
+ set, and dstPitch specifies the number of bytes between each row. This function performs fastest when the pitch is
+ one that has been passed back by .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Pitch of destination device pointer
+ Value to set
+ Width of row
+ Number of rows
+ Stream identifier
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the 2D memory range of Width 32-bit values to the specified value ui. Height specifies the number of rows to
+ set, and dstPitch specifies the number of bytes between each row. This function performs fastest when the pitch is
+ one that has been passed back by .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Destination device pointer
+ Pitch of destination device pointer
+ Value to set
+ Width of row
+ Number of rows
+ Stream identifier
CUDA Error Codes: , , , , .
Note that this function may also return error codes from previous, asynchronous launches.
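As a quick illustration of the memset family from the managed side, the sketch below uses CudaDeviceVariable<T>.Memset, which dispatches to the 8-, 16- or 32-bit driver fill depending on the overload; the Memset overload names are assumptions about the high-level wrapper surface to verify against the ManagedCuda version in use.

using ManagedCuda;

class MemsetSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        using (var data = new CudaDeviceVariable<uint>(4096))
        {
            // 32-bit fill: every element of the range becomes 0xDEADBEEF
            // (the 32-bit memset documented above).
            data.Memset(0xDEADBEEFu);

            // 8-bit fill: every byte of the range becomes zero.
            data.Memset((byte)0);
        }
    }
}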
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Combines all function / kernel API calls
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Specifies the x, y, and z dimensions of the thread blocks that are created when the kernel given by hfunc is launched.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Kernel to specify dimensions of
+ X dimension
+ Y dimension
+ Z dimension
CUDA Error Codes: , , ,
- , .
+ , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets through bytes the amount of dynamic shared memory that will be available to each thread block when the kernel
+ given by hfunc is launched.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Kernel to specify dynamic shared-memory size for
+ Dynamic shared-memory size per thread block in bytes
CUDA Error Codes: , , ,
- , .
+ , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns in pi the integer value of the attribute attrib on the kernel given by hfunc. See .
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Returned attribute value
+ Attribute requested
+ Function to query attribute of
CUDA Error Codes: , , ,
- , .
+ , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets information about a function
+ This call sets the value of a specified attribute \p attrib on the kernel given
+ by \p hfunc to an integer value specified by \p val.
+ This function returns CUDA_SUCCESS if the new value of the attribute could be
+ successfully set. If the set fails, this call will return an error.
+ Not all attributes can have values set. Attempting to set a value on a read-only
+ attribute will result in an error (CUDA_ERROR_INVALID_VALUE).
+ Supported attributes for the cuFuncSetAttribute call are:
+ ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This is the maximum size in bytes of
+ dynamically-allocated shared memory. The value should contain the requested
+ maximum size of dynamically-allocated shared memory. The sum of this value and
+ the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+ device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+ The maximal size of requestable dynamic shared memory may differ by GPU
+ architecture.
+ ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+ cache and shared memory use the same hardware resources, this sets the shared memory
+ carveout preference, in percent of the total resources. This is only a hint, and the
+ driver can choose a different ratio if required to execute the function.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Function to set the attribute for
+ Attribute requested
+ The value to set
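For context, the block-shape, shared-size and attribute calls documented above are what ManagedCuda's CudaKernel wrapper drives underneath. A sketch of the managed equivalent follows; "kernels.ptx", "myKernel" and the DynamicSharedMemory property name are illustrative assumptions to check against the wrapper, not values from this diff.

using ManagedCuda;
using ManagedCuda.VectorTypes;

class LaunchConfigSketch
{
    static void Main()
    {
        using (var ctx = new CudaContext(0))
        {
            // Hypothetical module and kernel names, for illustration only.
            CudaKernel k = ctx.LoadKernel("kernels.ptx", "myKernel");

            // Thread-block and grid dimensions for the launch.
            k.BlockDimensions = new dim3(256, 1, 1);
            k.GridDimensions = new dim3(64, 1, 1);

            // Dynamic shared memory per block in bytes; the property name is
            // an assumption about the wrapper surface.
            k.DynamicSharedMemory = 4096;

            using (var buf = new CudaDeviceVariable<float>(256 * 64))
            {
                k.Run(buf.DevicePointer); // synchronous launch
            }
        }
    }
}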
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ On devices where the L1 cache and shared memory use the same hardware resources, this sets through config
+ the preferred cache configuration for the device function hfunc. This is only a preference. The driver will use the
+ requested configuration if possible, but it is free to choose a different configuration if required to execute hfunc.
+ This setting does nothing on devices where the size of the L1 cache and shared memory are fixed.
+ Switching between configuration modes may insert a device-side synchronization point for streamed kernel launches.
+ The supported cache modes are defined in
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Kernel to configure cache for
+ Requested cache configuration
CUDA Error Codes: , , ,
- , .
+ .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Sets the shared memory configuration for a device function.
+ On devices with configurable shared memory banks, this function will
+ force all subsequent launches of the specified device function to have
+ the given shared memory bank size configuration. On any given launch of the
+ function, the shared memory configuration of the device will be temporarily
+ changed if needed to suit the function's preferred configuration. Changes in
+ shared memory configuration between subsequent launches of functions
+ may introduce a device-side synchronization point.
+ Any per-function setting of shared memory bank size set via
+ will override the context-wide setting set with
+ .
+ Changing the shared memory bank size will not increase shared memory usage
+ or affect occupancy of kernels, but may have major effects on performance.
+ Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ but will change what kinds of accesses to shared memory will result in bank
+ conflicts.
+ This function will do nothing on devices with fixed shared memory bank size.
+ The supported bank configurations are:
+ - : set bank width to the default initial
+ setting (currently, four bytes).
+ - : set shared memory bank width to
+ be natively four bytes.
+ - : set shared memory bank width to
+ be natively eight bytes.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ kernel to be given a shared memory config
+ requested shared memory configuration
+ CUDA Error Codes: , , , ,
+ .
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns a module handle
+ Returns in \p *hmod the handle of the module that function \p hfunc
+ is located in. The lifetime of the module corresponds to the lifetime of
+ the context it was loaded in or until the module is explicitly unloaded.
+ The CUDA runtime manages its own modules loaded into the primary context.
+ If the handle returned by this API refers to a module loaded by the CUDA runtime,
+ calling ::cuModuleUnload() on that module will result in undefined behavior.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+
+
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns the function name for a ::CUfunction handle
+ Returns in \p name the function name associated with the function handle \p hfunc.
+ The function name is returned as a null-terminated string. The returned name is only
+ valid when the function handle is valid. If the module is unloaded or reloaded, one
+ must call the API again to get the updated name. This API may return a mangled name if
+ the function is not declared as having C linkage. If either \p name or \p hfunc
+ is NULL, ::CUDA_ERROR_INVALID_VALUE is returned.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ The returned name of the function
+ The function handle to retrieve the name for
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns the offset and size of a kernel parameter in the device-side parameter layout
+ Queries the kernel parameter at \p paramIndex into \p func's list of parameters, and returns
+ in \p paramOffset and \p paramSize the offset and size, respectively, where the parameter
+ will reside in the device-side parameter layout. This information can be used to update kernel
+ node parameters from the device via ::cudaGraphKernelNodeSetParam() and
+ ::cudaGraphKernelNodeUpdatesApply(). \p paramIndex must be less than the number of parameters
+ that \p func takes. \p paramSize can be set to NULL if only the parameter offset is desired.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ The function to query
+ The parameter index to query
+ Returns the offset into the device-side parameter layout at which the parameter resides
+ Optionally returns the size of the parameter in the device-side parameter layout
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns whether the function is loaded
+ Returns in \p state the loading state of \p function.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ returned loading state
+ the function to check
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Loads a function
+ Finalizes function loading for \p function. Calling this API with a fully loaded function has no effect.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ the function to load
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Combines all array management API calls
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Creates a CUDA array according to the structure pAllocateArray and returns a
+ handle to the new CUDA array in pHandle.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Returned array
+ Array descriptor
CUDA Error Codes: , , ,
- , .
+ , , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns in pArrayDescriptor a descriptor containing information on the format and dimensions of the CUDA
+ array hArray. It is useful for subroutines that have been passed a CUDA array, but need to know the CUDA array
+ parameters for validation or other purposes.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
+ Returned array descriptor
+ Array to get descriptor of
CUDA Error Codes: , , ,
- , .
+ , , .
Note that this function may also return error codes from previous, asynchronous launches.
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns the layout properties of a sparse CUDA array
+ Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ If the CUDA array is not allocated with the flag ::CUDA_ARRAY3D_SPARSE, ::CUDA_ERROR_INVALID_VALUE will be returned.
+ If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+ Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained
+ using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties
+ must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
- Kernel to add data to
- Offset to add data to argument list
- Pointer to arbitrary data
- Size of data to copy in bytes
- CUDA Error Codes: , , ,
- , .
- Note that this function may also return error codes from previous, asynchronous launches.
+ Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ CUDA array to get the sparse properties of
+
- Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding
- to hfunc. offset is a byte offset.
+ Returns the layout properties of a sparse CUDA mipmapped array + Returns the sparse array layout properties in \p sparseProperties + If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE + ::CUDA_ERROR_INVALID_VALUE will be returned. + For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the + size of the mip tail region.The mip tail region includes all mip levels whose width, height or depth + is less than that of the tile. + For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, + then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. + Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. + The returned value of::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES + CUDA mipmapped array to get the sparse properties of + - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Gets a CUDA array plane from a CUDA array + Returns in \p pPlaneArray a CUDA array that represents a single format plane + of the CUDA array \p hArray. + If \p planeIdx is greater than the maximum number of planes in this array or if the array does + not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then::CUDA_ERROR_INVALID_VALUE is returned. + Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns + a CUDA array of the same size as \p hArray but with one channel and::CU_AD_FORMAT_UNSIGNED_INT8 as its format. + If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width + of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned CUDA array referenced by the planeIdx + Multiplanar CUDA array + Plane index + - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Destroys the CUDA array hArray. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Array to destroy CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Creates a CUDA array according to the structure pAllocateArray and returns + a handle to the new CUDA array in pHandle. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned array + 3D array descriptor CUDA Error Codes: , , , - , . + , , , . 
Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in pArrayDescriptor a descriptor containing information on the format and dimensions of the CUDA + array hArray. It is useful for subroutines that have been passed a CUDA array, but need to know the CUDA array + parameters for validation or other purposes. + This function may be called on 1D and 2D arrays, in which case the Height and/or Depth members of the descriptor + struct will be set to 0. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned 3D array descriptor + 3D array to get descriptor of CUDA Error Codes: , , , - , . + , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in pHandle. + numMipmapLevels specifies the number of mipmap levels to be allocated. This value is + clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned mipmapped array + mipmapped array descriptor + Number of mipmap levels CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , , , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in pLevelArray a CUDA array that represents a single mipmap level + of the CUDA mipmapped array hMipmappedArray. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned mipmap level CUDA array + CUDA mipmapped array + Mipmap level CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Destroys the CUDA mipmapped array hMipmappedArray. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Mipmapped array to destroy CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , , . + + + + Returns the memory requirements of a CUDA array + Returns the memory requirements of a CUDA array in \p memoryRequirements + If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING + ::CUDA_ERROR_INVALID_VALUE will be returned. + The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size + represents the total size of the CUDA array. + The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment + represents the alignment necessary for mapping the CUDA array. 
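The mipmap-level clamp described above is easy to hit in practice. A sketch, assuming a current context, that allocates the full mip chain of a 512x512 RGBA8 array (1 + floor(log2(512)) = 10 levels) and fetches level 0:
\code
#include <cuda.h>
#include <math.h>

static CUresult mipmapSketch(void)
{
    CUDA_ARRAY3D_DESCRIPTOR d = { 0 };
    d.Width = 512;
    d.Height = 512;
    d.Depth = 0;                           /* 0 selects a 2D array */
    d.Format = CU_AD_FORMAT_UNSIGNED_INT8;
    d.NumChannels = 4;

    unsigned int levels = 1u + (unsigned int)floor(log2(512.0)); /* = 10 */

    CUmipmappedArray mip;
    CUresult rc = cuMipmappedArrayCreate(&mip, &d, levels);
    if (rc != CUDA_SUCCESS)
        return rc;

    CUarray level0;
    rc = cuMipmappedArrayGetLevel(&level0, mip, 0); /* owned by `mip` */

    cuMipmappedArrayDestroy(mip);                   /* also releases level0 */
    return rc;
}
\endcode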
+ + Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS + CUDA array to get the memory requirements of + Device to get the memory requirements for + CUDA Error Codes + + + + Returns the memory requirements of a CUDA mipmapped array + Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements + If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING + ::CUDA_ERROR_INVALID_VALUE will be returned. + The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size + represents the total size of the CUDA mipmapped array. + The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment + represents the alignment necessary for mapping the CUDA mipmapped + array. + + Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS + CUDA mipmapped array to get the memory requirements of + Device to get the memory requirements for + CUDA Error Codes - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Groups all texture reference management API calls - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Creates a texture reference and returns its handle in pTexRef. Once created, the application must call + or to associate the reference with allocated memory. Other texture reference functions + are used to specify the format and interpretation (addressing, filtering, etc.) to be used when the memory is read + through this texture reference. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned texture reference CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Destroys the texture reference specified by hTexRef. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference to destroy CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Binds the CUDA array hArray to the texture reference hTexRef. Any previous address or CUDA array state + associated with the texture reference is superseded by this function. Flags must be set to + . Any CUDA array previously bound to hTexRef is unbound. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference to bind + Array to bind + Options (must be ) CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. 
+ Binds the CUDA mipmapped array hMipmappedArray to the texture reference hTexRef. + Any previous address or CUDA array state associated with the texture reference + is superseded by this function. Flags must be set to . + Any CUDA array previously bound to hTexRef is unbound. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference to bind + Mipmapped array to bind + Options (must be ) CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Binds a linear address range to the texture reference hTexRef. Any previous address or CUDA array state associated + with the texture reference is superseded by this function. Any memory previously bound to hTexRef is unbound. + Since the hardware enforces an alignment requirement on texture base addresses, passes back + a byte offset in ByteOffset that must be applied to texture fetches in order to read from the desired memory. This + offset must be divided by the texel size and passed to kernels that read from the texture so they can be applied to the + tex1Dfetch() function. + If the device memory pointer was returned from , the offset is guaranteed to be 0 and null may be + passed as the ByteOffset parameter. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned byte offset + Texture reference to bind + Device pointer to bind + Size of memory to bind in bytes CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Binds a linear address range to the texture reference hTexRef. Any previous address or CUDA array state associated + with the texture reference is superseded by this function. Any memory previously bound to hTexRef is unbound. + Using a tex2D() function inside a kernel requires a call to either to bind the corresponding texture + reference to an array, or to bind the texture reference to linear memory. + Function calls to cannot follow calls to for the same texture reference. + It is required that dptr be aligned to the appropriate hardware-specific texture alignment. You can query this value + using the device attribute . If an unaligned dptr is supplied, + is returned. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference to bind + Descriptor of CUDA array + Device pointer to bind + Line pitch in bytes> CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Specifies the format of the data to be read by the texture reference hTexRef. fmt and NumPackedComponents + are exactly analogous to the Format and NumChannels members of the structure: + They specify the format of each component and the number of components per array element. 
- Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Format to set + Number of components per array element CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Specifies the addressing mode am for the given dimension dim of the texture reference hTexRef. If dim is zero, + the addressing mode is applied to the first parameter of the functions used to fetch from the texture; if dim is 1, the + second, and so on. See . + Note that this call has no effect if hTexRef is bound to linear memory. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Dimension + Addressing mode to set CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Specifies the filtering mode fm to be used when reading memory through the texture reference hTexRef. See . + Note that this call has no effect if hTexRef is bound to linear memory. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Filtering mode to set CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Specifies optional flags via Flags to specify the behavior of data returned through the texture reference hTexRef. See . - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Optional flags to set CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in pdptr the base address bound to the texture reference hTexRef, or returns + if the texture reference is not bound to any device memory range. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned device address + Texture reference CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in phArray the CUDA array bound to the texture reference hTexRef, or returns + if the texture reference is not bound to any CUDA array. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned array + Texture reference CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. 
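Taken together, the texture-reference setters above form one setup sequence. A sketch of binding linear memory and configuring the reference, assuming a legacy (pre-CUDA 12) toolkit where the texref API still exists and a texref obtained via cuModuleGetTexRef:
\code
#include <cuda.h>

static CUresult setupTexref(CUtexref texref, CUdeviceptr dptr, size_t bytes)
{
    /* Bind linear memory; honor the ByteOffset contract. For pointers
       returned by cuMemAlloc the offset is guaranteed to be 0. */
    size_t byteOffset = 0;
    CUresult rc = cuTexRefSetAddress(&byteOffset, texref, dptr, bytes);
    if (rc != CUDA_SUCCESS)
        return rc;

    cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1); /* scalar float texels */
    /* Addressing/filtering only take effect for array-backed texrefs, as
       noted above; they are shown here for completeness. */
    cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
    cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT);
    return CUDA_SUCCESS;
}
\endcode
If byteOffset comes back non-zero, divide it by the texel size and add it to the indices passed to tex1Dfetch(), as the ::cuTexRefSetAddress description above requires.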
- + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in phMipmappedArray the CUDA mipmapped array bound to the texture + reference hTexRef, or returns if the texture reference + is not bound to any CUDA mipmapped array. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned mipmapped array + Texture reference CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in pam the addressing mode corresponding to the dimension dim of the texture reference hTexRef. Currently, + the only valid value for dim are 0 and 1. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned addressing mode + Texture reference + Dimension CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in pfm the filtering mode of the texture reference hTexRef. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned filtering mode + Texture reference CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in pFormat and pNumChannels the format and number of components of the CUDA array bound to + the texture reference hTexRef. If pFormat or pNumChannels is null, it will be ignored. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned format + Returned number of components + Texture reference CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns in pFlags the flags of the texture reference hTexRef. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned flags + Texture reference CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns the mipmap filtering mode in pfm that's used when reading memory through + the texture reference hTexRef. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. 
+ Returned mipmap filtering mode + Texture reference + - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns the mipmap level bias in pBias that's added to the specified mipmap + level when reading memory through the texture reference hTexRef. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned mipmap level bias + Texture reference CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns the min/max mipmap level clamps in pminMipmapLevelClamp and pmaxMipmapLevelClamp + that are used when reading memory through the texture reference hTexRef. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned mipmap min level clamp + Returned mipmap max level clamp + Texture reference CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Returns the maximum anisotropy in pmaxAniso that's used when reading memory through + the texture reference. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Returned maximum anisotropy + Texture reference CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Specifies the mipmap filtering mode fm to be used when reading memory through + the texture reference hTexRef. + Note that this call has no effect if hTexRef is not bound to a mipmapped array. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Filtering mode to set CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Specifies the mipmap level bias bias to be added to the specified mipmap level when + reading memory through the texture reference hTexRef. + Note that this call has no effect if hTexRef is not bound to a mipmapped array. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Mipmap level bias CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset.
+ Specifies the min/max mipmap level clamps, minMipmapLevelClamp and maxMipmapLevelClamp + respectively, to be used when reading memory through the texture reference + hTexRef. + Note that this call has no effect if hTexRef is not bound to a mipmapped array. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Mipmap min level clamp + Mipmap max level clamp CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Specifies the maximum anisotropy maxAniso to be used when reading memory through + the texture reference hTexRef. + Note that this call has no effect if hTexRef is not bound to a mipmapped array. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Texture reference + Maximum anisotropy CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + , . - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Sets the border color for a texture reference + Specifies the value of the RGBA color via \p pBorderColor to the texture reference + \p hTexRef. The color value supports only float type and holds color components in + the following sequence: + pBorderColor[0] holds 'R' component + pBorderColor[1] holds 'G' component + pBorderColor[2] holds 'B' component + pBorderColor[3] holds 'A' component + + Note that the color values can be set only when the Address mode is set to + CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. + Applications using integer border color values have to "reinterpret_cast" their values to float. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Texture reference + RGBA color - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Gets the border color used by a texture reference + Returns in \p pBorderColor the values of the RGBA color used by + the texture reference \p hTexRef. + The color value is of type float and holds color components in + the following sequence: + pBorderColor[0] holds 'R' component + pBorderColor[1] holds 'G' component + pBorderColor[2] holds 'B' component + pBorderColor[3] holds 'A' component - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. + Returned Type and Value of RGBA color + Texture reference + - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Combines all surface management API calls - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes - CUDA Error Codes: , , , - , .
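A small sketch of the border-color rules just described: the color is only honored once the addressing mode is CU_TR_ADDRESS_MODE_BORDER (legacy texref API, array-backed reference assumed):
\code
#include <cuda.h>

static void redBorder(CUtexref texref)
{
    float rgba[4] = { 1.0f, 0.0f, 0.0f, 1.0f }; /* R, G, B, A */
    cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_BORDER);
    cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_BORDER);
    cuTexRefSetBorderColor(texref, rgba); /* out-of-range fetches return red */
}
\endcode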
- Note that this function may also return error codes from previous, asynchronous launches. - + - Copies an arbitrary amount of data (specified in numbytes) from ptr into the parameter space of the kernel corresponding - to hfunc. offset is a byte offset. + Sets the CUDA array hArray to be read and written by the surface reference hSurfRef. Any previous CUDA array + state associated with the surface reference is superseded by this function. Flags must be set to . The + flag must have been set for the CUDA array. Any CUDA array previously bound to + hSurfRef is unbound. - Kernel to add data to - Offset to add data to argument list - Pointer to arbitrary data - Size of data to copy in bytes + Surface reference handle + CUDA array handle + set to CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. - + - Makes the CUDA array or linear memory bound to the texture reference hTexRef available to a device program as a - texture. In this version of CUDA, the texture-reference must be obtained via and the texunit - parameter must be set to . + Returns in phArray the CUDA array bound to the surface reference hSurfRef, or returns + if the surface reference is not bound to any CUDA array. - Kernel to add texture-reference to - Texture unit (must be ) - Texture-reference to add to argument list + Returned array + Surface reference handle CUDA Error Codes: , , , , . Note that this function may also return error codes from previous, asynchronous launches. @@ -96380,6 +117847,92 @@ The function to call once preceding stream operations are complete User-specified data to be passed to the function + + + Launches a CUDA function ::CUfunction or a CUDA kernel ::CUkernel with launch-time configuration + Invokes the function ::CUfunction or the kernel ::CUkernel \p f with the specified launch-time configuration \p config. + The ::CUlaunchConfig structure is defined as: + \code + typedef struct CUlaunchConfig_st + { + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + CUlaunchAttribute* attrs; + unsigned int numAttrs; + } CUlaunchConfig; + \endcode + where: + - ::CUlaunchConfig::gridDimX is the width of the grid in blocks. + - ::CUlaunchConfig::gridDimY is the height of the grid in blocks. + - ::CUlaunchConfig::gridDimZ is the depth of the grid in blocks. + - ::CUlaunchConfig::blockDimX is the X dimension of each thread block. + - ::CUlaunchConfig::blockDimY is the Y dimension of each thread block. + - ::CUlaunchConfig::blockDimZ is the Z dimension of each thread block. + - ::CUlaunchConfig::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. + - ::CUlaunchConfig::hStream is the handle to the stream to perform the launch + in. The CUDA context associated with this stream must match that associated with function f. + - ::CUlaunchConfig::attrs is an array of ::CUlaunchConfig::numAttrs contiguous ::CUlaunchAttribute elements. The value of this pointer is not + considered if ::CUlaunchConfig::numAttrs is zero. However, in that case, it is recommended to set the pointer to NULL. + - ::CUlaunchConfig::numAttrs is the number of attributes populating the first ::CUlaunchConfig::numAttrs positions of the ::CUlaunchConfig::attrs array.
+ + Launch-time configuration is specified by adding entries to ::CUlaunchConfig::attrs. Each entry is an attribute ID and a corresponding attribute value. + + The ::CUlaunchAttribute structure is defined as: + \code + typedef struct CUlaunchAttribute_st + { + CUlaunchAttributeID id; + CUlaunchAttributeValue value; + } CUlaunchAttribute; + \endcode + where: + - ::CUlaunchAttribute::id is a unique enum identifying the attribute. + - ::CUlaunchAttribute::value is a union that holds the attribute value. + + Setting ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE to a non-zero value causes the + kernel launch to be a cooperative launch, with exactly the same usage and + semantics of ::cuLaunchCooperativeKernel. + Setting ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION to a non-zero + value causes the kernel to use programmatic means to resolve its stream + dependency -- enabling the CUDA runtime to opportunistically allow the grid's + execution to overlap with the previous kernel in the stream, if that kernel + requests the overlap. + ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT records an event along with the + kernel launch. The event recorded through this launch attribute is guaranteed to + only trigger after all blocks in the associated kernel trigger the event. A + block can trigger the event through PTX launchdep.release or CUDA builtin + function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be + inserted at the beginning of each block's execution if triggerAtBlockStart is + set to non-zero. Note that dependents (including the CPU thread calling + cuEventSynchronize()) are not guaranteed to observe the release precisely + when it is released. For example, cuEventSynchronize() may only observe the + event trigger long after the associated kernel has completed. This recording + type is primarily meant for establishing programmatic dependency between + device tasks. The event supplied must not be an interprocess or interop + event. The event must disable timing (i.e. created with + ::CU_EVENT_DISABLE_TIMING flag set). + The effect of other attributes is consistent with their effect when set via + persistent APIs. + Kernel parameters to \p f can be specified in the same ways that they can be + using ::cuLaunchKernel. + Note that the API can also be used to launch context-less kernel ::CUkernel + by querying the handle using ::cuLibraryGetKernel() and then passing it + to the API by casting to ::CUfunction. Here, the context to launch + the kernel on will either be taken from the specified stream ::CUlaunchConfig::hStream + or the current context in case of NULL stream. + + Config to launch + Function ::CUfunction or Kernel ::CUkernel to launch + Array of pointers to kernel parameters + Extra options + + Groups all event API calls @@ -96410,6 +117963,28 @@ , , . Note that this function may also return error codes from previous, asynchronous launches. + + + Records an event + Captures in \p hEvent the contents of \p hStream at the time of this call. + \p hEvent and \p hStream must be from the same context. + Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + examine or wait for completion of the work that was captured. Uses of + \p hStream after this call do not modify \p hEvent. See note on default + stream behavior for what is captured in the default case.
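A sketch of the launch-time configuration machinery just described: a cooperative launch through ::cuLaunchKernelEx, assuming a CUDA 12+ toolkit, an existing CUfunction/CUstream, and a kernel that takes a pointer and a count:
\code
#include <cuda.h>

static CUresult launchCooperative(CUfunction f, CUstream stream,
                                  CUdeviceptr data, int n)
{
    CUlaunchAttribute attr;
    attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
    attr.value.cooperative = 1; /* same semantics as cuLaunchCooperativeKernel */

    CUlaunchConfig cfg = { 0 };
    cfg.gridDimX = 64;   cfg.gridDimY = 1; cfg.gridDimZ = 1;
    cfg.blockDimX = 256; cfg.blockDimY = 1; cfg.blockDimZ = 1;
    cfg.sharedMemBytes = 0;
    cfg.hStream = stream;       /* context must match that of `f` */
    cfg.attrs = &attr;
    cfg.numAttrs = 1;

    void *params[] = { &data, &n };
    return cuLaunchKernelEx(&cfg, f, params, NULL);
}
\endcode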
+ ::cuEventRecordWithFlags() can be called multiple times on the same event and + will overwrite the previously captured state. Other APIs such as + ::cuStreamWaitEvent() use the most recently captured state at the time + of the API call, and are not affected by later calls to + ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an + event represents an empty set of work, so for example ::cuEventQuery() + would return ::CUDA_SUCCESS. + + Event to record + Stream to record event for + See ::CUevent_capture_flags + + Returns if the event has actually been recorded, or if not. If @@ -96432,20 +118007,11 @@ , , . Note that this function may also return error codes from previous, asynchronous launches. - - - Destroys the event specified by event. - - Event to destroy - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. - Destroys the event specified by event. In the case that hEvent has been recorded but has not yet been completed - when is called, the function will return immediately and + when is called, the function will return immediately and the resources associated with hEvent will be released automatically once the device has completed hEvent. @@ -96467,6 +118033,34 @@ , , . Note that this function may also return error codes from previous, asynchronous launches. + + + Computes the elapsed time between two events + Computes the elapsed time between two events (in milliseconds with a + resolution of around 0.5 microseconds). Note this API is not guaranteed + to return the latest errors for pending work. As such this API is intended to + serve as an elapsed time calculation only and any polling for completion on the + events to be compared should be done with ::cuEventQuery instead. + If either event was last recorded in a non-NULL stream, the resulting time + may be greater than expected (even if both used the same stream handle). This + happens because the ::cuEventRecord() operation takes place asynchronously + and there is no guarantee that the measured latency is actually just between + the two events. Any number of other different stream operations could execute + in between the two measured events, thus altering the timing in a significant + way. + If ::cuEventRecord() has not been called on either event then + ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + on both events but one or both of them has not yet been completed (that is, + ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + ::CUDA_ERROR_INVALID_HANDLE. + + Time between \p hStart and \p hEnd in ms + Starting event + Ending event + + Wait on a memory location @@ -96620,20 +118214,11 @@ , . Note that this function may also return error codes from previous, asynchronous launches. - - - Destroys the stream specified by hStream. - - Stream to destroy - CUDA Error Codes: , , , - , . - Note that this function may also return error codes from previous, asynchronous launches. - Destroys the stream specified by hStream. In the case that the device is still doing work in the stream hStream - when is called, the function will return immediately + when is called, the function will return immediately and the resources associated with hStream will be released automatically once the device has completed all work in hStream.
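The elapsed-time caveats above are easiest to see in a complete timing sketch (the events must not be created with CU_EVENT_DISABLE_TIMING, and the region should be bracketed on a single stream):
\code
#include <cuda.h>
#include <stdio.h>

static void timeRegion(CUstream stream)
{
    CUevent start, stop;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);

    cuEventRecord(start, stream);
    /* ... enqueue the kernels/copies to be timed on `stream` ... */
    cuEventRecord(stop, stream);

    cuEventSynchronize(stop);             /* wait for completion, not just poll */
    float ms = 0.0f;
    cuEventElapsedTime(&ms, start, stop); /* ~0.5 us resolution */
    printf("elapsed: %.3f ms\n", ms);

    cuEventDestroy(start);
    cuEventDestroy(stop);
}
\endcode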
@@ -96642,6 +118227,36 @@ , . Note that this function may also return error codes from previous, asynchronous launches. + + + Copies attributes from source stream to destination stream + Copies attributes from source stream \p src to destination stream \p dst. + Both streams must have the same context. + + Destination stream + Source stream + + + + Queries stream attribute. + Queries attribute \p attr from \p hStream and stores it in corresponding member of \p value_out. + + + + + + + + Sets stream attribute. + Sets attribute \p attr on \p hStream from corresponding attribute of + value.The updated attribute will be applied to subsequent work + submitted to the stream. It will not affect previously submitted work. + + + + + + Make a compute stream wait on an event @@ -96651,7 +118266,7 @@ The stream hStream will wait only for the completion of the most recent host call to on hEvent. Once this call has returned, - any functions (including and may be + any functions (including and may be called on hEvent again, and the subsequent calls will not have any effect on hStream. @@ -96733,6 +118348,15 @@ Pointer to a signed integer in which the stream's priority is returned + + + Returns the device handle of the stream + Returns in \p* device the device handle of the stream + + Handle to the stream to be queried + Returns the device to which a stream belongs + + Query the flags of a given stream @@ -96755,6 +118379,39 @@ Returned context associated with the stream + + + Query the contexts associated with a stream + Returns the contexts that the stream is associated with. + + If the stream is associated with a green context, the API returns the green context in \p pGreenCtx + and the primary context of the associated device in \p pCtx. + + If the stream is associated with a regular context, the API returns the regular context in \p pCtx + and NULL in \p pGreenCtx. + + The stream handle \p hStream can refer to any of the following: + + - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate, + ::cuStreamCreateWithPriority and ::cuGreenCtxStreamCreate, or their runtime API equivalents such as + ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. + Passing an invalid handle will result in undefined behavior. + - any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and + ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, + which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. + If any of the special handles are specified, the API will operate on the context current to the + calling thread. If a green context (that was converted via::cuCtxFromGreenCtx() before setting it current) + is current to the calling thread, the API will return the green context in \p pGreenCtx + and the primary context of the associated device in \p pCtx.If a regular context is current, + the API returns the regular context in \p pCtx and NULL in \p pGreenCtx. + Note that specifying::CU_STREAM_PER_THREAD or ::cudaStreamPerThread will return ::CUDA_ERROR_INVALID_HANDLE + if a green context is current to the calling thread. + If no context is current to the calling thread, ::CUDA_ERROR_INVALID_CONTEXT is returned. 
+ + Handle to the stream to be queried + Returned regular context associated with the stream + Returned green context if the stream is associated with a green context or NULL if not + Attach memory to a stream asynchronously @@ -96813,7 +118470,7 @@ Must be one of - + Begins graph capture on a stream Begin graph capture on \p hStream. When a stream is in capture mode, all operations @@ -96824,6 +118481,33 @@ mode.The capture mode may be queried via ::cuStreamIsCapturing. Stream in which to initiate capture + Controls the interaction of this capture sequence with other API calls that are potentially unsafe. For more details see ::cuThreadExchangeStreamCaptureMode. + Kernels captured using this API must not use texture and surface references. + Reading or writing through any texture or surface reference is undefined + behavior.This restriction does not apply to texture and surface objects. + + + + Begins graph capture on a stream to an existing graph + Begin graph capture on \p hStream, placing new nodes into an existing graph. When a stream is + in capture mode, all operations pushed into the stream will not be executed, but will instead + be captured into \p hGraph.The graph will not be instantiable until the user calls + ::cuStreamEndCapture. + Capture may not be initiated if \p stream is CU_STREAM_LEGACY.Capture must be ended on the + same stream in which it was initiated, and it may only be initiated if the stream is not + already in capture mode. The capture mode may be queried via::cuStreamIsCapturing.A unique id + representing the capture sequence may be queried via::cuStreamGetCaptureInfo. + If \p mode is not::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be + called on this stream from the same thread. + + Stream in which to initiate capture. + Graph to capture into. + Dependencies of the first node captured in the stream. Can be NULL if numDependencies is 0. + Optional array of data associated with each dependency. + Number of dependencies. + Controls the interaction of this capture sequence with other API + calls that are potentially unsafe. For more details see + ::cuThreadExchangeStreamCaptureMode. Kernels captured using this API must not use texture and surface references. Reading or writing through any texture or surface reference is undefined behavior.This restriction does not apply to texture and surface objects. @@ -96863,6 +118547,185 @@ Stream to query Returns the stream's capture status + + + Swaps the stream capture interaction mode for a thread + Sets the calling thread's stream capture interaction mode to the value contained + in \p* mode, and overwrites \p* mode with the previous mode for the thread.To + facilitate deterministic behavior across function or module boundaries, callers + are encouraged to use this API in a push-pop fashion: \code + CUstreamCaptureMode mode = desiredMode; + cuThreadExchangeStreamCaptureMode(&mode); + ... + cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode + \endcode + During stream capture(see::cuStreamBeginCapture), some actions, such as a call + to::cudaMalloc, may be unsafe. In the case of::cudaMalloc, the operation is + not enqueued asynchronously to a stream, and is not observed by stream capture. + Therefore, if the sequence of operations captured via ::cuStreamBeginCapture + depended on the allocation being replayed whenever the graph is launched, the + captured graph would be invalid. 
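A minimal capture sketch for the begin/end pair documented above (the stream must not be the legacy NULL stream; work enqueued between the two calls is recorded into the graph rather than executed):
\code
#include <cuda.h>

static CUresult captureToGraph(CUstream stream, CUgraph *graphOut)
{
    CUresult rc = cuStreamBeginCapture(stream, CU_STREAM_CAPTURE_MODE_GLOBAL);
    if (rc != CUDA_SUCCESS)
        return rc;

    /* ... enqueue the work to be captured on `stream` here ... */

    return cuStreamEndCapture(stream, graphOut); /* graph is now instantiable */
}
\endcode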
+ Therefore, stream capture places restrictions on API calls that can be made within + or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This + behavior can be controlled via this API and flags to ::cuStreamBeginCapture. + A thread's mode is one of the following: + - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode.If the local thread has + an ongoing capture sequence that was not initiated with + \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread + has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, + this thread is prohibited from potentially unsafe API calls. + - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture + sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited + from potentially unsafe API calls.Concurrent capture sequences in other threads + are ignored. + - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially + unsafe API calls.Note that the thread is still prohibited from API calls which + necessarily conflict with stream capture, for example, attempting::cuEventQuery + on an event that was last recorded inside a capture sequence. + + + + + + Query a stream's capture state (11.3+) + Query stream state related to stream capture. + + If called on ::CU_STREAM_LEGACY(the "null stream") while a stream not created + with::CU_STREAM_NON_BLOCKING is capturing, returns::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. + + Valid data(other than capture status) is returned only if both of the following are true: + - the call returns CUDA_SUCCESS + - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE + + + The stream to query + captureStatus_out - Location to return the capture status of the stream; required + Optional location to return an id for the capture sequence, which is unique over the lifetime of the process + Optional location to return the graph being captured into. All operations other than destroy and node removal are permitted on the graph + while the capture sequence is in progress.This API does not transfer + ownership of the graph, which is transferred or destroyed at + ::cuStreamEndCapture.Note that the graph handle may be invalidated before + end of capture for certain errors.Nodes that are or become + unreachable from the original stream at ::cuStreamEndCapture due to direct + actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. + Optional location to store a pointer to an array of nodes. The next node to be captured in the stream will depend on this set of nodes, + absent operations such as event wait which modify this set.The array pointer + is valid until the next API call which operates on the stream or until end of + capture. The node handles may be copied out and are valid until they or the + graph is destroyed.The driver-owned array may also be passed directly to + APIs that operate on the graph (not the stream) without copying. + Optional location to store the size of the array returned in dependencies_out. + + + + Query a stream's capture state (12.3+) + Query stream state related to stream capture. + If called on ::CU_STREAM_LEGACY(the "null stream") while a stream not created + with::CU_STREAM_NON_BLOCKING is capturing, returns::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. 
+ Valid data(other than capture status) is returned only if both of the following are true: + - the call returns CUDA_SUCCESS + - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE + If \p edgeData_out is non-NULL then \p dependencies_out must be as well.If + \p dependencies_out is non-NULL and \p edgeData_out is NULL, but there is non-zero edge + data for one or more of the current stream dependencies, the call will return + ::CUDA_ERROR_LOSSY_QUERY. + + The stream to query + Location to return the capture status of the stream; required + Optional location to return an id for the capture sequence, which is unique over the lifetime of the process + Optional location to return the graph being captured into. All + operations other than destroy and node removal are permitted on the graph + while the capture sequence is in progress.This API does not transfer + ownership of the graph, which is transferred or destroyed at + ::cuStreamEndCapture.Note that the graph handle may be invalidated before + end of capture for certain errors.Nodes that are or become + unreachable from the original stream at ::cuStreamEndCapture due to direct + actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. + Optional location to store a pointer to an array of nodes. + The next node to be captured in the stream will depend on this set of nodes, + absent operations such as event wait which modify this set.The array pointer + is valid until the next API call which operates on the stream or until the + capture is terminated.The node handles may be copied out and are valid until + they or the graph is destroyed.The driver-owned array may also be passed + directly to APIs that operate on the graph (not the stream) without copying. + Optional location to store a pointer to an array of graph edge + data.This array parallels \c dependencies_out; the next node to be added + has an edge to \c dependencies_out[i] with annotation \c edgeData_out[i] for + each \c i.The array pointer is valid until the next API call which operates + on the stream or until the capture is terminated. + Optional location to store the size of the array returned in dependencies_out. + + + + Update the set of dependencies in a capturing stream (11.3+) + Modifies the dependency set of a capturing stream. The dependency set is the set of nodes that the next captured node in the stream will depend on. + Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and + ::CU_STREAM_SET_CAPTURE_DEPENDENCIES.These control whether the set passed to + the API is added to the existing set or replaces it.A flags value of 0 defaults + to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. + Nodes that are removed from the dependency set via this API do not result in + ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at + ::cuStreamEndCapture. + Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. + This API is new in CUDA 11.3. Developers requiring compatibility across minor + versions to CUDA 11.0 should not use this API or provide a fallback. + + + + + + + + \brief Update the set of dependencies in a capturing stream (12.3+) + + Modifies the dependency set of a capturing stream. The dependency set is the set + of nodes that the next captured node in the stream will depend on along with the + edge data for those dependencies. + + Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and + ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. 
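A sketch of the 11.3+ query form described above, reporting only whether a stream is actively capturing (the graph and dependency out-parameters are optional and omitted here):
\code
#include <cuda.h>

static int isCapturing(CUstream stream)
{
    CUstreamCaptureStatus status;
    cuuint64_t id = 0;

    if (cuStreamGetCaptureInfo(stream, &status, &id,
                               NULL, NULL, NULL) != CUDA_SUCCESS)
        return 0;
    return status == CU_STREAM_CAPTURE_STATUS_ACTIVE;
}
\endcode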
These control whether the set passed to + the API is added to the existing set or replaces it. A flags value of 0 defaults + to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. + + Nodes that are removed from the dependency set via this API do not result in + ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at + ::cuStreamEndCapture. + + Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. + + \param hStream - The stream to update + \param dependencies - The set of dependencies to add + \param dependencyData - Optional array of data associated with each dependency. + \param numDependencies - The size of the dependencies array + \param flags - See above + + \return + ::CUDA_SUCCESS, + ::CUDA_ERROR_INVALID_VALUE, + ::CUDA_ERROR_ILLEGAL_STATE + + \sa + ::cuStreamBeginCapture, + ::cuStreamGetCaptureInfo, + + + + Returns the unique Id associated with the stream handle supplied + Returns in \p streamId the unique Id which is associated with the given stream handle. + The Id is unique for the life of the program for this instance of CUDA. + The stream handle \p hStream can refer to any of the following: + - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate + and ::cuStreamCreateWithPriority, or their runtime API equivalents such as + ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. + Passing an invalid handle will result in undefined behavior. + - any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and + ::CU_STREAM_PER_THREAD.The runtime API equivalents of these are also accepted, + which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. + + Handle to the stream to be queried + Pointer to store the Id of the stream + + Combines all graphics interop API calls @@ -97481,12 +119344,58 @@ Options + + + Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM + Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. + + Returned maximum dynamic shared memory + Kernel function for which occupancy is calculated + Number of blocks to fit on SM + Size of the blocks + + + + Given the kernel function (\p func) and launch configuration + (\p config), return the maximum cluster size in \p* clusterSize. + The cluster dimensions in \p config are ignored. If func has a required + cluster size set (see::cudaFuncGetAttributes / ::cuFuncGetAttribute),\p + clusterSize will reflect the required cluster size. + By default this function will always return a value that's portable on + future hardware. A higher value may be returned if the kernel function + allows non-portable cluster sizes. + This function will respect the compile time launch bounds. + + Returned maximum cluster size that can be launched for the given kernel function and launch configuration + Kernel function for which maximum cluster size is calculated + Launch configuration for the given kernel function + + + + + Given the kernel function (\p func) and launch configuration + (\p config), return the maximum number of clusters that could co-exist + on the target device in \p* numClusters. + If the function has required cluster size already set (see + ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size + from config must either be unspecified or match the required size. + Without required sizes, the cluster size must be specified in config, + else the function will return an error. 
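For the dynamic shared-memory occupancy query documented above, a small sketch: how much dynamic shared memory each block could still receive if numBlocks blocks of blockSize threads are to be resident per SM:
\code
#include <cuda.h>

static size_t availableDynamicSmem(CUfunction f, int numBlocks, int blockSize)
{
    size_t bytes = 0;
    if (cuOccupancyAvailableDynamicSMemPerBlock(&bytes, f,
                                                numBlocks, blockSize) != CUDA_SUCCESS)
        return 0; /* treat query failure as "no shared memory available" */
    return bytes;
}
\endcode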
+ Note that various attributes of the kernel function may affect occupancy + calculation. Runtime environment may affect how the hardware schedules + the clusters, so the calculated occupancy is not guaranteed to be achievable. + + Returned maximum number of clusters that could co-exist on the target device + Kernel function for which maximum number of clusters are calculated + Launch configuration for the given kernel function + + - + Imports an external memory object Imports an externally allocated memory object and returns a handle to that in \p extMem_out. @@ -97495,7 +119404,7 @@ Memory import handle descriptor - + Maps a buffer onto an imported memory object Maps a buffer onto an imported memory object and returns a device pointer in \p devPtr. @@ -97504,7 +119413,7 @@ Handle to external memory object Buffer descriptor - + Maps a CUDA mipmapped array onto an external memory object Maps a CUDA mipmapped array onto an external object and returns a handle to it in \p mipmap. @@ -97522,7 +119431,7 @@ External memory object to be destroyed - + Imports an external semaphore Imports an externally allocated synchronization object and returns a handle to that in \p extSem_out. @@ -97532,7 +119441,7 @@ Semaphore import handle descriptor - + Signals a set of external semaphore objects Enqueues a signal operation on a set of externally allocated @@ -97547,7 +119456,7 @@ Number of semaphores to signal Stream to enqueue the signal operations in - + Waits on a set of external semaphore objects Enqueues a wait operation on a set of externally allocated @@ -97583,7 +119492,7 @@ Returns newly created graph Graph creation flags, must be 0 - + Creates a kernel execution node and adds it to a graph Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies @@ -97598,7 +119507,7 @@ Number of dependencies Parameters for the GPU execution node - + Returns a kernel node's parameters Returns the parameters of kernel node \p hNode in \p nodeParams. @@ -97614,7 +119523,7 @@ Node to get the parameters for Pointer to return the parameters - + Sets a kernel node's parameters Sets the parameters of kernel node \p hNode to \p nodeParams. @@ -97662,7 +119571,7 @@ Node to set the parameters for Parameters to copy - + Creates a memset node and adds it to a graph Creates a new memset node and adds it to \p hGraph with \p numDependencies @@ -97680,7 +119589,7 @@ Parameters for the memory set Context on which to run the node - + Returns a memset node's parameters Returns the parameters of memset node \p hNode in \p nodeParams. @@ -97688,7 +119597,7 @@ Node to get the parameters for Pointer to return the parameters - + Sets a memset node's parameters Sets the parameters of memset node \p hNode to \p nodeParams. @@ -97696,7 +119605,7 @@ Node to set the parameters for Parameters to copy - + Creates a host execution node and adds it to a graph Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies @@ -97712,7 +119621,7 @@ Number of dependencies Parameters for the host node - + Returns a host node's parameters Returns the parameters of host node \p hNode in \p nodeParams. @@ -97720,7 +119629,7 @@ Node to get the parameters for Pointer to return the parameters - + Sets a host node's parameters Sets the parameters of host node \p hNode to \p nodeParams. @@ -97750,6 +119659,7 @@ Gets a handle to the embedded graph in a child graph node. This call does not clone the graph. Changes to the graph will be reflected in the node, and the node retains ownership of the graph. 
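Pulling the kernel-node calls above together, a sketch that builds, instantiates, and launches a one-node graph by hand (assumes a current context; the flags-based cuGraphInstantiate form is the CUDA 12 signature):
\code
#include <cuda.h>
#include <string.h>

static CUresult runOneKernelGraph(CUfunction f, CUstream stream,
                                  void **kernelParams)
{
    CUgraph graph;
    CUresult rc = cuGraphCreate(&graph, 0);
    if (rc != CUDA_SUCCESS)
        return rc;

    CUDA_KERNEL_NODE_PARAMS np;
    memset(&np, 0, sizeof(np));
    np.func = f;
    np.gridDimX = 32;   np.gridDimY = 1; np.gridDimZ = 1;
    np.blockDimX = 128; np.blockDimY = 1; np.blockDimZ = 1;
    np.kernelParams = kernelParams;

    CUgraphNode node;
    rc = cuGraphAddKernelNode(&node, graph, NULL, 0, &np); /* root node */
    if (rc == CUDA_SUCCESS) {
        CUgraphExec exec;
        rc = cuGraphInstantiate(&exec, graph, 0);
        if (rc == CUDA_SUCCESS) {
            rc = cuGraphLaunch(exec, stream);
            cuGraphExecDestroy(exec);
        }
    }
    cuGraphDestroy(graph);
    return rc;
}
\endcode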
+ Allocation and free nodes cannot be added to the returned graph. Attempting to do so will return an error.
Node to get the embedded graph for
Location to store a handle to the graph
@@ -97772,6 +119682,345 @@
Dependencies of the node
Number of dependencies
+
+
+ Creates an event record node and adds it to a graph
+ Creates a new event record node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p params.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+ Each launch of the graph will record \p event to capture execution of the
+ node's dependencies.
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Event for the node
+
+
+
+
+ Returns the event associated with an event record node
+
+ Node to get the event for
+ Pointer to return the event
+
+
+
+
+ Sets an event record node's event
+
+ Node to set the event for
+ Event to use
+
+
+
+
+ Creates an event wait node and adds it to a graph
+ Creates a new event wait node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p params.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+ The graph node will wait for all work captured in \p event. See ::cuEventRecord()
+ for details on what is captured by an event. \p event may be from a different context
+ or device than the launch stream.
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Event for the node
+
+
+
+
+ Returns the event associated with an event wait node
+
+ Node to get the event for
+ Pointer to return the event
+
+
+
+
+ Sets an event wait node's event
+
+ Node to set the event for
+ Event to use
+
+
+
+
+ Creates an external semaphore signal node and adds it to a graph
+ Creates a new external semaphore signal node and adds it to \p hGraph with \p
+ numDependencies dependencies specified via \p dependencies and arguments specified
+ in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ node will be placed at the root of the graph. \p dependencies may not have any
+ duplicate entries. A handle to the new node will be returned in \p phGraphNode.
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Parameters for the node
+
+
+
+
+ Returns an external semaphore signal node's parameters
+ Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ The \p extSemArray and \p paramsArray returned in \p params_out
+ are owned by the node. This memory remains valid until the node is destroyed or its
+ parameters are modified, and should not be modified
+ directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the parameters of this node.
+
+ Node to get the parameters for
+ Pointer to return the parameters
+
+
+
+
+ Sets an external semaphore signal node's parameters
+ Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
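+
+ For illustration only (this example is not part of the upstream documentation): a minimal
+ driver-API sketch that updates the fence value signaled by an existing external-semaphore
+ signal node. The handles \p hNode and \p extSem are assumptions (an existing node and a
+ semaphore imported earlier via ::cuImportExternalSemaphore).
+ \code
+ CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sigParams;
+ memset(&sigParams, 0, sizeof(sigParams));
+ sigParams.params.fence.value = 2;       // new fence value to signal
+
+ CUDA_EXT_SEM_SIGNAL_NODE_PARAMS nodeParams;
+ memset(&nodeParams, 0, sizeof(nodeParams));
+ nodeParams.extSemArray = &extSem;       // assumed: imported semaphore
+ nodeParams.paramsArray = &sigParams;
+ nodeParams.numExtSems  = 1;
+ cuGraphExternalSemaphoresSignalNodeSetParams(hNode, &nodeParams);
+ \endcode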
+
+ Node to set the parameters for
+ Parameters to copy
+
+
+
+
+ Creates an external semaphore wait node and adds it to a graph
+ Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ to the new node will be returned in \p phGraphNode.
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Parameters for the node
+
+
+
+
+ Returns an external semaphore wait node's parameters
+ Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ The \p extSemArray and \p paramsArray returned in \p params_out
+ are owned by the node. This memory remains valid until the node is destroyed or its
+ parameters are modified, and should not be modified
+ directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the
+ parameters of this node.
+
+ Node to get the parameters for
+ Pointer to return the parameters
+
+
+
+
+ Sets an external semaphore wait node's parameters
+ Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+
+ Node to set the parameters for
+ Parameters to copy
+
+
+
+
+
+
+ Creates a batch memory operation node and adds it to a graph
+ Creates a new batch memory operation node and adds it to \p hGraph with \p
+ numDependencies dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries.
+ A handle to the new node will be returned in \p phGraphNode.
+ When the node is added, the paramArray inside \p nodeParams is copied and therefore it can be
+ freed after the call returns.
+ Warning:
+ Improper use of this API may deadlock the application. Synchronization
+ ordering established through this API is not visible to CUDA. CUDA tasks
+ that are (even indirectly) ordered by this API should also have that order
+ expressed with CUDA-visible dependencies such as events. This ensures that
+ the scheduler does not serialize them in an improper order. For more
+ information, see the Stream Memory Operations section in the programming
+ guide (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Parameters for the node
+
+
+
+ Returns a batch mem op node's parameters
+ Returns the parameters of batch mem op node \p hNode in \p nodeParams_out.
+ The \p paramArray returned in \p nodeParams_out is owned by the node.
+ This memory remains valid until the node is destroyed or its
+ parameters are modified, and should not be modified
+ directly. Use ::cuGraphBatchMemOpNodeSetParams to update the
+ parameters of this node.
+
+ Node to get the parameters for
+ Pointer to return the parameters
+
+
+
+ Sets a batch mem op node's parameters
+ Sets the parameters of batch mem op node \p hNode to \p nodeParams.
+ The paramArray inside \p nodeParams is copied and therefore it can be freed after the call returns.
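+
+ For illustration only (not part of the upstream documentation): a minimal sketch that
+ repoints an existing batch mem op node at a new 32-bit wait condition. \p hNode, \p ctx
+ and \p devPtr are assumptions (a valid batch mem op node, its context, and a device
+ address to poll).
+ \code
+ CUstreamBatchMemOpParams op;
+ memset(&op, 0, sizeof(op));
+ op.waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+ op.waitValue.address   = devPtr;   // device address to poll
+ op.waitValue.value     = 1;        // wake once *devPtr >= 1
+ op.waitValue.flags     = CU_STREAM_WAIT_VALUE_GEQ;
+
+ CUDA_BATCH_MEM_OP_NODE_PARAMS nodeParams;
+ memset(&nodeParams, 0, sizeof(nodeParams));
+ nodeParams.ctx        = ctx;
+ nodeParams.count      = 1;
+ nodeParams.paramArray = &op;
+ cuGraphBatchMemOpNodeSetParams(hNode, &nodeParams);
+ \endcode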
+
+ Node to set the parameters for
+ Parameters to copy
+
+
+
+
+ Sets the parameters for a batch mem op node in the given graphExec
+ Sets the parameters of a batch mem op node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+ The following fields on operations may be modified on an executable graph:
+ op.waitValue.address
+ op.waitValue.value[64]
+ op.waitValue.flags bits corresponding to wait type (i.e. the CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified)
+ op.writeValue.address
+ op.writeValue.value[64]
+ Other fields, such as the context, count or type of operations, and other types of operations such as membars, may not be modified.
+ \p hNode must not have been removed from the original graph.
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+ The paramArray inside \p nodeParams is copied and therefore it can be
+ freed after the call returns.
+
+ The executable graph in which to set the specified node
+ Batch mem op node from the graph from which graphExec was instantiated
+ Updated Parameters to set
+
+
+
+
+ Creates an allocation node and adds it to a graph
+ Creates a new allocation node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ to the new node will be returned in \p phGraphNode.
+ When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches.
+ If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
+ the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ owning graph.
+ If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ graph which are ordered after the allocation node, but also by stream operations ordered after the
+ graph's execution but before the allocation is freed.
+ Allocations which are not freed in the same graph can be freed by:
+ - passing the allocation to ::cuMemFreeAsync or ::cuMemFree;
+ - launching a graph with a free node for that allocation; or
+ - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes
+ each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation.
+ It is not possible to free an allocation in both the owning graph and another graph. If the allocation
+ is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed
+ in another graph, a free node can no longer be added to the owning graph.
+ The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ - Nodes and edges of the graph cannot be deleted.
+ - The graph cannot be used in a child node.
+ - Only one instantiation of the graph may exist at any point in time.
+ - The graph cannot be cloned.
+ A sketch of the alloc/free pairing is shown below.
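+
+ For illustration only (not from the upstream documentation): a minimal sketch that
+ allocates and frees device memory inside a single graph. \p hGraph is an assumption
+ (a graph created earlier with ::cuGraphCreate); the device ordinal and size are
+ arbitrary example values.
+ \code
+ CUDA_MEM_ALLOC_NODE_PARAMS allocParams;
+ memset(&allocParams, 0, sizeof(allocParams));
+ allocParams.poolProps.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
+ allocParams.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ allocParams.poolProps.location.id   = 0;        // device ordinal
+ allocParams.bytesize                = 1 << 20;  // 1 MiB
+
+ CUgraphNode allocNode, freeNode;
+ cuGraphAddMemAllocNode(&allocNode, hGraph, NULL, 0, &allocParams);
+ // allocParams.dptr now holds the fixed address of the allocation.
+ cuGraphAddMemFreeNode(&freeNode, hGraph, &allocNode, 1, allocParams.dptr);
+ \endcode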
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Parameters for the node
+
+
+
+
+ Returns a memory alloc node's parameters
+ Returns the parameters of a memory alloc node \p hNode in \p params_out.
+ The \p poolProps and \p accessDescs returned in \p params_out are owned by the
+ node. This memory remains valid until the node is destroyed. The returned
+ parameters must not be modified.
+
+ Node to get the parameters for
+ Pointer to return the parameters
+
+
+
+
+ Creates a memory free node and adds it to a graph
+ Creates a new memory free node and adds it to \p hGraph with \p numDependencies
+ dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ It is possible for \p numDependencies to be 0, in which case the node will be placed
+ at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ to the new node will be returned in \p phGraphNode.
+ ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free:
+ - an allocation twice in the same graph.
+ - an address that was not returned by an allocation node.
+ - an invalid address.
+ The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ - Nodes and edges of the graph cannot be deleted.
+ - The graph cannot be used in a child node.
+ - Only one instantiation of the graph may exist at any point in time.
+ - The graph cannot be cloned.
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Address of memory to free
+
+
+
+
+ Returns a memory free node's parameters
+ Returns the address of a memory free node \p hNode in \p dptr_out.
+
+ Node to get the parameters for
+ Pointer to return the device address
+
+
+
+
+ Frees unused memory that was cached on the specified device for use with graphs back to the OS.
+ Blocks which are not in use by a graph that is either currently executing or scheduled to execute are freed back to the operating system.
+
+ The device for which cached memory should be freed.
+
+
+
+
+ Query asynchronous allocation attributes related to graphs
+ Valid attributes are:
+ - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs
+ - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the last time it was reset. The high watermark can only be reset to zero.
+ - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+ - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+
+ Specifies the scope of the query
+ attribute to get
+ retrieved value
+
+
+
+
+ Set asynchronous allocation attributes related to graphs
+ Valid attributes are:
+ - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the last time it was reset. The high watermark can only be reset to zero.
+ - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+
+ Specifies the scope of the query
+ attribute to set
+ pointer to value to set
+
+
Clones a graph
@@ -97848,6 +120097,28 @@
See description
+
+
+ Returns a graph's dependency edges (12.3+)
+ Returns a list of \p hGraph's dependency edges.
Edges are returned via corresponding
+ indices in \p from, \p to and \p edgeData; that is, the node in \p to[i] has a
+ dependency on the node in \p from[i] with data \p edgeData[i]. \p from and \p to may
+ both be NULL, in which case this function only returns the number of edges in
+ \p numEdges. Otherwise, \p numEdges entries will be filled in. If \p numEdges is higher
+ than the actual number of edges, the remaining entries in \p from and \p to will be
+ set to NULL, and the number of edges actually returned will be written to \p numEdges.
+ \p edgeData may alone be NULL, in which case the edges must all have default (zeroed)
+ edge data. Attempting a lossy query via NULL \p edgeData will result in
+ ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL then \p from and \p to must be
+ as well.
+
+ Graph to get the edges from
+ Location to return edge endpoints
+ Location to return edge endpoints
+ Optional location to return edge data
+ See description
+
+
Returns a node's dependencies
@@ -97861,6 +120132,23 @@
Pointer to return the dependencies
See description
+
+
+ Returns a node's dependencies (12.3+)
+ Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this
+ function will return the number of dependencies in \p numDependencies. Otherwise,
+ \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ number of nodes actually obtained will be returned in \p numDependencies.
+ Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ this API will return ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL, then
+ \p dependencies must be as well.
+
+ Node to query
+ Pointer to return the dependencies
+ Optional array to return edge data for each dependency
+ See description
+
+
Returns a node's dependent nodes
@@ -97875,6 +120163,24 @@
Pointer to return the dependent nodes
See description
+
+
+ Returns a node's dependent nodes (12.3+)
+ Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which
+ case this function will return the number of dependent nodes in \p numDependentNodes.
+ Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ higher than the actual number of dependent nodes, the remaining entries in
+ \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ be returned in \p numDependentNodes.
+ Note that if an edge has non-zero (non-default) edge data and \p edgeData is NULL,
+ this API will return ::CUDA_ERROR_LOSSY_QUERY. If \p edgeData is non-NULL, then
+ \p dependentNodes must be as well.
+
+ Node to query
+ Pointer to return the dependent nodes
+ Optional pointer to return edge data for dependent nodes
+ See description
+
+
Adds dependency edges to a graph
@@ -97889,6 +120195,21 @@
Array of dependent nodes
Number of dependencies to be added
+
+
+ Adds dependency edges to a graph (12.3+)
+ The number of dependencies to be added is defined by \p numDependencies.
+ Elements in \p from and \p to at corresponding indices define a dependency.
+ Each node in \p from and \p to must belong to \p hGraph.
+ If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ Specifying an existing dependency will return an error.
+
+ Graph to which dependencies are added
+ Array of nodes that provide the dependencies
+ Array of dependent nodes
+ Optional array of edge data.
If NULL, default (zeroed) edge data is assumed.
+ Number of dependencies to be added
+
+
Removes dependency edges from a graph
@@ -97896,38 +120217,349 @@
Elements in \p from and \p to at corresponding indices define a dependency.
Each node in \p from and \p to must belong to \p hGraph.
If \p numDependencies is 0, elements in \p from and \p to will be ignored.
- Specifying a non-existing dependency will return an error.
+ Specifying a non-existing dependency will return an error.
+ Dependencies cannot be removed from graphs which contain allocation or free nodes. Any attempt to do so will return an error.
+
+ Graph from which to remove dependencies
+ Array of nodes that provide the dependencies
+ Array of dependent nodes
+ Number of dependencies to be removed
+
+
+
+ Removes dependency edges from a graph (12.3+)
+ The number of \p dependencies to be removed is defined by \p numDependencies.
+ Elements in \p from and \p to at corresponding indices define a dependency.
+ Each node in \p from and \p to must belong to \p hGraph.
+ If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ Specifying an edge that does not exist in the graph, with data matching
+ \p edgeData, results in an error. \p edgeData is nullable, which is equivalent
+ to passing default (zeroed) data for each edge.
+ Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ Any attempt to do so will return an error.
+
+ Graph from which to remove dependencies
+ Array of nodes that provide the dependencies
+ Array of dependent nodes
+ Optional array of edge data. If NULL, edge data is assumed to be default (zeroed).
+ Number of dependencies to be removed
+
+
+
Remove a node from the graph
- Removes \p hNode from its graph. This operation also severs any dependencies of other nodes on \p hNode and vice versa.
+ Removes \p hNode from its graph. This operation also severs any dependencies of other nodes on \p hNode and vice versa.
+ Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. Any attempt to do so will return an error.
Node to remove
-
+
Creates an executable graph from a graph
Instantiates \p hGraph as an executable graph. The graph is validated for
any structural constraints or intra-node constraints which were not previously
validated. If instantiation is successful, a handle to the instantiated graph
- is returned in \p graphExec.
- If there are any errors, diagnostic information may be returned in \p errorNode and
- \p logBuffer.This is the primary way to inspect instantiation errors.The output
- will be null terminated unless the diagnostics overflow
- the buffer. In this case, they will be truncated, and the last byte can be
- inspected to determine if truncation occurred.
+ is returned in \p phGraphExec.
+ The \p flags parameter controls the behavior of instantiation and subsequent graph launches. Valid flags are:
+ - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a graph containing memory allocation nodes to automatically free any
+ unfreed memory allocations before the graph is relaunched.
+ If \p hGraph contains any allocation or free nodes, there can be at most one
+ executable graph in existence for that graph at a time.
+ An attempt to instantiate a second executable graph before destroying the first
+ with ::cuGraphExecDestroy will result in an error.
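+
+ For illustration only (not part of the upstream documentation): a minimal sketch of
+ instantiating and launching a graph through the flags-based entry point. \p hGraph and
+ \p hStream are assumptions (a previously built graph and a valid stream).
+ \code
+ CUgraphExec hGraphExec;
+ cuGraphInstantiateWithFlags(&hGraphExec, hGraph,
+                             CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH);
+ cuGraphLaunch(hGraphExec, hStream);   // first launch
+ cuStreamSynchronize(hStream);
+ cuGraphExecDestroy(hGraphExec);
+ \endcode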
Returns instantiated graph Graph to instantiate - In case of an instantiation error, this may be modified to indicate a node contributing to the error - A character buffer to store diagnostic messages - Size of the log buffer in bytes + Flags to control instantiation. See ::CUgraphInstantiate_flags. + + + + + Creates an executable graph from a graph + Instantiates \p hGraph as an executable graph according to the \p instantiateParams structure. + The graph is validated for any structural constraints or intra-node constraints + which were not previously validated.If instantiation is successful, a handle to + the instantiated graph is returned in \p phGraphExec. + \p instantiateParams controls the behavior of instantiation and subsequent + graph launches, as well as returning more detailed information in the event of an error. + ::CUDA_GRAPH_INSTANTIATE_PARAMS is defined as: + \code + typedef struct { + cuuint64_t flags; + CUstream hUploadStream; + CUgraphNode hErrNode_out; + CUgraphInstantiateResult result_out; + } CUDA_GRAPH_INSTANTIATE_PARAMS; + \endcode + The \p flags field controls the behavior of instantiation and subsequent + graph launches.Valid flags are: + - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a + graph containing memory allocation nodes to automatically free any + unfreed memory allocations before the graph is relaunched. + - ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD, which will perform an upload of the graph + into \p hUploadStream once the graph has been instantiated. + - ::CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH, which configures the graph for launch + from the device. If this flag is passed, the executable graph handle returned can be + used to launch the graph from both the host and device. This flag can only be used + on platforms which support unified addressing. This flag cannot be used in + conjunction with::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH. + - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph + to use the priorities from the per-node attributes rather than the priority + of the launch stream during execution. Note that priorities are only available + on kernel nodes, and are copied from stream priority during stream capture. + If \p hGraph contains any allocation or free nodes, there can be at most one + executable graph in existence for that graph at a time.An attempt to instantiate a + second executable graph before destroying the first with::cuGraphExecDestroy will + result in an error. + If \p hGraph contains kernels which call device-side cudaGraphLaunch() from multiple + contexts, this will result in an error. + + Graphs instantiated for launch on the device have additional restrictions which do not + apply to host graphs: + - The graph's nodes must reside on a single context. + - The graph can only contain kernel nodes, memcpy nodes, memset nodes, and child graph nodes. + Operation-specific restrictions are outlined below. + - Kernel nodes: + - Use of CUDA Dynamic Parallelism is not permitted. + - Cooperative launches are permitted as long as MPS is not in use. + - Memcpy nodes: + - Only copies involving device memory and/or pinned device-mapped host memory are permitted. + - Copies involving CUDA arrays are not permitted. + - Both operands must be accessible from the current context, and the current context must + match the context of other nodes in the graph. 
+ In the event of an error, the \p result_out and \p hErrNode_out fields will contain more + information about the nature of the error.Possible error reporting includes: + - ::CUDA_GRAPH_INSTANTIATE_ERROR, if passed an invalid value or if an unexpected error occurred + which is described by the return value of the function. \p hErrNode_out will be set to NULL. + - ::CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE, if the graph structure is invalid. \p hErrNode_out + will be set to one of the offending nodes. + - ::CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED, if the graph is instantiated for device + launch but contains a node of an unsupported node type, or a node which performs unsupported + operations, such as use of CUDA dynamic parallelism within a kernel node. \p hErrNode_out will + be set to this node. + - ::CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED, if the graph is instantiated for device + launch but a node's context differs from that of another node. This error can also be returned + if a graph is not instantiated for device launch and it contains kernels which call device-side + cudaGraphLaunch() from multiple contexts. \p hErrNode_out will be set to this node. + + If instantiation is successful, \p result_out will be set to ::CUDA_GRAPH_INSTANTIATE_SUCCESS, + and \p hErrNode_out will be set to NULL. + + Returns instantiated graph + Graph to instantiate + Instantiation parameters + + + + + Query the instantiation flags of an executable graph + Returns the flags that were passed to instantiation for the given executable graph. + ::CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD will not be returned by this API as it does + not affect the resulting executable graph. + + The executable graph to query + Returns the instantiation flags + + + + + Sets the parameters for a kernel node in the given graphExec + Sets the parameters of a kernel node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated. + \p hNode must not have been removed from the original graph.The \p func field + of \p nodeParams cannot be modified and must match the original value. + All other values can be modified. + The modifications only affect future launches of \p hGraphExec. Already + enqueued or running launches of \p hGraphExec are not affected by this call. + \p hNode is also not modified by this call. + + The executable graph in which to set the specified node + kernel node from the graph from which graphExec was instantiated + Updated Parameters to set + + + + Sets the parameters for a memcpy node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + contained \p copyParams at instantiation. hNode must remain in the graph which was + used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + The source and destination memory in \p copyParams must be allocated from the same + contexts as the original source and destination memory. Both the instantiation-time + memory operands and the memory operands in \p copyParams must be 1-dimensional. + Zero-length operations are not supported. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. hNode is also + not modified by this call. 
+ Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or + either the original or new memory operands are multidimensional. + + The executable graph in which to set the specified node + Memcpy node from the graph which was used to instantiate graphExec + The updated parameters to set + Context on which to run the node + + + + Sets the parameters for a memset node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + contained \p memsetParams at instantiation. hNode must remain in the graph which was + used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + The destination memory in \p memsetParams must be allocated from the same + contexts as the original destination memory. Both the instantiation-time + memory operand and the memory operand in \p memsetParams must be 1-dimensional. + Zero-length operations are not supported. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. hNode is also + not modified by this call. + Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or + either the original or new memory operand are multidimensional. + + The executable graph in which to set the specified node + Memset node from the graph which was used to instantiate graphExec + The updated parameters to set + Context on which to run the node + + + + Sets the parameters for a host node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + contained \p nodeParams at instantiation. hNode must remain in the graph which was + used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. hNode is also + not modified by this call. + + The executable graph in which to set the specified node + Host node from the graph which was used to instantiate graphExec + The updated parameters to set + + + + Updates node parameters in the child graph node in the given graphExec. + Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained + in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. + \p hNode must remain in the graph which was used to instantiate \p hGraphExec. + Changed edges to and from \p hNode are ignored. + The modifications only affect future launches of \p hGraphExec. Already enqueued + or running launches of \p hGraphExec are not affected by this call. \p hNode is also + not modified by this call. + The topology of \p childGraph, as well as the node insertion order, must match that + of the graph contained in \p hNode. See::cuGraphExecUpdate() for a list of restrictions + on what can be updated in an instantiated graph.The update is recursive, so child graph + nodes contained within the top level child graph will also be updated. + + The executable graph in which to set the specified node + Host node from the graph which was used to instantiate graphExec + The graph supplying the updated parameters + + + + + Sets the event for an event record node in the given graphExec + Sets the event of an event record node in an executable graph \p hGraphExec. + The node is identified by the corresponding node \p hNode in the + non-executable graph, from which the executable graph was instantiated. 
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ The executable graph in which to set the specified node
+ event record node from the graph from which graphExec was instantiated
+ Updated event to use
+
+
+
+
+ Sets the event for an event wait node in the given graphExec
+ Sets the event of an event wait node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+
+ The executable graph in which to set the specified node
+ event wait node from the graph from which graphExec was instantiated
+ Updated event to use
+
+
+
+
+ Sets the parameters for an external semaphore signal node in the given graphExec
+ Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+ hNode must not have been removed from the original graph.
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ hNode is also not modified by this call.
+ Changing \p nodeParams->numExtSems is not supported.
+
+ The executable graph in which to set the specified node
+ semaphore signal node from the graph from which graphExec was instantiated
+ Updated Parameters to set
+
+
+
+
+ Sets the parameters for an external semaphore wait node in the given graphExec
+ Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ The node is identified by the corresponding node \p hNode in the
+ non-executable graph, from which the executable graph was instantiated.
+ hNode must not have been removed from the original graph.
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ hNode is also not modified by this call.
+ Changing \p nodeParams->numExtSems is not supported.
+
+ The executable graph in which to set the specified node
+ semaphore wait node from the graph from which graphExec was instantiated
+ Updated Parameters to set
+
+
+
+
+ Enables or disables the specified node in the given graphExec
+ Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent
+ to empty nodes until they are reenabled. Existing node parameters are not affected by
+ disabling/enabling the node.
+ The node is identified by the corresponding node \p hNode in the non-executable
+ graph, from which the executable graph was instantiated.
+ \p hNode must not have been removed from the original graph.
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ \p hNode is also not modified by this call.
+ \note Currently only kernel, memset and memcpy nodes are supported.
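+
+ For illustration only (not part of the upstream documentation): a minimal sketch that
+ skips a kernel node for one launch and then restores it. \p hGraphExec, \p hKernelNode
+ and \p hStream are assumptions (a valid executable graph, one of its kernel nodes, and
+ a stream).
+ \code
+ cuGraphNodeSetEnabled(hGraphExec, hKernelNode, 0);  // behaves like an empty node
+ cuGraphLaunch(hGraphExec, hStream);                 // launch without the kernel
+ cuGraphNodeSetEnabled(hGraphExec, hKernelNode, 1);  // restore for later launches
+ \endcode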
+
+ The executable graph in which to set the specified node
+ Node from the graph from which graphExec was instantiated
+ Node is enabled if != 0, otherwise the node is disabled
+
+
+
+
+ Query whether a node in the given graphExec is enabled
+ Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+ The node is identified by the corresponding node \p hNode in the non-executable
+ graph, from which the executable graph was instantiated.
+ \p hNode must not have been removed from the original graph.
+ \note Currently only kernel, memset and memcpy nodes are supported.
+
+ The executable graph in which to set the specified node
+ Node from the graph from which graphExec was instantiated
+ Location to return the enabled status of the node
+
+
+
+
+ Uploads an executable graph in a stream
+ Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
+ the same \p hGraphExec will be serialized. Each upload is ordered behind both any
+ previous work in \p hStream and any previous launches of \p hGraphExec.
+
+ Executable graph to upload
+ Stream in which to upload the graph
+
@@ -97957,13 +120589,1194 @@
Graph to destroy
-
+
+
+ Check whether an executable graph can be updated with a graph and perform the update if possible
+ Updates the node parameters in the instantiated graph specified by \p hGraphExec with the node parameters in a topologically identical graph specified by \p hGraph.
+ Limitations:
+ - Kernel nodes:
+ - The owning context of the function cannot change.
+ - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ to a function which uses CDP.
+ - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the
+ priority attribute cannot change. Equality is checked on the originally requested
+ priority values, before they are clamped to the device's supported range.
+ - If \p hGraphExec was not instantiated for device launch, a node whose function originally did not use device-side cudaGraphLaunch() cannot be updated to a function which uses
+ device-side cudaGraphLaunch() unless the node resides on the same context as nodes which contained such calls at instantiate-time. If no such calls were present at instantiation,
+ these updates cannot be performed at all.
+ - Memset and memcpy nodes:
+ - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ - The source/destination memory must be allocated from the same contexts as the original source/destination memory.
+ - Only 1D memsets can be changed.
+ - Additional memcpy node restrictions:
+ - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ - External semaphore wait nodes and record nodes:
+ - Changing the number of semaphores is not supported.
+ Note: The API may add further restrictions in future releases. The return code should always be checked.
+ cuGraphExecUpdate sets the result member of \p resultInfo to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under the following conditions:
+ - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case resultInfo->errorNode
+ is set to NULL.
+ - \p hGraph has more exit nodes than \p hGraphExec, in which case resultInfo->errorNode is set to one of the exit nodes in hGraph.
+ - A node in \p hGraph has a different number of dependencies than the node from \p hGraphExec it is paired with,
+ in which case resultInfo->errorNode is set to the node from \p hGraph.
+ - A node in \p hGraph has a dependency that does not match with the corresponding dependency of the paired node
+ from \p hGraphExec. resultInfo->errorNode will be set to the node from \p hGraph. resultInfo->errorFromNode
+ will be set to the mismatched dependency. The dependencies are paired based on edge order and a dependency
+ does not match when the nodes are already paired based on other edges examined in the graph.
+ cuGraphExecUpdate sets the result member of \p resultInfo to:
+ - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
+ - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
+ resultInfo->errorNode is set to the node from \p hGraph.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
+ way (see note above), in which case resultInfo->errorNode is set to the node from \p hGraph
+ - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way that is not supported, in which case resultInfo->errorNode is set to the node from \p hGraph.
+ - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like the node's type or configuration, in which case resultInfo->errorNode is set to the node from \p hGraph
+ If the update fails for a reason not listed above, the result member of \p resultInfo will be set
+ to CU_GRAPH_EXEC_UPDATE_ERROR. If the update succeeds, the result member will be set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
+ cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns
+ CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included changes which violated constraints specific to instantiated graph update.
+
+ The instantiated graph to be updated
+ The graph containing the updated parameters
+ the error info structure
+
+
+
+
+ Copies attributes from source node to destination node.
+ Copies attributes from source node \p src to destination node \p dst. Both nodes must have the same context.
+ Destination node
+ Source node
+
+
+
+ Queries node attribute.
+ Queries attribute \p attr from node \p hNode and stores it in corresponding member of \p value_out.
+
+
+
+
+
+
+
+
+ Sets node attribute.
+ Sets attribute \p attr on node \p hNode from corresponding attribute of value.
+
+
+
+
+
+
+
+
+ Write a DOT file describing graph structure
+ Using the provided \p hGraph, write to \p path a DOT formatted description of the graph.
+ By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ \p flags can be specified to write more detailed information about each node type such as
+ parameter values, kernel attributes, node and function handles.
+
+ The graph to create a DOT file from
+ The path to write the DOT file to
+ Flags from CUgraphDebugDot_flags for specifying which additional node information to write
+
+
+
+
+ Create a user object
+ Create a user object with the specified destructor callback and initial reference count.
The initial references are owned by the caller. + Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they + are executed by a shared internal thread.Another thread may be signaled to perform such + actions, if it does not block forward progress of tasks scheduled through CUDA. + See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + + Location to return the user object handle + The pointer to pass to the destroy function + Callback to free the user object when it is no longer in use + The initial refcount to create the object with, typically 1. The initial references are owned by the calling thread. + Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, which is the only defined flag. This indicates that the destroy + callback cannot be waited on by any CUDA API.Users requiring synchronization of the callback should signal its completion manually. + + + + + Retain a reference to a user object + Retains new references to a user object. The new references are owned by the caller. + See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + + The object to retain + The number of references to retain, typically 1. Must be nonzero and not larger than INT_MAX. + + + + + Release a reference to a user object + Releases user object references owned by the caller. The object's destructor is invoked if the reference count reaches zero. + It is undefined behavior to release references not owned by the caller, or to use a user object handle after all references are released. + See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + + The object to release + The number of references to release, typically 1. Must be nonzero and not larger than INT_MAX. + + + + + Retain a reference to a user object from a graph + Creates or moves user object references that will be owned by a CUDA graph. + See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + + The graph to associate the reference with + The user object to retain a reference for + The number of references to add to the graph, typically 1. Must be nonzero and not larger than INT_MAX. + The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references from the calling thread, rather than create new references.Pass None to create new references. + + + + + Release a user object reference from a graph + Releases user object references owned by a graph. + See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + + The graph that will release the reference + The user object to release a reference for + The number of references to release, typically 1. Must be nonzero and not larger than INT_MAX. + + + + + Adds a node of arbitrary type to a graph + + Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies + dependencies specified via \p dependencies. \p numDependencies may be 0. + \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have + any duplicate entries. + + \p nodeParams is a tagged union. The node type should be specified in the \p type field, + and type-specific parameters in the corresponding union member. All unused bytes - that + is, \p reserved0 and all bytes past the utilized union member - must be set to zero. + It is recommended to use brace initialization or memset to ensure all bytes are + initialized. 
+
+ Note that for some node types, \p nodeParams may contain "out parameters" which are
+ modified during the call, such as \p nodeParams->alloc.dptr.
+
+ A handle to the new node will be returned in \p phGraphNode.
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Number of dependencies
+ Specification of the node
+
+
+
+
+ Adds a node of arbitrary type to a graph (12.3+)
+ Creates a new node in \p hGraph described by \p nodeParams with \p numDependencies
+ dependencies specified via \p dependencies. \p numDependencies may be 0.
+ \p dependencies may be null if \p numDependencies is 0. \p dependencies may not have
+ any duplicate entries.
+ \p nodeParams is a tagged union. The node type should be specified in the \p type field,
+ and type-specific parameters in the corresponding union member. All unused bytes - that
+ is, \p reserved0 and all bytes past the utilized union member - must be set to zero.
+ It is recommended to use brace initialization or memset to ensure all bytes are
+ initialized.
+ Note that for some node types, \p nodeParams may contain "out parameters" which are
+ modified during the call, such as \p nodeParams->alloc.dptr.
+ A handle to the new node will be returned in \p phGraphNode.
+
+ Returns newly created node
+ Graph to which to add the node
+ Dependencies of the node
+ Optional edge data for the dependencies. If NULL, the data is assumed to be default (zeroed) for all dependencies.
+ Number of dependencies
+ Specification of the node
+
+
+
+ Updates a graph node's parameters
+ Sets the parameters of graph node \p hNode to \p nodeParams. The node type specified by
+ \p nodeParams->type must match the type of \p hNode. \p nodeParams must be fully
+ initialized and all unused bytes (reserved, padding) zeroed.
+ Modifying parameters is not supported for node types CU_GRAPH_NODE_TYPE_MEM_ALLOC and
+ CU_GRAPH_NODE_TYPE_MEM_FREE.
+
+ Node to set the parameters for
+ Parameters to copy
+
+
+
+
+ Updates a graph node's parameters in an instantiated graph
+ Sets the parameters of a node in an executable graph \p hGraphExec. The node is identified
+ by the corresponding node \p hNode in the non-executable graph from which the executable
+ graph was instantiated. \p hNode must not have been removed from the original graph.
+
+ The modifications only affect future launches of \p hGraphExec. Already
+ enqueued or running launches of \p hGraphExec are not affected by this call.
+ hNode is also not modified by this call.
+
+ Allowed changes to parameters on executable graphs are as follows:
+ Node type | Allowed changes
+ kernel | See ::cuGraphExecKernelNodeSetParams
+ memcpy | Addresses for 1-dimensional copies if allocated in same context; see ::cuGraphExecMemcpyNodeSetParams
+ memset | Addresses for 1-dimensional memsets if allocated in same context; see ::cuGraphExecMemsetNodeSetParams
+ host | Unrestricted
+ child graph | Topology must match and restrictions apply recursively; see ::cuGraphExecUpdate
+ event wait | Unrestricted
+ event record | Unrestricted
+ external semaphore signal | Number of semaphore operations cannot change
+ external semaphore wait | Number of semaphore operations cannot change
+ memory allocation | API unsupported
+ memory free | API unsupported
+ batch memops | Addresses, values, and operation type for wait operations; see ::cuGraphExecBatchMemOpNodeSetParams
+
+ The executable graph in which to update the specified node
+ Corresponding node from the graph from which graphExec was instantiated
+ Updated Parameters to set
+
+
+
+
+ Create a conditional handle
+ Creates a conditional handle associated with \p hGraph.
+ The conditional handle must be associated with a conditional node in this graph or one of its children.
+ Handles not associated with a conditional node may cause graph instantiation to fail.
+ Handles can only be set from the context with which they are associated.
+
+ Pointer used to return the handle to the caller.
+ Graph which will contain the conditional node using this handle.
+ Context for the handle and associated conditional node.
+ Optional initial value for the conditional variable.
+ Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
+
+
+
+
+
+
+
+
+ Blocks until remote writes are visible to the specified scope
+ Blocks until GPUDirect RDMA writes to the target context via mappings
+ created through APIs like nvidia_p2p_get_pages (see
+ https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ visible to the specified scope.
+
+ If the scope equals or lies within the scope indicated by
+ ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call
+ will be a no-op and can be safely omitted for performance. This can be
+ determined by comparing the numerical values between the two enums, with
+ smaller scopes having smaller values.
+ Users may query support for this API via ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
+
+ The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget
+ The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope
+
+
+
+ This section describes the tensor map object management functions of the
+ low-level CUDA driver application programming interface. The tensor
+ map API is only supported on devices of compute capability 9.0 or higher.
+
+
+
+
+ Create a tensor map descriptor object representing tiled memory region
+ Creates a descriptor for Tensor Memory Access (TMA) object specified by the parameters describing a tiled region and returns it in \p tensorMap.
+ Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be accessed through CUDA API calls.
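+
+ For illustration only (not part of the upstream documentation): a minimal sketch that
+ encodes a 2D row-major float tensor with a 64x64 tile. \p d_tensor, \p width and
+ \p height are assumptions (a valid device allocation and its extents, with the row
+ pitch assumed to meet the documented alignment requirements).
+ \code
+ CUtensorMap tensorMap;
+ cuuint64_t globalDim[2]     = { width, height };
+ cuuint64_t globalStrides[1] = { width * sizeof(float) };  // row pitch in bytes
+ cuuint32_t boxDim[2]        = { 64, 64 };                 // shared-memory tile
+ cuuint32_t elemStrides[2]   = { 1, 1 };
+ cuTensorMapEncodeTiled(&tensorMap, CU_TENSOR_MAP_DATA_TYPE_FLOAT32,
+                        2, (void*)(uintptr_t)d_tensor, globalDim, globalStrides,
+                        boxDim, elemStrides,
+                        CU_TENSOR_MAP_INTERLEAVE_NONE,
+                        CU_TENSOR_MAP_SWIZZLE_NONE,
+                        CU_TENSOR_MAP_L2_PROMOTION_NONE,
+                        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+ \endcode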
+
+ Tensor map object to create
+ Tensor data type
+ Dimensionality of tensor
+ Starting address of memory region described by tensor
+ Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ Array containing traversal box size (number of elements) along each of the \p tensorRank dimensions. Specifies how many elements are to be traversed along each tensor dimension.
+ Array containing traversal stride in each of the \p tensorRank dimensions
+ Type of interleaved layout the tensor addresses
+ Bank swizzling pattern inside shared memory
+ L2 promotion size
+ Indicate whether zero or special NaN constant must be used to fill out-of-bound elements
+
+
+
+
+ Create a tensor map descriptor object representing im2col memory region
+ Creates a descriptor for Tensor Memory Access (TMA) object specified
+ by the parameters describing an im2col memory layout and returns it in \p tensorMap.
+ Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be
+ accessed through CUDA API calls.
+
+ Tensor map object to create
+ Tensor data type
+ Dimensionality of tensor; must be at least 3
+ Starting address of memory region described by tensor
+ Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ Array containing DHW dimensions of lower box corner
+ Array containing DHW dimensions of upper box corner
+ Number of channels per pixel
+ Number of pixels per column
+ Array containing traversal stride in each of the \p tensorRank dimensions
+ Type of interleaved layout the tensor addresses
+ Bank swizzling pattern inside shared memory
+ L2 promotion size
+ Indicate whether zero or special NaN constant must be used to fill out-of-bound elements
+
+
+
+
+ Create a tensor map descriptor object representing im2col memory region, but where
+ the elements are exclusively loaded along the W dimension.
+ Creates a descriptor for Tensor Memory Access (TMA) object specified by the parameters
+ describing an im2col memory layout and where the row is always loaded along the W dimension
+ and returns it in \p tensorMap. This assumes the tensor layout in memory is either NDHWC,
+ NHWC, or NWC.
+ This API is only supported on devices of compute capability 10.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be
+ accessed through CUDA APIs and PTX.
+ Note that ::CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA can only be used when \p tensorDataType represents a floating-point data type,
+ and when \p tensorDataType is not ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B, ::CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B, and ::CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B.
+
+ Tensor map object to create
+ Tensor data type
+ Dimensionality of tensor; must be at least 3
+ Starting address of memory region described by tensor
+ Array containing tensor size (number of elements) along each of the \p tensorRank dimensions
+ Array containing stride size (in bytes) along each of the \p tensorRank - 1 dimensions
+ Width offset of left box corner
+ Width offset of right box corner
+ Number of channels per pixel
+ Number of pixels per column
+ Array containing traversal stride in each of the \p tensorRank dimensions
+ Type of interleaved layout the tensor addresses
+ W or W128 mode
+ Bank swizzling pattern inside shared memory
+ L2 promotion size
+ Indicate whether zero or special NaN constant will be used to fill out-of-bound elements
+
+
+
+
+ Modify an existing tensor map descriptor with an updated global address
+ Modifies the descriptor for Tensor Memory Access (TMA) object passed in \p tensorMap with an updated \p globalAddress.
+ Tensor map objects are only supported on devices of compute capability 9.0 or higher.
+ Additionally, a tensor map object is an opaque value, and, as such, should only be
+ accessed through CUDA API calls.
+
+ Tensor map object to modify
+ Starting address of memory region described by tensor, must follow previous alignment requirements
+
+
+
+
+ This section describes the CUDA multicast object operations exposed by the low-level CUDA driver application programming interface.
+
+
+
+
+ Create a generic allocation handle representing a multicast object described by the given properties.
+ This creates a multicast object as described by \p prop. The number of
+ participating devices is specified by ::CUmulticastObjectProp::numDevices.
+ Devices can be added to the multicast object via ::cuMulticastAddDevice.
+ All participating devices must be added to the multicast object before memory
+ can be bound to it. Memory is bound to the multicast object via either
+ ::cuMulticastBindMem or ::cuMulticastBindAddr, and can be unbound via
+ ::cuMulticastUnbind. The total amount of memory that can be bound per device
+ is specified by ::CUmulticastObjectProp::size. This size must be a multiple of
+ the value returned by ::cuMulticastGetGranularity with the flag
+ ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance however, the size
+ should be aligned to the value returned by ::cuMulticastGetGranularity with
+ the flag ::CU_MULTICAST_GRANULARITY_RECOMMENDED.
+
+ After all participating devices have been added, multicast objects can also
+ be mapped to a device's virtual address space using the virtual memory
+ management APIs (see ::cuMemMap and ::cuMemSetAccess). Multicast objects can
+ also be shared with other processes by requesting a shareable handle via
+ ::cuMemExportToShareableHandle. Note that the desired types of shareable
+ handles must be specified in the bitmask ::CUmulticastObjectProp::handleTypes.
+ Multicast objects can be released using the virtual memory management API
+ ::cuMemRelease.
+
+ Value of handle returned.
+ Properties of the multicast object to create.
+
+
+
+
+ Associate a device to a multicast object.
+ Associates a device to a multicast object. The added device will be a part of
+ the multicast team of size specified by CUmulticastObjectProp::numDevices
+ during ::cuMulticastCreate.
+ The association of the device to the multicast object is permanent during
+ the lifetime of the multicast object.
+ All devices must be added to the multicast team before any memory can be
+ bound to any device in the team.
Any calls to ::cuMulticastBindMem or + ::cuMulticastBindAddr will block until all devices have been added. + Similarly, all devices must be added to the multicast team before a virtual + address range can be mapped to the multicast object. A call to ::cuMemMap + will block until all devices have been added. + + Handle representing a multicast object. + Device that will be associated with the multicast object. + + + + + Bind a memory allocation represented by a handle to a multicast object. + Binds a memory allocation specified by \p memHandle and created via + ::cuMemCreate to a multicast object represented by \p mcHandle and created + via ::cuMulticastCreate. The intended \p size of the bind, the offset in the + multicast range \p mcOffset as well as the offset in the memory \p memOffset + must be a multiple of the value returned by ::cuMulticastGetGranularity with + the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. For best performance, however, + \p size, \p mcOffset and \p memOffset should be aligned to the granularity of + the memory allocation (see ::cuMemGetAllocationGranularity) or to the value + returned by ::cuMulticastGetGranularity with the flag + ::CU_MULTICAST_GRANULARITY_RECOMMENDED. + The \p size + \p memOffset must be smaller than the size of the allocated + memory. Similarly, the \p size + \p mcOffset must be smaller than the size + of the multicast object. + The memory allocation must have been created on one of the devices + that was added to the multicast team via ::cuMulticastAddDevice. + Externally shareable as well as imported multicast objects can be bound only + to externally shareable memory. + Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are + insufficient resources required to perform the bind. This call may also + return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not + initialized or running. + + Handle representing a multicast object. + Offset into the multicast object for attachment. + Handle representing a memory allocation. + Offset into the memory for attachment. + Size of the memory that will be bound to the multicast object. + Flags for future use, must be zero for now. + + + + + Bind a memory allocation represented by a virtual address to a multicast object. + Binds a memory allocation specified by its mapped address \p memptr to a + multicast object represented by \p mcHandle. + The memory must have been allocated via ::cuMemCreate or ::cudaMallocAsync. + The intended \p size of the bind, the offset in the multicast range + \p mcOffset and \p memptr must be a multiple of the value returned by + ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. + For best performance, however, \p size, \p mcOffset and \p memptr should be + aligned to the value returned by ::cuMulticastGetGranularity with the flag + ::CU_MULTICAST_GRANULARITY_RECOMMENDED. + The \p size must be smaller than the size of the allocated memory. + Similarly, the \p size + \p mcOffset must be smaller than the total size + of the multicast object. + The memory allocation must have been created on one of the devices + that was added to the multicast team via ::cuMulticastAddDevice. + Externally shareable as well as imported multicast objects can be bound only + to externally shareable memory. + Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if there are + insufficient resources required to perform the bind. This call may also + return CUDA_ERROR_SYSTEM_NOT_READY if the necessary system software is not + initialized or running.
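The multicast flow above (create the object, add every device, then bind) is easiest to see end to end. A minimal C sketch against the driver API, with two hypothetical devices dev0/dev1 and a memHandle previously created with cuMemCreate on one of them; error checking is elided:

    #include <cuda.h>
    /* Assumes: CUdevice dev0, dev1; CUmemGenericAllocationHandle memHandle from cuMemCreate. */
    CUmulticastObjectProp prop = { 0 };
    prop.numDevices  = 2;
    prop.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    size_t gran = 0;
    cuMulticastGetGranularity(&gran, &prop, CU_MULTICAST_GRANULARITY_RECOMMENDED);
    prop.size = gran;                         /* bound size must be granularity-aligned */
    CUmemGenericAllocationHandle mcHandle;
    cuMulticastCreate(&mcHandle, &prop);
    cuMulticastAddDevice(mcHandle, dev0);     /* every device must be added...          */
    cuMulticastAddDevice(mcHandle, dev1);     /* ...before any memory is bound          */
    cuMulticastBindMem(mcHandle, 0 /* mcOffset */, memHandle, 0 /* memOffset */,
                       prop.size, 0 /* flags */);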
+ + Handle representing a multicast object. + Offset into multicast VA range for attachment. + Virtual address of the memory allocation. + Size of memory that will be bound to the multicast object. + Flags for future use, must be zero for now. + + + + + Unbind any memory allocations bound to a multicast object at a given offset and up to a given size. + Unbinds any memory allocations hosted on \p dev and bound to a multicast + object at \p mcOffset and up to a given \p size. + The intended \p size of the unbind and the offset in the multicast range + (\p mcOffset) must be a multiple of the value returned by + ::cuMulticastGetGranularity with the flag ::CU_MULTICAST_GRANULARITY_MINIMUM. + The \p size + \p mcOffset must be smaller than the total size of the multicast object. + Warning: + The \p mcOffset and the \p size must match the corresponding values specified + during the bind call. Any other values may result in undefined behavior. + + Handle representing a multicast object. + Device that hosts the memory allocation. + Offset into the multicast object. + Desired size to unbind. + + + + + Calculates either the minimal or recommended granularity for a multicast object + Calculates either the minimal or recommended granularity for a given set of + multicast object properties and returns it in \p granularity. This granularity + can be used as a multiple for size, bind offsets and address mappings of the + multicast object. + + Returned granularity. + Properties of the multicast object. + Determines which granularity to return. + + + + + This section describes the coredump attribute control functions of the low-level CUDA driver application programming interface. + + + + + Allows caller to fetch a coredump attribute value for the current context + Returns in \p *value the requested value specified by \p attrib. It is up to the caller + to ensure that the data type and size of \p *value matches the request. + If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p attrib will be placed in \p size. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false unless set to ::true globally or locally, or the + CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true unless set to ::false globally + or locally. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is + ::false unless set to ::true globally or locally. + - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + value is ::false unless set to ::true globally or locally. + - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written. The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + that will be monitored if user-triggered coredumps are enabled.
The default value is + ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to fetch. + void* containing the requested data. + + + + + Allows caller to fetch a coredump attribute value for the current context + Returns in \p *value the requested value specified by \p attrib. It is up to the caller + to ensure that the data type and size of \p *value matches the request. + If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p attrib will be placed in \p size. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false unless set to ::true globally or locally, or the + CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true unless set to ::false globally + or locally. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is + ::false unless set to ::true globally or locally. + - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + value is ::false unless set to ::true globally or locally. + - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written. The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + that will be monitored if user-triggered coredumps are enabled. The default value is + ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to fetch. + void* containing the requested data. + + + + + Allows caller to fetch a coredump attribute value for the entire application + Returns in \p *value the requested value specified by \p attrib. It is up to the caller + to ensure that the data type and size of \p *value matches the request. + If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p attrib will be placed in \p size. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is ::false. + - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + value is ::false.
+ - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written. The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + that will be monitored if user-triggered coredumps are enabled. The default value is + ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to fetch. + void* containing the requested data. + + + + + Allows caller to fetch a coredump attribute value for the entire application + Returns in \p *value the requested value specified by \p attrib. It is up to the caller + to ensure that the data type and size of \p *value matches the request. + If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p attrib will be placed in \p size. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is ::false. + - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + value is ::false. + - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written. The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + that will be monitored if user-triggered coredumps are enabled. The default value is + ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to fetch. + void* containing the requested data. + + + + + Allows caller to set a coredump attribute value for the current context + This function should be considered an alternate interface to the CUDA-GDB environment + variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump + An important design decision to note is that any coredump environment variable values + set before CUDA initializes will take permanent precedence over any values set with + this function. This decision was made to ensure no change in behavior for any users that + may be currently using these variables to get coredumps. + \p *value shall contain the requested value specified by \p set. It is up to the caller + to ensure that the data type and size of \p *value matches the request.
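A short C sketch of the fetch pattern just described, reading one boolean and one string attribute for the current context; the bool-sized local follows the "Bool" wording above, and error checking is elided:

    #include <cuda.h>
    #include <stdbool.h>
    bool enabled = false;
    size_t size = sizeof(enabled);
    cuCoredumpGetAttribute(CU_COREDUMP_ENABLE_ON_EXCEPTION, &enabled, &size);
    char file[1024];                    /* CU_COREDUMP_FILE is at most 1023 chars */
    size = sizeof(file);
    cuCoredumpGetAttribute(CU_COREDUMP_FILE, file, &size);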
+ If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p set will be placed in \p size. + This function will return ::CUDA_ERROR_NOT_SUPPORTED if the caller attempts to set + ::CU_COREDUMP_ENABLE_ON_EXCEPTION on a GPU with Compute Capability < 6.0. ::cuCoredumpSetAttributeGlobal + works on those platforms as an alternative. + ::CU_COREDUMP_ENABLE_USER_TRIGGER and ::CU_COREDUMP_PIPE cannot be set on a per-context basis. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is + ::false. + - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written. The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to set. + void* containing the requested data. + + + + Allows caller to set a coredump attribute value for the current context + This function should be considered an alternate interface to the CUDA-GDB environment + variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump + An important design decision to note is that any coredump environment variable values + set before CUDA initializes will take permanent precedence over any values set with + this function. This decision was made to ensure no change in behavior for any users that + may be currently using these variables to get coredumps. + \p *value shall contain the requested value specified by \p set. It is up to the caller + to ensure that the data type and size of \p *value matches the request. + If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p set will be placed in \p size. + This function will return ::CUDA_ERROR_NOT_SUPPORTED if the caller attempts to set + ::CU_COREDUMP_ENABLE_ON_EXCEPTION on a GPU with Compute Capability < 6.0. ::cuCoredumpSetAttributeGlobal + works on those platforms as an alternative. + ::CU_COREDUMP_ENABLE_USER_TRIGGER and ::CU_COREDUMP_PIPE cannot be set on a per-context basis. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is + ::false. + - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written.
The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to set. + void* containing the requested data. + + + + Allows caller to set a coredump attribute value globally + This function should be considered an alternate interface to the CUDA-GDB environment + variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump + An important design decision to note is that any coredump environment variable values + set before CUDA initializes will take permanent precedence over any values set with + this function. This decision was made to ensure no change in behavior for any users that + may be currently using these variables to get coredumps. + \p *value shall contain the requested value specified by \p set. It is up to the caller + to ensure that the data type and size of \p *value matches the request. + If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p set will be placed in \p size. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is + ::false. + - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + value is ::false. + - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written. The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + that will be monitored if user-triggered coredumps are enabled. This value may not be + changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default + value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine + running the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to set. + void* containing the requested data. + + + + Allows caller to set a coredump attribute value globally + This function should be considered an alternate interface to the CUDA-GDB environment + variables defined in this document: https://docs.nvidia.com/cuda/cuda-gdb/index.html#gpu-coredump + An important design decision to note is that any coredump environment variable values + set before CUDA initializes will take permanent precedence over any values set with + this function. This decision was made to ensure no change in behavior for any users that + may be currently using these variables to get coredumps. + \p *value shall contain the requested value specified by \p set. It is up to the caller + to ensure that the data type and size of \p *value matches the request.
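The complementary global setter follows the same value/size convention. A C sketch enabling exception coredumps and pointing them at a hypothetical path, illustrating the API just described rather than a recommended configuration:

    #include <cuda.h>
    #include <stdbool.h>
    bool yes = true;
    size_t size = sizeof(yes);
    cuCoredumpSetAttributeGlobal(CU_COREDUMP_ENABLE_ON_EXCEPTION, &yes, &size);
    char path[] = "/tmp/app.cuda.core";   /* hypothetical CU_COREDUMP_FILE location */
    size = sizeof(path);
    cuCoredumpSetAttributeGlobal(CU_COREDUMP_FILE, path, &size);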
+ If the caller calls this function with \p *value equal to NULL, the size of the memory + region (in bytes) expected for \p set will be placed in \p size. + The supported attributes are: + - ::CU_COREDUMP_ENABLE_ON_EXCEPTION: Bool where ::true means that GPU exceptions from + this context will create a coredump at the location specified by ::CU_COREDUMP_FILE. + The default value is ::false. + - ::CU_COREDUMP_TRIGGER_HOST: Bool where ::true means that the host CPU will + also create a coredump. The default value is ::true. + - ::CU_COREDUMP_LIGHTWEIGHT: Bool where ::true means that any resulting coredumps + will not have a dump of GPU memory or non-reloc ELF images. The default value is + ::false. + - ::CU_COREDUMP_ENABLE_USER_TRIGGER: Bool where ::true means that a coredump can be + created by writing to the system pipe specified by ::CU_COREDUMP_PIPE. The default + value is ::false. + - ::CU_COREDUMP_FILE: String of up to 1023 characters that defines the location where + any coredumps generated by this context will be written. The default value is + ::core.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine running + the CUDA application and ::PID is the process ID of the CUDA application. + - ::CU_COREDUMP_PIPE: String of up to 1023 characters that defines the name of the pipe + that will be monitored if user-triggered coredumps are enabled. This value may not be + changed after ::CU_COREDUMP_ENABLE_USER_TRIGGER is set to ::true. The default + value is ::corepipe.cuda.HOSTNAME.PID where ::HOSTNAME is the host name of the machine + running the CUDA application and ::PID is the process ID of the CUDA application. + + The enum defining which value to set. + void* containing the requested data. + + + + Driver-level API for creation and manipulation of green contexts + + + + + Creates a green context with a specified set of resources. + This API creates a green context with the resources specified in the descriptor \p desc and + returns it in the handle represented by \p phCtx. This API will retain the primary context on device \p dev, + which is released when the green context is destroyed. It is advised to have the primary context active + before calling this API to avoid the heavy cost of triggering primary context initialization and + deinitialization multiple times. + The API does not make the green context current. To make it current, you must do so explicitly + by first converting the green context to a CUcontext using ::cuCtxFromGreenCtx and subsequently calling + ::cuCtxSetCurrent / ::cuCtxPushCurrent. It should be noted that a green context can be current to only one + thread at a time. There is no internal synchronization that makes API calls accessing the same green context + from multiple threads safe. + Note: The API is not supported on 32-bit platforms. + + Pointer for the output handle to the green context + Descriptor generated via ::cuDevResourceGenerateDesc which contains the set of resources to be used + Device on which to create the green context. + One of the supported green context creation flags. \p CU_GREEN_CTX_DEFAULT_STREAM is required. + + + + Destroys a green context + Destroys the green context, releasing the primary context of the device that this green context was created for. + Any resources provisioned for this green context (that were initially available via the resource descriptor) + are released as well.
+ + Green context to be destroyed + + + + Converts a green context into the primary context + The API converts a green context into the primary context returned in \p pContext. It is important + to note that the converted context \p pContext is a normal primary context but with + the resources of the specified green context \p hCtx. Once converted, it can then + be used to set the context current with ::cuCtxSetCurrent or with any of the CUDA APIs + that accept a CUcontext parameter. + Users are expected to call this API before calling any CUDA APIs that accept a + CUcontext. Failing to do so will result in the APIs returning ::CUDA_ERROR_INVALID_CONTEXT. + + Returned primary context with green context resources + Green context to convert + + + + Get device resources + Get the \p type resources available to the \p device. + This may often be the starting point for further partitioning or configuring of resources. + Note: The API is not supported on 32-bit platforms. + + Device to get resource for + Output pointer to a CUdevResource structure + Type of resource to retrieve + + + + Get context resources + Get the \p type resources available to the context represented by \p hCtx. + Note: The API is not supported on 32-bit platforms. + + Context to get resource for + Output pointer to a CUdevResource structure + Type of resource to retrieve + + + + + Get green context resources - Get the \p type resources available to the green context represented by \p hCtx. + + Green context to get resource for + Output pointer to a CUdevResource structure + Type of resource to retrieve + + + + Splits \p CU_DEV_RESOURCE_TYPE_SM resources. + Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount + and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the number of groups that + would be created in \p nbGroups. Otherwise, \p nbGroups must point to the number of elements in \p result and on return, + the API will overwrite \p nbGroups with the number actually created. The groups are written to the array in \p result. + \p nbGroups can be less than the total number if a smaller number of groups is needed. + This API is used to spatially partition the input resource. The input resource needs to come from one of + ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource. + A limitation of the API is that the output results cannot be split again without + first creating a descriptor and a green context with that descriptor. + + When creating the groups, the API will take into account the performance and functional characteristics of the + input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups being created + than purely dividing the total SM count by \p minCount due to cluster requirements or + alignment and granularity requirements for \p minCount. + + The \p remainder set might not have the same functional or performance guarantees as the groups in \p result. + Its use should be carefully planned and future partitions of the \p remainder set are discouraged. + + A successful API call must either have: + - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional.
+ - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. + This queries the number of groups that would be created by the API. + + Note: The API is not supported on 32-bit platforms. + + Output array of \p CUdevResource resources. Can be NULL to query the number of groups. + This is a pointer, specifying the number of groups that would be or should be created as described below. + Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource. + If the input resource cannot be cleanly split among \p nbGroups, the remainder is placed here. Can be omitted (NULL) if the user does not need the remaining set. + Flags specifying how these partitions are used or which constraints to abide by when splitting the input. + Minimum number of SMs required + + + + Splits \p CU_DEV_RESOURCE_TYPE_SM resources. + Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount + and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the number of groups that + would be created in \p nbGroups. Otherwise, \p nbGroups must point to the number of elements in \p result and on return, + the API will overwrite \p nbGroups with the number actually created. The groups are written to the array in \p result. + \p nbGroups can be less than the total number if a smaller number of groups is needed. + This API is used to spatially partition the input resource. The input resource needs to come from one of + ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource. + A limitation of the API is that the output results cannot be split again without + first creating a descriptor and a green context with that descriptor. + + When creating the groups, the API will take into account the performance and functional characteristics of the + input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups being created + than purely dividing the total SM count by \p minCount due to cluster requirements or + alignment and granularity requirements for \p minCount. + + The \p remainder set might not have the same functional or performance guarantees as the groups in \p result. + Its use should be carefully planned and future partitions of the \p remainder set are discouraged. + + A successful API call must either have: + - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional. + - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. + This queries the number of groups that would be created by the API. + + Note: The API is not supported on 32-bit platforms. + + Output array of \p CUdevResource resources. Can be NULL to query the number of groups. + This is a pointer, specifying the number of groups that would be or should be created as described below. + Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource. + If the input resource cannot be cleanly split among \p nbGroups, the remainder is placed here.
Can be omitted (NULL) if the user does not need the remaining set. + Flags specifying how these partitions are used or which constraints to abide by when splitting the input. + Minimum number of SMs required + + + + Splits \p CU_DEV_RESOURCE_TYPE_SM resources. + Splits \p CU_DEV_RESOURCE_TYPE_SM resources into \p nbGroups, adhering to the minimum SM count specified in \p minCount + and the usage flags in \p useFlags. If \p result is NULL, the API simulates a split and provides the number of groups that + would be created in \p nbGroups. Otherwise, \p nbGroups must point to the number of elements in \p result and on return, + the API will overwrite \p nbGroups with the number actually created. The groups are written to the array in \p result. + \p nbGroups can be less than the total number if a smaller number of groups is needed. + This API is used to spatially partition the input resource. The input resource needs to come from one of + ::cuDeviceGetDevResource, ::cuCtxGetDevResource, or ::cuGreenCtxGetDevResource. + A limitation of the API is that the output results cannot be split again without + first creating a descriptor and a green context with that descriptor. + + When creating the groups, the API will take into account the performance and functional characteristics of the + input resource, and guarantee a split that will create a disjoint set of symmetrical partitions. This may lead to fewer groups being created + than purely dividing the total SM count by \p minCount due to cluster requirements or + alignment and granularity requirements for \p minCount. + + The \p remainder set might not have the same functional or performance guarantees as the groups in \p result. + Its use should be carefully planned and future partitions of the \p remainder set are discouraged. + + A successful API call must either have: + - A valid array of \p result pointers of size passed in \p nbGroups, with \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. \p remaining and \p useFlags are optional. + - NULL passed in for \p result, with a valid integer pointer in \p nbGroups and \p Input of type \p CU_DEV_RESOURCE_TYPE_SM. + Value of \p minCount must be between 0 and the SM count specified in \p input. + This queries the number of groups that would be created by the API. + + Note: The API is not supported on 32-bit platforms. + + Output array of \p CUdevResource resources. Can be NULL to query the number of groups. + This is a pointer, specifying the number of groups that would be or should be created as described below. + Input SM resource to be split. Must be a valid \p CU_DEV_RESOURCE_TYPE_SM resource. + If the input resource cannot be cleanly split among \p nbGroups, the remainder is placed here. Can be omitted (NULL) if the user does not need the remaining set. + Flags specifying how these partitions are used or which constraints to abide by when splitting the input. + Minimum number of SMs required + + + + Generate a resource descriptor + + Generates a resource descriptor with the set of resources specified in \p resources. + The generated resource descriptor is necessary for the creation of green contexts via the ::cuGreenCtxCreate API. + The API expects \p nbResources == 1, as there is only one type of resource and merging the same + types of resource is currently not supported. + + Note: The API is not supported on 32-bit platforms.
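Putting the split and descriptor APIs above together, a minimal green-context workflow might look like the following C sketch; dev is a hypothetical CUdevice from cuDeviceGet, the SM counts are illustrative, and error checking is elided:

    #include <cuda.h>
    /* Assumes: CUdevice dev obtained earlier via cuDeviceGet. */
    CUdevResource smResource, groups[2], remainder;
    cuDeviceGetDevResource(dev, &smResource, CU_DEV_RESOURCE_TYPE_SM);
    unsigned int nbGroups = 2;
    cuDevSmResourceSplitByCount(groups, &nbGroups, &smResource, &remainder,
                                0 /* useFlags */, 16 /* minCount */);
    CUdevResourceDesc desc;
    cuDevResourceGenerateDesc(&desc, &groups[0], 1 /* nbResources; the API expects 1 */);
    CUgreenCtx greenCtx;
    cuGreenCtxCreate(&greenCtx, desc, dev, CU_GREEN_CTX_DEFAULT_STREAM);
    CUstream stream;
    cuGreenCtxStreamCreate(&stream, greenCtx, CU_STREAM_NON_BLOCKING, 0 /* priority */);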
+ + Output descriptor + Array of resources to be included in the descriptor + Number of resources passed in \p resources + + + + + Records an event. + + Captures in \p hEvent all the activities of the green context of \p hCtx + at the time of this call. \p hEvent and \p hCtx must be from the same + CUDA context. Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will + then examine or wait for completion of the work that was captured. Uses of + \p hCtx after this call do not modify \p hEvent. + + \note The API will return an error if the specified green context \p hCtx + has a stream in capture mode. In such a case, the call will invalidate + all the conflicting captures. + + Green context to record event for + Event to record + + + + Make a green context wait on an event + + Makes all future work submitted to green context \p hCtx wait for all work + captured in \p hEvent. The synchronization will be performed on the device + and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent() + for details on what is captured by an event. + + \note The API will return an error and invalidate the capture if the specified + event \p hEvent is part of an ongoing capture sequence. + + Green context that will wait + Event to wait on (may not be NULL) + + + + Query the green context associated with a stream + + Returns the CUDA green context that the stream is associated with, or NULL if the stream + is not associated with any green context. + + The stream handle \p hStream can refer to any of the following: + + - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate. + If during stream creation the context that was active in the calling thread was obtained + with cuCtxFromGreenCtx, that green context is returned in \p phCtx. + Otherwise, \p *phCtx is set to NULL instead. + + - a special stream such as the NULL stream or ::CU_STREAM_LEGACY. + In that case, if the context that is active in the calling thread was obtained + with cuCtxFromGreenCtx, that green context is returned. + Otherwise, \p *phCtx is set to NULL instead. + + Passing an invalid handle will result in undefined behavior. + + Handle to the stream to be queried + Returned green context associated with the stream + + + + Create a stream for use in the green context + + Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream. + The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that + is current to the calling thread and creates a stream in the specified green context \p greenCtx. + + The supported values for \p flags are: + - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created + stream may run concurrently with work in the default stream, and that + the created stream should perform no implicit synchronization with the default stream. + + Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a + hint to preferentially run work with higher priority when possible, but do not preempt + already-running work or provide any other functional guarantee on execution order. + \p priority follows a convention where lower numbers represent higher priorities. + '0' represents default priority. The range of meaningful numerical priorities can + be queried using ::cuCtxGetStreamPriorityRange.
If the specified priority is + outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + it will automatically be clamped to the lowest or the highest number in the range. + + Returned newly created stream + Green context for which to create the stream + Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified. + Stream priority. Lower numbers represent higher priorities. See ::cuCtxGetStreamPriorityRange for more information about meaningful stream priorities that can be passed. + + + + + CUDA checkpoint and restore functionality of the low-level + CUDA driver API + This section describes the checkpoint and restore functions of the low-level + CUDA driver application programming interface. + The CUDA checkpoint and restore APIs provide a way to save and restore GPU + state for full process checkpoints when used with CPU-side process + checkpointing solutions. They can also be used to pause GPU work and suspend + a CUDA process to allow other applications to make use of GPU resources. + Checkpoint and restore capabilities are currently restricted to Linux. + + + + + Returns the restore thread ID for a CUDA process + Returns in \p *tid the thread ID of the CUDA restore thread for the process + specified by \p pid. + + The process ID of the CUDA process + Returned restore thread ID + + + + + Returns the process state of a CUDA process + Returns in \p *state the current state of the CUDA process specified by \p pid. + + The process ID of the CUDA process + Returned CUDA process state + + + + + Lock a running CUDA process + Locks the CUDA process specified by \p pid, which will block further CUDA API + calls. The process must be in the RUNNING state in order to lock. + Upon successful return, the process will be in the LOCKED state. + If \p timeoutMs is specified and the timeout is reached, the process will be left + in the RUNNING state upon return. + + The process ID of the CUDA process + Optional lock operation arguments + + + + + Checkpoint a CUDA process's GPU memory contents + Checkpoints a CUDA process specified by \p pid that is in the LOCKED + state. The GPU memory contents will be brought into host memory and all + underlying references will be released. The process must be in the LOCKED state + to checkpoint. + Upon successful return, the process will be in the CHECKPOINTED state. + + The process ID of the CUDA process + Optional checkpoint operation arguments + + + + + Restore a CUDA process's GPU memory contents from its last checkpoint + Restores a CUDA process specified by \p pid from its last checkpoint. The process + must be in the CHECKPOINTED state to restore. + Upon successful return, the process will be in the LOCKED state. + CUDA process restore requires persistence mode to be enabled or ::cuInit to + have been called before execution. + + The process ID of the CUDA process + Optional restore operation arguments + + + + + Unlock a CUDA process to allow CUDA API calls + Unlocks a process specified by \p pid, allowing it to resume making CUDA API + calls. The process must be in the LOCKED state. + Upon successful return, the process will be in the RUNNING state. + + The process ID of the CUDA process + Optional unlock operation arguments + + + + + A CUDA function or CUDA kernel @@ -98026,6 +121839,18 @@ + + + + + + + + + + + + Loads the given CUDA kernel from the CUmodule.
Block and Grid dimensions must be set @@ -98035,6 +121860,14 @@ The CUmodule which contains the kernel CUDA abstraction layer object (= CUDA context) for this Kernel + + + Loads the given CUDA kernel from the CUmodule. Block and Grid dimensions must be set + before running the kernel. Shared memory size is set to 0. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + + Loads the given CUDA kernel from the CUmodule. Block and Grid dimensions are set directly. @@ -98145,6 +121978,107 @@ Dimension of grid of block of threads Z Dynamic shared memory size in Bytes + + + Loads the given CUDA kernel from the CUmodule. Block and Grid dimensions are set directly. + Shared memory size is set to 0. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads + Dimension of grid of block of threads + + + + Loads the given CUDA kernel from the CUmodule. Block dimensions are set directly, + grid dimensions must be set before running the kernel. Shared memory size is set to 0. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads + + + + Loads the given CUDA kernel from the CUmodule. Block and Grid dimensions are set directly. + Shared memory size is set to 0. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads X + Dimension of block of threads Y + Dimension of block of threads Z + Dimension of grid of block of threads X + Dimension of grid of block of threads Y + + + + Loads the given CUDA kernel from the CUmodule. Block and Grid dimensions are set directly. + Shared memory size is set to 0. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads X + Dimension of block of threads Y + Dimension of block of threads Z + Dimension of grid of block of threads X + Dimension of grid of block of threads Y + Dimension of grid of block of threads Z + + + + Loads the given CUDA kernel from the CUmodule. Block dimensions are set directly, + grid dimensions must be set before running the kernel. Shared memory size is set to 0. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads X + Dimension of block of threads Y + Dimension of block of threads Z + + + + Loads the given CUDA kernel from the CUmodule. Block and Grid dimensions must be set + before running the kernel. Shared memory size is set directly. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dynamic shared memory size in Bytes + + + + Loads the given CUDA kernel from the CUmodule. Block and Grid dimensions and shared memory size are set directly. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads (2D - z-component is discarded) + Dimension of grid of block of threads (3D) + Dynamic shared memory size in Bytes + + + + Loads the given CUDA kernel from the CUmodule. Block dimensions and shared memory size are set directly, + grid dimensions must be set before running the kernel. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads + Dynamic shared memory size in Bytes + + + + Loads the given CUDA kernel from the CUmodule.
Block dimensions and shared memory size are set directly, + grid dimensions must be set before running the kernel. + + The kernel name as defined in the *.cu file + The CUmodule which contains the kernel + Dimension of block of threads X + Dimension of block of threads Y + Dimension of block of threads Z + Dimension of grid of block of threads X + Dimension of grid of block of threads Y + Dimension of grid of block of threads Z + Dynamic shared memory size in Bytes + Set the constant variable name to the given value. The constant variable must be defined in the CUDA module. @@ -98908,6 +122842,13 @@ Parameters as given by the kernel Time of execution in milliseconds (using GPU counter) + + + Executes the kernel on the device + + Config to launch + Parameters as given by the kernel + + Executes the kernel on the device asynchronously @@ -99127,6 +123068,62 @@ driver can choose a different ratio if required to execute the function. + + + If this attribute is set, the kernel must launch with a valid cluster size specified. + See ::cuFuncSetAttribute, ::cuKernelSetAttribute + + + + + The required cluster width in blocks. The values must either all be 0 or all be positive. + The validity of the cluster dimensions is otherwise checked at launch time. + If the value is set at compile time, it cannot be set at runtime. + Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute + + + + + The required cluster height in blocks. The values must either all be 0 or + all be positive. The validity of the cluster dimensions is otherwise + checked at launch time. + If the value is set at compile time, it cannot be set at runtime. + Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute + + + + + The required cluster depth in blocks. The values must either all be 0 or + all be positive. The validity of the cluster dimensions is otherwise + checked at launch time. + If the value is set at compile time, it cannot be set at runtime. + Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See ::cuFuncSetAttribute, ::cuKernelSetAttribute + + + + + Whether the function can be launched with non-portable cluster size. 1 is + allowed, 0 is disallowed. A non-portable cluster size may only function + on the specific SKUs the program is tested on. The launch might fail if + the program is run on a different hardware platform. + CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking + whether the desired size can be launched on the current device. + Portable Cluster Size + A portable cluster size is guaranteed to be functional on all compute + capabilities higher than the target compute capability. The portable + cluster size for sm_90 is 8 blocks per cluster. This value may increase + for future compute capabilities. + The specific hardware unit may support higher cluster sizes that are not + guaranteed to be portable. + See ::cuFuncSetAttribute, ::cuKernelSetAttribute + + + + + The block scheduling policy of a function. The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. + See ::cuFuncSetAttribute, ::cuKernelSetAttribute + + Sets the shared memory configuration for a device function. @@ -99374,6 +123371,44 @@ block size permitted by the device / function instead.
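The cluster attributes just listed map to plain cuFuncSetAttribute calls. A short C sketch, valid only if the kernel was not compiled with a fixed cluster size; func is a CUfunction loaded elsewhere:

    #include <cuda.h>
    /* Assumes: CUfunction func from cuModuleGetFunction. */
    cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH,  2);
    cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT, 1);
    cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH,  1);
    /* Opt in to non-portable sizes only after checking occupancy (see the next sketch). */
    cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, 1);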
Flags + + + Returns dynamic shared memory available per block when launching \p numBlocks blocks on an SM + Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. + + Number of blocks to fit on an SM + Size of the blocks + + + + Given the kernel function (\p func) and launch configuration + (\p config), return the maximum cluster size in \p *clusterSize. + The cluster dimensions in \p config are ignored. If \p func has a required + cluster size set (see ::cudaFuncGetAttributes / ::cuFuncGetAttribute), \p clusterSize will reflect the required cluster size. + By default this function will always return a value that is portable on + future hardware. A higher value may be returned if the kernel function + allows non-portable cluster sizes. + This function will respect the compile-time launch bounds. + + Launch configuration for the given kernel function + + + + Given the kernel function (\p func) and launch configuration + (\p config), return the maximum number of clusters that could co-exist + on the target device in \p *numClusters. + If the function has a required cluster size already set (see + ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size + from \p config must either be unspecified or match the required size. + Without required sizes, the cluster size must be specified in \p config, + else the function will return an error. + Note that various attributes of the kernel function may affect occupancy + calculation. The runtime environment may affect how the hardware schedules + the clusters, so the calculated occupancy is not guaranteed to be achievable. + + Launch configuration for the given kernel function + + Sets the grid dimensions according to block dimensions, so that each dimension has at least computeSize threads @@ -100563,6 +124598,149 @@ The base address must be the same one specified to . + + + A green context handle. This handle can be used safely from only one CPU thread at a time. + + + + + + + + + Indicates whether this GreenContext instance created the wrapped green context and should destroy it when disposing. + + + + + + + + + + + + + Create a new instance of a green context. The instance is not the owner of the provided ctx and it won't be destroyed when disposing. + + + + + Creates a green context with a specified set of resources. + This API creates a green context with the resources specified in the descriptor \p desc and + returns it in the handle represented by \p phCtx. This API will retain the primary context on device \p dev, + which is released when the green context is destroyed. It is advised to have the primary context active + before calling this API to avoid the heavy cost of triggering primary context initialization and + deinitialization multiple times. + The API does not make the green context current. To make it current, you must do so explicitly + by first converting the green context to a CUcontext using ::cuCtxFromGreenCtx and subsequently calling + ::cuCtxSetCurrent / ::cuCtxPushCurrent. It should be noted that a green context can be current to only one + thread at a time. There is no internal synchronization that makes API calls accessing the same green context + from multiple threads safe. + Note: The API is not supported on 32-bit platforms. + + Descriptor generated via ::cuDevResourceGenerateDesc which contains the set of resources to be used + Device on which to create the green context. + One of the supported green context creation flags. \p CU_GREEN_CTX_DEFAULT_STREAM is required.
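Referring back to the cluster-occupancy queries described at the top of this hunk, both take a launch configuration rather than a bare block size. A minimal C sketch, with func as in the previous sketch and all dimensions illustrative:

    #include <cuda.h>
    CUlaunchConfig config = { 0 };
    config.gridDimX = 128; config.gridDimY = 1; config.gridDimZ = 1;
    config.blockDimX = 256; config.blockDimY = 1; config.blockDimZ = 1;
    int clusterSize = 0, numClusters = 0;
    cuOccupancyMaxPotentialClusterSize(&clusterSize, func, &config); /* largest launchable cluster */
    cuOccupancyMaxActiveClusters(&numClusters, func, &config);       /* co-resident cluster count  */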
+ + + + Create a new instance of a CUDA green context from the given CudaStream + Note: doesn't throw an exception if the returned green context is NULL! + + The stream to query + + + + For dispose + + + + + Dispose + + + + + For IDisposable. + Note: If this instance created the wrapped CUcontext, it will be destroyed and can't be accessed by other threads anymore. + If this instance was only bound to an existing CUcontext, the wrapped CUcontext won't be destroyed. + + + + + + Converts a green context into the primary context + The API converts a green context into the primary context returned in \p pContext. It is important + to note that the converted context \p pContext is a normal primary context but with + the resources of the specified green context \p hCtx. Once converted, it can then + be used to set the context current with ::cuCtxSetCurrent or with any of the CUDA APIs + that accept a CUcontext parameter. + Users are expected to call this API before calling any CUDA APIs that accept a + CUcontext. Failing to do so will result in the APIs returning ::CUDA_ERROR_INVALID_CONTEXT. + + + + + Get green context resources - Get the \p type resources available to the green context represented by \p hCtx. + + Type of resource to retrieve + + + + Records an event. + + Captures in \p hEvent all the activities of the green context of \p hCtx + at the time of this call. \p hEvent and \p hCtx must be from the same + CUDA context. Calls such as ::cuEventQuery() or ::cuGreenCtxWaitEvent() will + then examine or wait for completion of the work that was captured. Uses of + \p hCtx after this call do not modify \p hEvent. + + \note The API will return an error if the specified green context \p hCtx + has a stream in capture mode. In such a case, the call will invalidate + all the conflicting captures. + + + + + Make a green context wait on an event + + Makes all future work submitted to green context \p hCtx wait for all work + captured in \p hEvent. The synchronization will be performed on the device + and will not block the calling CPU thread. See ::cuGreenCtxRecordEvent() + for details on what is captured by an event. + + \note The API will return an error and invalidate the capture if the specified + event \p hEvent is part of an ongoing capture sequence. + + + + + Create a stream for use in the green context + + Creates a stream for use in the specified green context \p greenCtx and returns a handle in \p phStream. + The stream can be destroyed by calling ::cuStreamDestroy(). Note that the API ignores the context that + is current to the calling thread and creates a stream in the specified green context \p greenCtx. + + The supported values for \p flags are: + - ::CU_STREAM_NON_BLOCKING: This must be specified. It indicates that work running in the created + stream may run concurrently with work in the default stream, and that + the created stream should perform no implicit synchronization with the default stream. + + Specifying \p priority affects the scheduling priority of work in the stream. Priorities provide a + hint to preferentially run work with higher priority when possible, but do not preempt + already-running work or provide any other functional guarantee on execution order. + \p priority follows a convention where lower numbers represent higher priorities. + '0' represents default priority. The range of meaningful numerical priorities can + be queried using ::cuCtxGetStreamPriorityRange.
If the specified priority is + outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + it will automatically be clamped to the lowest or the highest number in the range. + + Flags for stream creation. \p CU_STREAM_NON_BLOCKING must be specified. + Stream priority. Lower numbers represent higher priorities. See ::cuCtxGetStreamPriorityRange for more information about meaningful stream priorities that can be passed. + + Common interface for OpenGL and DirectX graphics interop resources @@ -101802,6 +125980,19 @@ DeviceID + + + Create a new instance of managed Cuda. Retains the primary CUDA context of the given device. The device object can be obtained for example from the OpenGL or DirectX API. + Using + + Device to use + + + + Create a new instance of managed Cuda. + Using + + For dispose diff --git a/src/external/ManagedCuda/NVRTC.XML b/src/external/ManagedCuda/NVRTC.XML index d3b2df27..2c45d1f7 100644 --- a/src/external/ManagedCuda/NVRTC.XML +++ b/src/external/ManagedCuda/NVRTC.XML @@ -46,12 +46,34 @@ + + + Retrieve the current size of the PCH Heap. + + + + + Set the size of the PCH Heap. + + + + + + + + + + + + + + @@ -67,6 +89,15 @@ + + + + + + + + + An NVRTCException is thrown, if any wrapped call to the NVRTC-library does not return . @@ -200,6 +231,90 @@ Compiled result. + + + nvrtcGetCUBINSize sets \p cubinSizeRet with the size of the cubin + generated by the previous compilation of \p prog. The value of + \p cubinSizeRet is set to 0 if the value specified to \c -arch is a + virtual architecture instead of an actual architecture. + + CUDA Runtime Compilation program. + Size of the generated cubin. + + + + nvrtcGetCUBIN stores the cubin generated by the previous compilation + of \p prog in the memory pointed to by \p cubin. No cubin is available + if the value specified to \c -arch is a virtual architecture instead + of an actual architecture. + + prog CUDA Runtime Compilation program. + cubin Compiled and assembled result. + + + + nvrtcGetNVVMSize sets \p nvvmSizeRet with the size of the NVVM + generated by the previous compilation of \p prog. The value of + \p nvvmSizeRet is set to 0 if the program was not compiled with + -dlto. + + CUDA Runtime Compilation program. + Size of the generated NVVM. + + + + + nvrtcGetNVVM stores the NVVM generated by the previous compilation + of \p prog in the memory pointed to by \p nvvm. + The program must have been compiled with -dlto, + otherwise the call will return an error. + + prog CUDA Runtime Compilation program. + nvvm Compiled result. + + + + + nvrtcGetLTOIRSize sets \p LTOIRSizeRet with the size of the LTO IR + generated by the previous compilation of \p prog. The value of + \p LTOIRSizeRet is set to 0 if the program was not compiled with + -dlto. + + CUDA Runtime Compilation program. + Size of the generated LTO IR. + + + + + nvrtcGetLTOIR stores the LTO IR generated by the previous compilation + of \p prog in the memory pointed to by \p LTOIR. No LTO IR is available + if the program was compiled without \c -dlto. + + prog CUDA Runtime Compilation program. + LTOIR Compiled result. + + + + + nvrtcGetOptiXIRSize sets the value of \p optixirSizeRet with the size of the OptiX IR + generated by the previous compilation of \p prog. The value of + \p optixirSizeRet is set to 0 if the program was compiled with + options incompatible with OptiX IR generation. + + prog CUDA Runtime Compilation program. + Size of the generated OptiX IR.

 sets logSizeRet with the size of the log generated by the previous compilation of prog (including the trailing NULL).
@@ -243,6 +358,76 @@
 initialized by the function to point to a C string containing the lowered (mangled) name corresponding to the provided name expression.
+
+ Retrieve the current size of the PCH Heap.
+
+ pointer to location where the size of the PCH Heap will be stored
+
+
+ Set the size of the PCH Heap. The requested size may be rounded up to a platform dependent
+ alignment (e.g. page size). If the PCH Heap has already been allocated, the heap memory will
+ be freed and a new PCH Heap will be allocated.
+
+ requested size of the PCH Heap, in bytes
+
+
+ Returns the PCH creation status.
+ NVRTC_SUCCESS indicates that the PCH was successfully created.
+ NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED indicates that no PCH creation
+ was attempted, either because PCH functionality was not requested during
+ the preceding nvrtcCompileProgram call, or automatic PCH processing was
+ requested and the compiler chose not to create a PCH file.
+ NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED indicates that a PCH file could
+ potentially have been created, but the compiler ran out of space in the PCH
+ heap. In this scenario, nvrtcGetPCHHeapSizeRequired() can be used to
+ query the required heap size, the heap can be reallocated for this size with
+ nvrtcSetPCHHeapSize(), and PCH creation may be reattempted by invoking
+ nvrtcCompileProgram() with a new NVRTC program instance.
+ NVRTC_ERROR_PCH_CREATE indicates that an error condition prevented the
+ PCH file from being created.
+
+ CUDA Runtime Compilation program.
+
+
+ Retrieve the required size of the PCH heap needed to compile the given program. The size
+ retrieved using this function is only valid if nvrtcGetPCHCreateStatus() returned
+ NVRTC_SUCCESS or NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED.
+
+ CUDA Runtime Compilation program.
+ pointer to location where the required size of the PCH Heap will be stored
+
+
+ nvrtcSetFlowCallback registers a callback function that the compiler
+ will invoke at different points during a call to nvrtcCompileProgram,
+ and the callback function can decide whether to cancel compilation by
+ returning specific values.
+
+ The callback function must satisfy the following constraints:
+ (1) Its signature should be:
+     int callback(void* param1, void* param2);
+     When invoking the callback, the compiler will always pass \p payload to
+     param1 so that the callback may make decisions based on \p payload. It'll
+     always pass NULL to param2 for now, which is reserved for future extensions.
+ (2) It must return 1 to cancel compilation or 0 to continue.
+     Other return values are reserved for future use.
+ (3) It must return consistent values. Once it returns 1 at one point, it must
+     return 1 in all following invocations during the current nvrtcCompileProgram
+     call in progress.
+ (4) It must be thread-safe.
+ (5) It must not invoke any nvrtc/libnvvm/ptx APIs.
+
 CUDA Online Compiler API call result code.
@@ -284,6 +469,21 @@
+
 the unit of compilation, and an opaque handle for a program.
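The flow-callback contract above maps naturally onto a managed delegate. Below is a minimal sketch of the callback side only; how the delegate is registered with the compiler is left out, since the managed wrapper for nvrtcSetFlowCallback is assumed rather than confirmed here. Only the documented protocol is encoded: return 1 to cancel, 0 to continue, and stay consistent once cancellation has been signalled.

    using System;
    using System.Runtime.InteropServices;

    static class FlowCallbackSketch
    {
        // Mirrors the documented native signature: int callback(void*, void*).
        [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
        delegate int NvrtcFlowCallback(IntPtr param1, IntPtr param2);

        // Flipped from another thread (e.g. a user pressing "cancel").
        static volatile bool cancelRequested;

        static int OnCompileStep(IntPtr payload, IntPtr reserved)
        {
            // param2 is always NULL for now (reserved for future extensions).
            // Must be consistent: once 1 is returned, keep returning 1 for the
            // remainder of the current nvrtcCompileProgram call.
            return cancelRequested ? 1 : 0;
        }
    }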
@@ -294,5 +494,10 @@
+
+ Prototype for device memory release
+
diff --git a/src/external/ManagedCuda/NVRTC.dll b/src/external/ManagedCuda/NVRTC.dll
index 3961efb8..87a424ac 100644
Binary files a/src/external/ManagedCuda/NVRTC.dll and b/src/external/ManagedCuda/NVRTC.dll differ