diff --git a/CMakeLists.txt b/CMakeLists.txt index 545ea697e..5319830fd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,13 @@ else() project( MAGMA LANGUAGES C CXX ) endif() +FIND_PROGRAM(PROGRAM_CCACHE ccache) +IF (PROGRAM_CCACHE) + SET(CMAKE_CXX_COMPILER_LAUNCHER ${PROGRAM_CCACHE}) + SET(CMAKE_C_COMPILER_LAUNCHER ${PROGRAM_CCACHE}) + SET(CMAKE_CUDA_COMPILER_LAUNCHER ${PROGRAM_CCACHE}) + SET(CMAKE_OPENCL_COMPILER_LAUNCHER ${PROGRAM_CCACHE}) +ENDIF () # ---------------------------------------- # to show compile commands, set this here or use 'make VERBOSE=1' diff --git a/Makefile b/Makefile index 6d5329926..7ca76c86d 100644 --- a/Makefile +++ b/Makefile @@ -261,7 +261,7 @@ else ifeq ($(BACKEND),hip) ## Suggestion by Mark (from SLATE) # Valid architecture numbers - # TODO: remove very old ones? + # TODO: remove very old ones? VALID_GFXS = 600 601 602 700 701 702 703 704 705 801 802 803 805 810 900 902 904 906 908 909 90a 940 941 942 90c 1010 1011 1012 1030 1031 1032 1033 diff --git a/include/magma_auxiliary.h b/include/magma_auxiliary.h index c4f06d55d..32434b575 100644 --- a/include/magma_auxiliary.h +++ b/include/magma_auxiliary.h @@ -81,6 +81,9 @@ magma_int_t magma_get_smlsize_divideconquer(); magma_int_t magma_malloc( magma_ptr *ptr_ptr, size_t bytes ); +magma_int_t +magma_malloc_async( magma_ptr* ptrPtr, size_t size, magma_queue_t queue); + magma_int_t magma_malloc_cpu( void **ptr_ptr, size_t bytes ); @@ -93,6 +96,9 @@ magma_free_cpu( void *ptr ); #define magma_free( ptr ) \ magma_free_internal( ptr, __func__, __FILE__, __LINE__ ) +#define magma_free_async( ptr, queue ) \ + magma_free_internal_async( ptr, __func__, __FILE__, __LINE__, queue ) + #define magma_free_pinned( ptr ) \ magma_free_pinned_internal( ptr, __func__, __FILE__, __LINE__ ) @@ -101,6 +107,11 @@ magma_free_internal( magma_ptr ptr, const char* func, const char* file, int line ); +magma_int_t +magma_free_internal_async( + magma_ptr ptr, + const char* func, const char* file, int line, 
magma_queue_t queue ); + magma_int_t magma_free_pinned_internal( void *ptr, @@ -128,24 +139,45 @@ magma_memset_async(void * ptr, int value, size_t count, magma_queue_t queue); /// Type-safe version of magma_malloc(), for magma_int_t arrays. Allocates n*sizeof(magma_int_t) bytes. static inline magma_int_t magma_imalloc( magmaInt_ptr *ptr_ptr, size_t n ) { return magma_malloc( (magma_ptr*) ptr_ptr, n*sizeof(magma_int_t) ); } +/// Type-safe asynchronous version of magma_malloc(), for magma_int_t arrays. Allocates n*sizeof(magma_int_t) bytes using CUDA stream specified in queue. +static inline magma_int_t magma_imalloc_async( magmaInt_ptr *ptr_ptr, size_t n, magma_queue_t queue ) { return magma_malloc_async( (magma_ptr*) ptr_ptr, n*sizeof(magma_int_t), queue ); } + /// Type-safe version of magma_malloc(), for magma_index_t arrays. Allocates n*sizeof(magma_index_t) bytes. static inline magma_int_t magma_index_malloc( magmaIndex_ptr *ptr_ptr, size_t n ) { return magma_malloc( (magma_ptr*) ptr_ptr, n*sizeof(magma_index_t) ); } +/// Type-safe asynchronous version of magma_malloc(), for magma_index_t arrays. Allocates n*sizeof(magma_index_t) bytes using CUDA stream specified in queue. +static inline magma_int_t magma_index_malloc_async( magmaIndex_ptr *ptr_ptr, size_t n, magma_queue_t queue ) { return magma_malloc_async( (magma_ptr*) ptr_ptr, n*sizeof(magma_index_t), queue ); } + /// Type-safe version of magma_malloc(), for magma_uindex_t arrays. Allocates n*sizeof(magma_uindex_t) bytes. static inline magma_int_t magma_uindex_malloc( magmaUIndex_ptr *ptr_ptr, size_t n ) { return magma_malloc( (magma_ptr*) ptr_ptr, n*sizeof(magma_uindex_t) ); } +/// Type-safe asynchronous version of magma_malloc(), for magma_uindex_t arrays. Allocates n*sizeof(magma_uindex_t) bytes using CUDA stream specified in queue. 
+static inline magma_int_t magma_uindex_malloc_async( magmaUIndex_ptr *ptr_ptr, size_t n, magma_queue_t queue ) { return magma_malloc_async( (magma_ptr*) ptr_ptr, n*sizeof(magma_uindex_t), queue); } + /// Type-safe version of magma_malloc(), for float arrays. Allocates n*sizeof(float) bytes. static inline magma_int_t magma_smalloc( magmaFloat_ptr *ptr_ptr, size_t n ) { return magma_malloc( (magma_ptr*) ptr_ptr, n*sizeof(float) ); } +/// Type-safe asynchronous version of magma_malloc(), for float arrays. Allocates n*sizeof(float) bytes using CUDA stream specified in queue. +static inline magma_int_t magma_smalloc_async( magmaFloat_ptr *ptr_ptr, size_t n, magma_queue_t queue ) { return magma_malloc_async( (magma_ptr*) ptr_ptr, n*sizeof(float), queue); } + /// Type-safe version of magma_malloc(), for double arrays. Allocates n*sizeof(double) bytes. static inline magma_int_t magma_dmalloc( magmaDouble_ptr *ptr_ptr, size_t n ) { return magma_malloc( (magma_ptr*) ptr_ptr, n*sizeof(double) ); } +/// Type-safe asynchronous version of magma_malloc(), for double arrays. Allocates n*sizeof(double) bytes using CUDA stream specified in queue. +static inline magma_int_t magma_dmalloc_async( magmaDouble_ptr *ptr_ptr, size_t n, magma_queue_t queue ) { return magma_malloc_async( (magma_ptr*) ptr_ptr, n*sizeof(double), queue); } + /// Type-safe version of magma_malloc(), for magmaFloatComplex arrays. Allocates n*sizeof(magmaFloatComplex) bytes. static inline magma_int_t magma_cmalloc( magmaFloatComplex_ptr *ptr_ptr, size_t n ) { return magma_malloc( (magma_ptr*) ptr_ptr, n*sizeof(magmaFloatComplex) ); } +/// Type-safe asynchronous version of magma_malloc(), for magmaFloatComplex arrays. Allocates n*sizeof(magmaFloatComplex) bytes using CUDA stream specified in queue. 
+static inline magma_int_t magma_cmalloc_async( magmaFloatComplex_ptr *ptr_ptr, size_t n, magma_queue_t queue ) { return magma_malloc_async( (magma_ptr*) ptr_ptr, n*sizeof(magmaFloatComplex), queue ); } + /// Type-safe version of magma_malloc(), for magmaDoubleComplex arrays. Allocates n*sizeof(magmaDoubleComplex) bytes. static inline magma_int_t magma_zmalloc( magmaDoubleComplex_ptr *ptr_ptr, size_t n ) { return magma_malloc( (magma_ptr*) ptr_ptr, n*sizeof(magmaDoubleComplex) ); } +/// Type-safe asynchronous version of magma_malloc_async(), for magmaDoubleComplex arrays. Allocates n*sizeof(magmaDoubleComplex) bytes using CUDA stream specified in queue. +static inline magma_int_t magma_zmalloc_async( magmaDoubleComplex_ptr *ptr_ptr, size_t n, magma_queue_t queue ) { return magma_malloc_async( (magma_ptr*) ptr_ptr, n*sizeof(magmaDoubleComplex), queue ); } + /// @} diff --git a/include/magma_z.h b/include/magma_z.h index d6b86e4d6..d9923b11c 100644 --- a/include/magma_z.h +++ b/include/magma_z.h @@ -477,6 +477,16 @@ magma_zgerbt_gpu( magmaDoubleComplex *U, magmaDoubleComplex *V, magma_int_t *info); +// CUDA MAGMA only +magma_int_t +magma_zgerbt_gpu_async( + const magma_bool_t gen, const magma_int_t n, const magma_int_t nrhs, + magmaDoubleComplex_ptr const dA, magma_int_t const ldda, + magmaDoubleComplex_ptr const dB, magma_int_t const lddb, + magmaDoubleComplex_ptr const dU, magmaDoubleComplex_ptr const dV, + magma_int_t *info, + magma_queue_t queue); + // CUDA MAGMA only magma_int_t magma_zgerfs_nopiv_gpu( @@ -488,6 +498,20 @@ magma_zgerfs_nopiv_gpu( magma_int_t *iter, magma_int_t *info); +// CUDA MAGMA only +magma_int_t +magma_zgerfs_nopiv_gpu_async( + magma_trans_t trans, magma_int_t n, magma_int_t nrhs, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magmaDoubleComplex_ptr dX, magma_int_t lddx, + magmaDoubleComplex_ptr dworkd, magmaDoubleComplex_ptr dAF, + magma_int_t *iter, + magma_int_t *info, + magma_int_t 
iter_max, + double bwdmax, + magma_queue_t queue); + magma_int_t magma_zgesdd( magma_vec_t jobz, magma_int_t m, magma_int_t n, @@ -525,6 +549,13 @@ magma_zgesv_nopiv_gpu( magmaDoubleComplex_ptr dB, magma_int_t lddb, magma_int_t *info); +magma_int_t +magma_zgesv_nopiv_gpu_async( + magma_int_t n, magma_int_t nrhs, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magma_int_t *info, magma_queue_t queue ); + // CUDA MAGMA only magma_int_t magma_zgesv_rbt( @@ -533,6 +564,26 @@ magma_zgesv_rbt( magmaDoubleComplex *B, magma_int_t ldb, magma_int_t *info); +// CUDA MAGMA only +magma_int_t +magma_zgesv_rbt_async( + const magma_bool_t refine, const magma_int_t n, const magma_int_t nrhs, + const magmaDoubleComplex *const dA, const magma_int_t lda, + magmaDoubleComplex *const dB, const magma_int_t ldb, + magma_int_t *info, + const magma_int_t iter_max, const double bwdmax, + magma_queue_t queue ); + +// CUDA MAGMA only +magma_int_t +magma_zgesv_rbt_refine_async( + const magma_int_t n, const magma_int_t nrhs, + const magmaDoubleComplex *const dA_, const magma_int_t lda, + magmaDoubleComplex *const dB_, const magma_int_t ldb, + magma_int_t *info, + const magma_int_t iter_max, const double bwdmax, + magma_queue_t queue); + magma_int_t magma_zgesvd( magma_vec_t jobu, magma_vec_t jobvt, magma_int_t m, magma_int_t n, @@ -676,6 +727,13 @@ magma_zgetrf_nopiv_gpu( magmaDoubleComplex_ptr dA, magma_int_t ldda, magma_int_t *info); +magma_int_t +magma_zgetrf_nopiv_gpu_async( + magma_int_t m, magma_int_t n, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magma_int_t *info, + magma_queue_t queue); + magma_int_t magma_zgetri_gpu( magma_int_t n, @@ -721,6 +779,13 @@ magma_zgetrs_nopiv_gpu( magmaDoubleComplex_ptr dB, magma_int_t lddb, magma_int_t *info); +magma_int_t +magma_zgetrs_nopiv_gpu_async( + magma_trans_t trans, magma_int_t n, magma_int_t nrhs, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + 
magma_int_t *info, magma_queue_t queue); + // ------------------------------------------------------------ zhe routines magma_int_t magma_zheevd( diff --git a/include/magmablas_z.h b/include/magmablas_z.h index 8a10d4622..d8e0bf029 100644 --- a/include/magmablas_z.h +++ b/include/magmablas_z.h @@ -496,6 +496,13 @@ magmablas_ztrtri_diag( magmaDoubleComplex_ptr d_dinvA, magma_queue_t queue ); +void +magmablas_ztrtri_diag_async( + magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, + magmaDoubleComplex_const_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr d_dinvA, + magma_queue_t queue ); + /* * to cleanup (alphabetical order) */ @@ -757,6 +764,15 @@ magmablas_ztrsm( magmaDoubleComplex_ptr dB, magma_int_t lddb, magma_queue_t queue ); +void +magmablas_ztrsm_async( + magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, + magma_int_t m, magma_int_t n, + magmaDoubleComplex alpha, + magmaDoubleComplex_const_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magma_queue_t queue ); + void magmablas_ztrsm_outofplace( magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, diff --git a/interface_cuda/alloc.cpp b/interface_cuda/alloc.cpp index d7e178baa..da30b1143 100644 --- a/interface_cuda/alloc.cpp +++ b/interface_cuda/alloc.cpp @@ -23,6 +23,7 @@ #include "error.h" //#ifdef MAGMA_HAVE_CUDA +#define NOPINNED_ALLOC // Fix for multi GPU systems running jemalloc, where cudaHostAlloc and cudaHostRegister kill parallelism across multiple threads using multiple GPUs #ifdef DEBUG_MEMORY @@ -77,6 +78,55 @@ magma_malloc( magma_ptr* ptrPtr, size_t size ) } +/***************************************************************************//** + Asynchronously allocates memory on the GPU, ordered on the CUDA stream of the given queue. + Use magma_free_async() to free this memory. + + @param[out] + ptrPtr On output, set to the pointer that was allocated. + NULL on failure. + + @param[in] + size Size in bytes to allocate. 
If size = 0, allocates some minimal size. + + @param[in] + queue Magma queue whose CUDA stream is used to put the cudaMalloc on. + + @return MAGMA_SUCCESS + @return MAGMA_ERR_DEVICE_ALLOC on failure + + Type-safe versions avoid the need for a (void**) cast and explicit sizeof. + @see magma_smalloc_async + @see magma_dmalloc_async + @see magma_cmalloc_async + @see magma_zmalloc_async + @see magma_imalloc_async + @see magma_index_malloc_async + @see magma_uindex_malloc_async + + @ingroup magma_malloc + +*******************************************************************************/ +extern "C" magma_int_t +magma_malloc_async( magma_ptr* ptrPtr, size_t size, magma_queue_t queue) +{ + // malloc and free sometimes don't work for size=0, so allocate some minimal size + if ( size == 0 ) + size = sizeof(magmaDoubleComplex); + if ( cudaSuccess != cudaMallocAsync( ptrPtr, size, queue->cuda_stream() )) { + return MAGMA_ERR_DEVICE_ALLOC; + } + + #ifdef DEBUG_MEMORY + g_pointers_mutex.lock(); + g_pointers_dev[ *ptrPtr ] = size; + g_pointers_mutex.unlock(); + #endif + + return MAGMA_SUCCESS; +} + + /***************************************************************************//** @fn magma_free( ptr ) Frees GPU memory previously allocated by magma_malloc(). @param[in] ptr Pointer to free. @@ -114,6 +164,45 @@ magma_free_internal( magma_ptr ptr, } +/***************************************************************************//** + @fn magma_free_async( ptr, queue ) + + Frees GPU memory previously allocated by magma_malloc_async(). + + @param[in] + ptr Pointer to free. + @param[in] + queue MAGMA queue to use the CUDA stream to free on. 
+ + @return MAGMA_SUCCESS + @return MAGMA_ERR_INVALID_PTR on failure + + @ingroup magma_malloc +*******************************************************************************/ +extern "C" magma_int_t +magma_free_internal_async( magma_ptr ptr, + const char* func, const char* file, int line, magma_queue_t queue ) +{ + #ifdef DEBUG_MEMORY + g_pointers_mutex.lock(); + if ( ptr != NULL && g_pointers_dev.count( ptr ) == 0 ) { + fprintf( stderr, "magma_free( %p ) that wasn't allocated with magma_malloc.\n", ptr ); + } + else { + g_pointers_dev.erase( ptr ); + } + g_pointers_mutex.unlock(); + #endif + + cudaError_t err = cudaFreeAsync( ptr, queue->cuda_stream() ); + check_xerror( err, func, file, line ); + if ( err != cudaSuccess ) { + return MAGMA_ERR_INVALID_PTR; + } + return MAGMA_SUCCESS; +} + + /***************************************************************************//** Allocate size bytes on CPU. The purpose of using this instead of malloc is to properly align arrays @@ -242,10 +331,15 @@ magma_free_cpu( void* ptr ) extern "C" magma_int_t magma_malloc_pinned( void** ptrPtr, size_t size ) { + #ifdef NOPINNED_ALLOC + return magma_malloc_cpu( ptrPtr, size ); + #endif + // malloc and free sometimes don't work for size=0, so allocate some minimal size // (for pinned memory, the error is detected in free) if ( size == 0 ) size = sizeof(magmaDoubleComplex); + if ( cudaSuccess != cudaHostAlloc( ptrPtr, size, cudaHostAllocPortable )) { return MAGMA_ERR_HOST_ALLOC; } @@ -277,6 +371,10 @@ extern "C" magma_int_t magma_free_pinned_internal( void* ptr, const char* func, const char* file, int line ) { + #ifdef NOPINNED_ALLOC + return magma_free_cpu( ptr ); + #endif + #ifdef DEBUG_MEMORY g_pointers_mutex.lock(); if ( ptr != NULL && g_pointers_pin.count( ptr ) == 0 ) { @@ -293,6 +391,7 @@ magma_free_pinned_internal( void* ptr, if ( cudaSuccess != err ) { return MAGMA_ERR_INVALID_PTR; } + return MAGMA_SUCCESS; } @@ -316,16 +415,37 @@ magma_mem_info(size_t * freeMem, size_t * 
totalMem) { return MAGMA_SUCCESS; } +/***************************************************************************//** + Sets memory chunk pointed to by ptr of size count to the byte value of value. + @param[in] + ptr Address of the chunk of GPU memory to set. + + @param[in] + count Number of bytes to set starting at ptr. + + @return CUDA_SUCCESS + @return CUDA_ERROR_* code on failure + +*******************************************************************************/ extern "C" magma_int_t magma_memset(void * ptr, int value, size_t count) { return cudaMemset(ptr, value, count); } +/***************************************************************************//** + + @copydoc magma_memset + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. queue != nullptr + +*******************************************************************************/ extern "C" magma_int_t magma_memset_async(void * ptr, int value, size_t count, magma_queue_t queue) { #ifdef MAGMA_HAVE_CUDA -// return cudaMemsetAsync(ptr, value, count, queue); return cudaMemsetAsync(ptr, value, count, queue->cuda_stream()); #elif defined(MAGMA_HAVE_HIP) return hipMemsetAsync(ptr, value, count, queue->hip_stream()); diff --git a/magmablas/ztrsm.cu b/magmablas/ztrsm.cu index ff63b4471..8474e4aae 100644 --- a/magmablas/ztrsm.cu +++ b/magmablas/ztrsm.cu @@ -380,6 +380,244 @@ void magmablas_ztrsm_outofplace( } } +extern "C" +void magmablas_ztrsm_outofplace_async( + magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, + magma_int_t m, magma_int_t n, + magmaDoubleComplex alpha, + magmaDoubleComplex_const_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magmaDoubleComplex_ptr dX, magma_int_t lddx, + magma_int_t flag, + magmaDoubleComplex_ptr d_dinvA, magma_int_t dinvA_length, + magma_queue_t queue ) +{ + #define dA(i_, j_) (dA + (i_) + (j_)*ldda) + #define dB(i_, j_) 
(dB + (i_) + (j_)*lddb) + #define dX(i_, j_) (dX + (i_) + (j_)*lddx) + #define d_dinvA(i_) (d_dinvA + (i_)*NB) + + const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; + const magmaDoubleComplex c_one = MAGMA_Z_ONE; + const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; + + magma_int_t i, jb; + magma_int_t nrowA = (side == MagmaLeft ? m : n); + + magma_int_t min_dinvA_length; + if ( side == MagmaLeft ) { + min_dinvA_length = magma_roundup( m, NB )*NB; + } + else { + min_dinvA_length = magma_roundup( n, NB )*NB; + } + + magma_int_t info = 0; + if ( side != MagmaLeft && side != MagmaRight ) { + info = -1; + } else if ( uplo != MagmaUpper && uplo != MagmaLower ) { + info = -2; + } else if ( transA != MagmaNoTrans && transA != MagmaTrans && transA != MagmaConjTrans ) { + info = -3; + } else if ( diag != MagmaUnit && diag != MagmaNonUnit ) { + info = -4; + } else if (m < 0) { + info = -5; + } else if (n < 0) { + info = -6; + } else if (dA == NULL) { + info = -8; + } else if (ldda < max(1,nrowA)) { + info = -9; + } else if (dB == NULL) { + info = -10; + } else if (lddb < max(1,m)) { + info = -11; + } else if (dX == NULL) { + info = -12; + } else if (lddx < max(1,m)) { + info = -13; + } else if (d_dinvA == NULL) { + info = -15; + } else if (dinvA_length < min_dinvA_length) { + info = -16; + } + + if (info != 0) { + magma_xerbla( __func__, -(info) ); + return; + } + + // quick return if possible. 
+ if (m == 0 || n == 0) + return; + + if (side == MagmaLeft) { + // invert diagonal blocks + if (flag) + magmablas_ztrtri_diag_async( uplo, diag, m, dA, ldda, d_dinvA, queue ); + + if (transA == MagmaNoTrans) { + if (uplo == MagmaLower) { + // left, lower no-transpose + // handle first block separately with alpha + jb = min(NB, m); + magma_zgemm( MagmaNoTrans, MagmaNoTrans, jb, n, jb, alpha, d_dinvA(0), NB, dB, lddb, c_zero, dX, lddx, queue ); + if (NB < m) { + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m-NB, n, NB, c_neg_one, dA(NB,0), ldda, dX, lddx, alpha, dB(NB,0), lddb, queue ); + + // remaining blocks + for( i=NB; i < m; i += NB ) { + jb = min(m-i, NB); + magma_zgemm( MagmaNoTrans, MagmaNoTrans, jb, n, jb, c_one, d_dinvA(i), NB, dB(i,0), lddb, c_zero, dX(i,0), lddx, queue ); + if (i+NB >= m) + break; + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m-i-NB, n, NB, c_neg_one, dA(i+NB,i), ldda, dX(i,0), lddx, c_one, dB(i+NB,0), lddb, queue ); + } + } + } + else { + // left, upper no-transpose + // handle first block separately with alpha + jb = (m % NB == 0) ? NB : (m % NB); + i = m-jb; + magma_zgemm( MagmaNoTrans, MagmaNoTrans, jb, n, jb, alpha, d_dinvA(i), NB, dB(i,0), lddb, c_zero, dX(i,0), lddx, queue ); + if (i-NB >= 0) { + magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, n, jb, c_neg_one, dA(0,i), ldda, dX(i,0), lddx, alpha, dB, lddb, queue ); + + // remaining blocks + for( i=m-jb-NB; i >= 0; i -= NB ) { + magma_zgemm( MagmaNoTrans, MagmaNoTrans, NB, n, NB, c_one, d_dinvA(i), NB, dB(i,0), lddb, c_zero, dX(i,0), lddx, queue ); + if (i-NB < 0) + break; + magma_zgemm( MagmaNoTrans, MagmaNoTrans, i, n, NB, c_neg_one, dA(0,i), ldda, dX(i,0), lddx, c_one, dB, lddb, queue ); + } + } + } + } + else { // transA == MagmaTrans || transA == MagmaConjTrans + if (uplo == MagmaLower) { + // left, lower transpose + // handle first block separately with alpha + jb = (m % NB == 0) ? 
NB : (m % NB); + i = m-jb; + magma_zgemm( transA, MagmaNoTrans, jb, n, jb, alpha, d_dinvA(i), NB, dB(i,0), lddb, c_zero, dX(i,0), lddx, queue ); + if (i-NB >= 0) { + magma_zgemm( transA, MagmaNoTrans, i, n, jb, c_neg_one, dA(i,0), ldda, dX(i,0), lddx, alpha, dB, lddb, queue ); + + // remaining blocks + for( i=m-jb-NB; i >= 0; i -= NB ) { + magma_zgemm( transA, MagmaNoTrans, NB, n, NB, c_one, d_dinvA(i), NB, dB(i,0), lddb, c_zero, dX(i,0), lddx, queue ); + if (i-NB < 0) + break; + magma_zgemm( transA, MagmaNoTrans, i, n, NB, c_neg_one, dA(i,0), ldda, dX(i,0), lddx, c_one, dB, lddb, queue ); + } + } + } + else { + // left, upper transpose + // handle first block separately with alpha + jb = min(NB, m); + magma_zgemm( transA, MagmaNoTrans, jb, n, jb, alpha, d_dinvA(0), NB, dB, lddb, c_zero, dX, lddx, queue ); + if (NB < m) { + magma_zgemm( transA, MagmaNoTrans, m-NB, n, NB, c_neg_one, dA(0,NB), ldda, dX, lddx, alpha, dB(NB,0), lddb, queue ); + + // remaining blocks + for( i=NB; i < m; i += NB ) { + jb = min(m-i, NB); + magma_zgemm( transA, MagmaNoTrans, jb, n, jb, c_one, d_dinvA(i), NB, dB(i,0), lddb, c_zero, dX(i,0), lddx, queue ); + if (i+NB >= m) + break; + magma_zgemm( transA, MagmaNoTrans, m-i-NB, n, NB, c_neg_one, dA(i,i+NB), ldda, dX(i,0), lddx, c_one, dB(i+NB,0), lddb, queue ); + } + } + } + } + } + else { // side == MagmaRight + // invert diagonal blocks + if (flag) + magmablas_ztrtri_diag_async( uplo, diag, n, dA, ldda, d_dinvA, queue ); + + if (transA == MagmaNoTrans) { + if (uplo == MagmaLower) { + // right, lower no-transpose + // handle first block separately with alpha + jb = (n % NB == 0) ? 
NB : (n % NB); + i = n-jb; + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, jb, jb, alpha, dB(0,i), lddb, d_dinvA(i), NB, c_zero, dX(0,i), lddx, queue ); + if (i-NB >= 0) { + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, i, jb, c_neg_one, dX(0,i), lddx, dA(i,0), ldda, alpha, dB, lddb, queue ); + + // remaining blocks + for( i=n-jb-NB; i >= 0; i -= NB ) { + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, NB, NB, c_one, dB(0,i), lddb, d_dinvA(i), NB, c_zero, dX(0,i), lddx, queue ); + if (i-NB < 0) + break; + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, i, NB, c_neg_one, dX(0,i), lddx, dA(i,0), ldda, c_one, dB, lddb, queue ); + } + } + } + else { + // right, upper no-transpose + // handle first block separately with alpha + jb = min(NB, n); + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, jb, jb, alpha, dB, lddb, d_dinvA(0), NB, c_zero, dX, lddx, queue ); + if (NB < n) { + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n-NB, NB, c_neg_one, dX, lddx, dA(0,NB), ldda, alpha, dB(0,NB), lddb, queue ); + + // remaining blocks + for( i=NB; i < n; i += NB ) { + jb = min(NB, n-i); + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, jb, jb, c_one, dB(0,i), lddb, d_dinvA(i), NB, c_zero, dX(0,i), lddx, queue ); + if (i+NB >= n) + break; + magma_zgemm( MagmaNoTrans, MagmaNoTrans, m, n-i-NB, NB, c_neg_one, dX(0,i), lddx, dA(i,i+NB), ldda, c_one, dB(0,i+NB), lddb, queue ); + } + } + } + } + else { // transA == MagmaTrans || transA == MagmaConjTrans + if (uplo == MagmaLower) { + // right, lower transpose + // handle first block separately with alpha + jb = min(NB, n); + magma_zgemm( MagmaNoTrans, transA, m, jb, jb, alpha, dB, lddb, d_dinvA(0), NB, c_zero, dX, lddx, queue ); + if (NB < n) { + magma_zgemm( MagmaNoTrans, transA, m, n-NB, NB, c_neg_one, dX, lddx, dA(NB,0), ldda, alpha, dB(0,NB), lddb, queue ); + + // remaining blocks + for( i=NB; i < n; i += NB ) { + jb = min(NB, n-i); + magma_zgemm( MagmaNoTrans, transA, m, jb, jb, c_one, dB(0,i), lddb, d_dinvA(i), NB, c_zero, dX(0,i), lddx, queue ); + if 
(i+NB >= n) + break; + magma_zgemm( MagmaNoTrans, transA, m, n-i-NB, NB, c_neg_one, dX(0,i), lddx, dA(NB+i,i), ldda, c_one, dB(0,i+NB), lddb, queue ); + } + } + } + else { + // right, upper transpose + // handle first block separately with alpha + jb = (n % NB == 0) ? NB : (n % NB); + i = n-jb; + magma_zgemm( MagmaNoTrans, transA, m, jb, jb, alpha, dB(0,i), lddb, d_dinvA(i), NB, c_zero, dX(0,i), lddx, queue ); + if (i-NB >= 0) { + magma_zgemm( MagmaNoTrans, transA, m, i, jb, c_neg_one, dX(0,i), lddx, dA(0,i), ldda, alpha, dB, lddb, queue ); + + // remaining blocks + for( i=n-jb-NB; i >= 0; i -= NB ) { + magma_zgemm( MagmaNoTrans, transA, m, NB, NB, c_one, dB(0,i), lddb, d_dinvA(i), NB, c_zero, dX(0,i), lddx, queue ); + if (i-NB < 0) + break; + magma_zgemm( MagmaNoTrans, transA, m, i, NB, c_neg_one, dX(0,i), lddx, dA(0,i), ldda, c_one, dB, lddb, queue ); + } + } + } + } + } +} + /***************************************************************************//** Similar to magmablas_ztrsm_outofplace(), but copies result dX back to dB, @@ -407,6 +645,25 @@ void magmablas_ztrsm_work( magmablas_zlacpy( MagmaFull, m, n, dX, lddx, dB, lddb, queue ); } +extern "C" +void magmablas_ztrsm_work_async( + magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, + magma_int_t m, magma_int_t n, + magmaDoubleComplex alpha, + magmaDoubleComplex_const_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magmaDoubleComplex_ptr dX, magma_int_t lddx, + magma_int_t flag, + magmaDoubleComplex_ptr d_dinvA, magma_int_t dinvA_length, + magma_queue_t queue ) +{ + magmablas_ztrsm_outofplace_async( side, uplo, transA, diag, m, n, alpha, + dA, ldda, dB, lddb, dX, lddx, flag, + d_dinvA, dinvA_length, queue ); + // copy X to B + magmablas_zlacpy( MagmaFull, m, n, dX, lddx, dB, lddb, queue ); +} + /***************************************************************************//** Similar to magmablas_ztrsm_outofplace(), but allocates dX and d_dinvA @@ -484,3 
+741,82 @@ void magmablas_ztrsm( magma_free( d_dinvA ); magma_free( dX ); } + +/***************************************************************************//** + + @copydoc magmablas_ztrsm + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. queue != nullptr + +*******************************************************************************/ +extern "C" +void magmablas_ztrsm_async( + magma_side_t side, magma_uplo_t uplo, magma_trans_t transA, magma_diag_t diag, + magma_int_t m, magma_int_t n, + magmaDoubleComplex alpha, + magmaDoubleComplex_const_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magma_queue_t queue ) +{ + const magma_int_t nrowA = (side == MagmaLeft ? m : n); + + magma_int_t info = 0; + if ( side != MagmaLeft && side != MagmaRight ) { + info = -1; + } else if ( uplo != MagmaUpper && uplo != MagmaLower ) { + info = -2; + } else if ( transA != MagmaNoTrans && transA != MagmaTrans && transA != MagmaConjTrans ) { + info = -3; + } else if ( diag != MagmaUnit && diag != MagmaNonUnit ) { + info = -4; + } else if (m < 0) { + info = -5; + } else if (n < 0) { + info = -6; + } else if (dA == NULL) { + info = -8; + } else if (ldda < max(1,nrowA)) { + info = -9; + } else if (dB == NULL) { + info = -10; + } else if (lddb < max(1,m)) { + info = -11; + } + + if (info != 0) { + magma_xerbla( __func__, -(info) ); + return; + } + + magmaDoubleComplex_ptr d_dinvA=NULL, dX=NULL; + const magma_int_t lddx = magma_roundup( m, 32 ); + const magma_int_t size_x = lddx*n; + magma_int_t dinvA_length; + if ( side == MagmaLeft ) { + dinvA_length = magma_roundup( m, NB )*NB; + } + else { + dinvA_length = magma_roundup( n, NB )*NB; + } + + magma_zmalloc_async( &d_dinvA, dinvA_length, queue ); + magma_zmalloc_async( &dX, size_x, queue ); + + if ( d_dinvA == NULL || dX == NULL ) { + info = MAGMA_ERR_DEVICE_ALLOC; + magma_xerbla( __func__, -(info) ); + // 
continue to free + } + else { + magmablas_zlaset( MagmaFull, dinvA_length, 1, MAGMA_Z_ZERO, MAGMA_Z_ZERO, d_dinvA, dinvA_length, queue ); + magmablas_zlaset( MagmaFull, m, n, MAGMA_Z_ZERO, MAGMA_Z_ZERO, dX, lddx, queue ); + magmablas_ztrsm_work_async( side, uplo, transA, diag, m, n, alpha, + dA, ldda, dB, lddb, dX, lddx, 1, d_dinvA, dinvA_length, queue ); + } + + magma_free_async( d_dinvA, queue ); + magma_free_async( dX, queue ); +} diff --git a/magmablas/ztrtri_diag.cu b/magmablas/ztrtri_diag.cu index 89f057971..8207b4dba 100644 --- a/magmablas/ztrtri_diag.cu +++ b/magmablas/ztrtri_diag.cu @@ -178,3 +178,116 @@ magmablas_ztrtri_diag( } } } + + +/***************************************************************************//** + + @copydoc magmablas_ztrtri_diag + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. queue != nullptr + +*******************************************************************************/ +extern "C" void +magmablas_ztrtri_diag_async( + magma_uplo_t uplo, magma_diag_t diag, magma_int_t n, + magmaDoubleComplex_const_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr d_dinvA, + magma_queue_t queue) +{ + magma_int_t info = 0; + if (uplo != MagmaLower && uplo != MagmaUpper) + info = -1; + else if (diag != MagmaNonUnit && diag != MagmaUnit) + info = -2; + else if (n < 0) + info = -3; + else if (ldda < n) + info = -5; + + if (info != 0) { + magma_xerbla( __func__, -(info) ); + return; //info + } + + const int nblocks = magma_ceildiv( n, IB ); + + cudaMemsetAsync( d_dinvA, 0, magma_roundup( n, NB )*NB * sizeof(magmaDoubleComplex), queue->cuda_stream() ); + + if ( uplo == MagmaLower ) { + // invert diagonal IB x IB inner blocks + ztrtri_diag_lower_kernel + <<< nblocks, IB, 0, queue->cuda_stream() >>> + ( diag, n, dA, ldda, d_dinvA ); + + // build up NB x NB blocks (assuming IB=16 here): + // use 16 x 16 blocks to build 32 x 32 blocks, 1 x (1 x 
npages) grid, 4 x 4 threads; + // then 32 x 32 blocks to build 64 x 64 blocks, 1 x (2 x npages) grid, 8 x 4 threads; + // then 64 x 64 blocks to build 128 x 128 blocks, 1 x (4 x npages) grid, 16 x 4 threads; + // then 128 x 128 blocks to build 256 x 256 blocks, 2 x (8 x npages) grid, 16 x 4 threads. + for( int jb=IB; jb < NB; jb *= 2 ) { + const int kb = jb*2; + const int npages = magma_ceildiv( n, kb ); + const dim3 threads( (jb <= 32 ? jb/4 : 16), 4 ); + const dim3 grid( jb/(threads.x*threads.y), npages*(jb/16) ); // emulate 3D grid: NX * (NY*npages), for CUDA ARCH 1.x + + //printf( "n %d, jb %d, grid %d x %d (%d x %d)\n", n, jb, grid.x, grid.y, grid.y / npages, npages ); + switch (jb) { + case 16: + triple_zgemm16_part1_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm16_part2_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + case 32: + triple_zgemm32_part1_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm32_part2_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + case 64: + triple_zgemm64_part1_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm64_part2_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + default: + triple_zgemm_above64_part1_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm_above64_part2_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm_above64_part3_lower_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + } + if ( kb >= n ) break; + } + } + else { + ztrtri_diag_upper_kernel + <<< nblocks, IB, 0, queue->cuda_stream() >>> + ( diag, n, 
dA, ldda, d_dinvA ); + + // update the inverse up to the size of IB + for( int jb=IB; jb < NB; jb *= 2 ) { + const int kb = jb*2; + const int npages = magma_ceildiv( n, kb ); + const dim3 threads( (jb <= 32 ? jb/4 : 16), 4 ); + const dim3 grid( jb/(threads.x*threads.y), npages*(jb/16) ); // emulate 3D grid: NX * (NY*npages), for CUDA ARCH 1.x + + switch (jb) { + case 16: + triple_zgemm16_part1_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm16_part2_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + case 32: + triple_zgemm32_part1_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm32_part2_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + case 64: + triple_zgemm64_part1_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm64_part2_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + default: + triple_zgemm_above64_part1_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm_above64_part2_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + triple_zgemm_above64_part3_upper_kernel<<< grid, threads, 0, queue->cuda_stream() >>>( n, dA, ldda, d_dinvA, jb, npages ); + break; + } + if ( kb >= n ) break; + } + } +} diff --git a/src/zgerbt_gpu.cpp b/src/zgerbt_gpu.cpp index 6c0e8f405..44a4ee40a 100644 --- a/src/zgerbt_gpu.cpp +++ b/src/zgerbt_gpu.cpp @@ -169,3 +169,77 @@ magma_zgerbt_gpu( return *info; } + +/***************************************************************************//** + + @copydoc magma_zgerbt_gpu + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this 
method, and all methods it calls. queue != nullptr + +*******************************************************************************/ +extern "C" magma_int_t +magma_zgerbt_gpu_async( + const magma_bool_t gen, const magma_int_t n, const magma_int_t nrhs, + magmaDoubleComplex_ptr const dA, magma_int_t const ldda, + magmaDoubleComplex_ptr const dB, magma_int_t const lddb, + magmaDoubleComplex_ptr const dU, magmaDoubleComplex_ptr const dV, + magma_int_t *info, + magma_queue_t queue) +{ +#define dB(i_, j_) (dB + (i_) + (j_)*lddb) + + /* Function Body */ + *info = 0; + if ( ! (gen == MagmaTrue) && + ! (gen == MagmaFalse) ) { + *info = -1; + } + else if (n < 0) { + *info = -2; + } else if (nrhs < 0) { + *info = -3; + } else if (ldda < max(1,n)) { + *info = -5; + } else if (lddb < max(1,n)) { + *info = -7; + } + if (*info != 0) { + magma_xerbla( __func__, -(*info) ); + return *info; + } + + /* Quick return if possible */ + if (nrhs == 0 || n == 0) + return *info; + + magmaDoubleComplex *U = NULL, *V = NULL; + if (MAGMA_SUCCESS != magma_zmalloc_cpu( &U, 2*n ) || + MAGMA_SUCCESS != magma_zmalloc_cpu( &V, 2*n )) + { + *info = MAGMA_ERR_HOST_ALLOC; + goto cleanup; + } + + /* Initialize Butterfly matrix on the CPU */ + if (gen == MagmaTrue) + init_butterfly( 2*n, U, V ); + + /* Copy the butterfly to the GPU */ + magma_zsetvector_async( 2*n, U, 1, dU, 1, queue ); + magma_zsetvector_async( 2*n, V, 1, dV, 1, queue ); + + /* Perform Partial Random Butterfly Transformation on the GPU */ + magmablas_zprbt( n, dA, ldda, dU, dV, queue ); + + /* Compute U^T * b on the GPU*/ + magmablas_zprbt_mtv(n, nrhs, dU, dB, lddb, queue); + + cleanup: + magma_free_cpu( U ); + magma_free_cpu( V ); + + return *info; +} diff --git a/src/zgerfs_nopiv_gpu.cpp b/src/zgerfs_nopiv_gpu.cpp index 2c2f9c04a..0a2e0152b 100644 --- a/src/zgerfs_nopiv_gpu.cpp +++ b/src/zgerfs_nopiv_gpu.cpp @@ -8,6 +8,7 @@ @precisions normal z -> s d c */ +#include <limits> #include "magma_internal.h" #define BWDMAX 1.0 @@ -275,3 +276,212 @@
magma_zgerfs_nopiv_gpu( return *info; } + +/***************************************************************************//** + + @copydoc magma_zgerfs_nopiv_gpu + + @param[in] + iter_max INTEGER + The maximum number of refinement iterations performed. + + @param[in] + bwdmax DOUBLE + Refine the solution if the error is above the threshold multiplied by + bwdmax. See @magma_zgerfs_nopiv_gpu for how error threshold is calculated. + Set to zero to perform all iterations specified in iter_max. + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. queue != nullptr + +*******************************************************************************/ +extern "C" magma_int_t +magma_zgerfs_nopiv_gpu_async( + magma_trans_t trans, magma_int_t n, magma_int_t nrhs, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magmaDoubleComplex_ptr dX, magma_int_t lddx, + magmaDoubleComplex_ptr dworkd, magmaDoubleComplex_ptr dAF, + magma_int_t *iter, + magma_int_t *info, + magma_int_t iter_max, + double bwdmax, + magma_queue_t queue) +{ +#define dB(i,j) (dB + (i) + (j)*lddb) +#define dX(i,j) (dX + (i) + (j)*lddx) +#define dR(i,j) (dR + (i) + (j)*lddr) + + /* Constants */ + const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; + const magmaDoubleComplex c_one = MAGMA_Z_ONE; + constexpr magma_int_t ione = 1; + const auto n_elem = n * nrhs; + + /* Local variables */ + magmaDoubleComplex_ptr dR; + magmaDoubleComplex Xnrmv, Rnrmv; + double Anrm, Xnrm, Rnrm, cte, eps, work[1]; + double sae, best_sae = std::numeric_limits<double>::infinity(); + magma_int_t i, j, iiter, lddsa, lddr; + magmaDoubleComplex_ptr best_dX = nullptr; + + /* Check arguments */ + *iter = 0; + *info = 0; + if ( n < 0 ) + *info = -1; + else if ( nrhs < 0 ) + *info = -2; + else if ( ldda < max(1,n)) + *info = -4; + else if ( lddb < max(1,n)) + *info = -8; + else if ( lddx < max(1,n)) + *info 
= -10; + + if (*info != 0) { + magma_xerbla( __func__, -(*info) ); + return *info; + } + + if ( n == 0 || nrhs == 0 ) + return *info; + + lddsa = n; + lddr = n; + + dR = dworkd; + + eps = lapackf77_dlamch("Epsilon"); + if ( bwdmax == 0 ) { + Anrm = cte = 0; + } else { + Anrm = magmablas_zlange(MagmaInfNorm, n, n, dA, ldda, (magmaDouble_ptr) dworkd, n_elem, queue); + cte = Anrm * eps * magma_dsqrt((double) n) * bwdmax; + } + + // residual dR = dB - dA*dX in double precision + magmablas_zlacpy( MagmaFull, n, nrhs, dB, lddb, dR, lddr, queue ); + if ( nrhs == 1 ) { + magma_zgemv( trans, n, n, + c_neg_one, dA, ldda, + dX, 1, + c_one, dR, 1, queue ); + } + else { + magma_zgemm( trans, MagmaNoTrans, n, nrhs, n, + c_neg_one, dA, ldda, + dX, lddx, + c_one, dR, lddr, queue ); + } + + if ( bwdmax == 0 ) goto refinement; + + // TODO: use MAGMA_Z_ABS( dX(i,j) ) instead of zlange? + // TODO implement Xamax for one GPU copy less + for( j=0; j < nrhs; j++ ) { + i = magma_izamax( n, dX(0,j), 1, queue ) - 1; + magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue ); + Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, work ); + + i = magma_izamax( n, dR(0,j), 1, queue ) - 1; + magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue ); + Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, work ); + //printf("Rnrm : %e, Xnrm*cte : %e\n", Rnrm, Xnrm*cte); + if ( Rnrm > Xnrm*cte ) { + goto refinement; + } + } + + *iter = 0; + goto cleanup; + + refinement: + magma_zmalloc_async( &best_dX, n_elem, queue ); + for( iiter=1; iiter < iter_max; ) { + *info = 0; + // solve dAF*dX = dR + // it's okay that dR is used for both dB input and dX output. + magma_zgetrs_nopiv_gpu_async( trans, n, nrhs, dAF, lddsa, dR, lddr, info, queue ); + if (*info != 0) { + *iter = -3; + goto fallback; + } + + // Add correction and setup residual + // dX += dR --and-- + // dR = dB + // This saves going through dR a second time (if done with one more kernel). 
+ // -- not really: first time is read, second time is write. + for( j=0; j < nrhs; ++j ) { + magmablas_zaxpycp( n, dR(0,j), dX(0,j), dB(0,j), queue ); + } + + // residual dR = dB - dA*dX in double precision + if ( nrhs == 1 ) { + magma_zgemv( trans, n, n, + c_neg_one, dA, ldda, + dX, 1, + c_one, dR, 1, queue ); + } + else { + magma_zgemm( trans, MagmaNoTrans, n, nrhs, n, + c_neg_one, dA, ldda, + dX, lddx, + c_one, dR, lddr, queue ); + } + + /* Sum of absolute error is compared between each iteration + * and the solution with best residuals copied on the side. */ + sae = magma_dzasum(n_elem, dR, 1, queue); + if (sae < best_sae) { + best_sae = sae; + magma_zcopymatrix_async(n, nrhs, dX, n, best_dX, n, queue); + } + if (bwdmax == 0) goto L20; + + /* Check whether the nrhs normwise backward errors satisfy the + * stopping criterion. If yes, set ITER=IITER > 0 and return. */ + for( j=0; j < nrhs; ++j ) { + i = magma_izamax( n, dX(0,j), 1, queue ) - 1; + magma_zgetmatrix( 1, 1, dX(i,j), 1, &Xnrmv, 1, queue ); + Xnrm = lapackf77_zlange( "F", &ione, &ione, &Xnrmv, &ione, work ); + + i = magma_izamax( n, dR(0,j), 1, queue ) - 1; + magma_zgetmatrix( 1, 1, dR(i,j), 1, &Rnrmv, 1, queue ); + Rnrm = lapackf77_zlange( "F", &ione, &ione, &Rnrmv, &ione, work ); + + if ( bwdmax == 0 || Rnrm > Xnrm*cte ) { + goto L20; + } + } + + /* If we are here, the nrhs normwise backward errors satisfy + * the stopping criterion, we are good to exit. */ + *iter = iiter; + goto cleanup; + + L20: + ++iiter; + } + /* If we are at this place of the code, this is because we have + * performed ITER=iter_max iterations and never satisified the + * stopping criterion. Set up the ITER flag accordingly. */ + *iter = -iter_max - 1; + + fallback: + /* Iterative refinement failed to converge to a + * satisfactory solution. 
*/ + + cleanup: + + if (best_dX) { + magma_zcopymatrix_async( n, nrhs, best_dX, n, dX, n, queue ); + magma_free_async( best_dX, queue ); + } + + return *info; +} diff --git a/src/zgesv_nopiv_gpu.cpp b/src/zgesv_nopiv_gpu.cpp index c142b7092..ba389c2ee 100644 --- a/src/zgesv_nopiv_gpu.cpp +++ b/src/zgesv_nopiv_gpu.cpp @@ -93,3 +93,48 @@ magma_zgesv_nopiv_gpu( return *info; } + +/***************************************************************************//** + + @copydoc magma_zgesv_nopiv_gpu + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. queue != nullptr + +*******************************************************************************/ +extern "C" magma_int_t +magma_zgesv_nopiv_gpu_async( + magma_int_t n, magma_int_t nrhs, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magma_int_t *info, magma_queue_t queue ) +{ + *info = 0; + if (n < 0) { + *info = -1; + } else if (nrhs < 0) { + *info = -2; + } else if (ldda < max(1,n)) { + *info = -4; + } else if (lddb < max(1,n)) { + *info = -6; + } + if (*info != 0) { + magma_xerbla( __func__, -(*info) ); + return *info; + } + + /* Quick return if possible */ + if (n == 0 || nrhs == 0) { + return *info; + } + + magma_zgetrf_nopiv_gpu_async( n, n, dA, ldda, info, queue ); + if ( *info == MAGMA_SUCCESS ) { + magma_zgetrs_nopiv_gpu_async( MagmaNoTrans, n, nrhs, dA, ldda, dB, lddb, info, queue ); + } + + return *info; +} diff --git a/src/zgesv_rbt.cpp b/src/zgesv_rbt.cpp index a5cd7620d..61055e814 100644 --- a/src/zgesv_rbt.cpp +++ b/src/zgesv_rbt.cpp @@ -201,3 +201,221 @@ magma_zgesv_rbt( return *info; } + +/***************************************************************************//** + + @copydoc magma_zgetrf_nopiv_gpu + + @param[in] + iter_max INTEGER + The maximum number of refinement iterations performed. 
+ + @param[in] + bwdmax DOUBLE + Refine the solution if the error is above the threshold multiplied by + bwdmax. See @magma_zgerfs_nopiv_gpu for how error threshold is calculated. + Set to zero to perform all iterations specified in iter_max. + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. queue != nullptr + +*******************************************************************************/ +extern "C" magma_int_t +magma_zgesv_rbt_async( + magma_bool_t refine, const magma_int_t n, const magma_int_t nrhs, + const magmaDoubleComplex *const dA_, const magma_int_t lda, + magmaDoubleComplex *const dB_, const magma_int_t ldb, + magma_int_t *const info, + const magma_int_t iter_max, const double bwdmax, + magma_queue_t queue) +{ + /* Constants */ + const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; + const magmaDoubleComplex c_one = MAGMA_Z_ONE; + + /* Local variables */ + magma_int_t nn = n;//magma_roundup( n, 4 ); + magmaDoubleComplex_ptr dA=NULL, dB=NULL, dAo=NULL, dBo=NULL, dwork=NULL, dU=NULL, dV=NULL; + magma_int_t iter; + + /* Function Body */ + *info = 0; + if ( ! (refine == MagmaTrue) && + ! 
(refine == MagmaFalse) ) { + *info = -1; + } + else if (n < 0) { + *info = -2; + } else if (nrhs < 0) { + *info = -3; + } else if (lda < max(1,n)) { + *info = -5; + } else if (ldb < max(1,n)) { + *info = -7; + } + if (*info != 0) { + magma_xerbla( __func__, -(*info) ); + return *info; + } + + /* Quick return if possible */ + if (nrhs == 0 || n == 0) + return *info; + + // TODO: investigate failures on AMD GPUs + // For now ignore refine and always set it to False + // there is probably a bug in the refinement code for the HIP backend + #ifdef MAGMA_HAVE_HIP + refine = MagmaFalse; + #endif + + if (MAGMA_SUCCESS != magma_zmalloc_async( &dA, nn*nn, queue ) || + MAGMA_SUCCESS != magma_zmalloc_async( &dB, nn*nrhs, queue )) + { + *info = MAGMA_ERR_DEVICE_ALLOC; + goto cleanup; + } + + /* Allocate memory for the buterfly matrices */ + if ((MAGMA_SUCCESS != magma_zmalloc_async( &dU, 2*n, queue )) || + (MAGMA_SUCCESS != magma_zmalloc_async( &dV, 2*n, queue ))) { + magma_free_async( dU, queue ); + magma_free_async( dV, queue ); + *info = MAGMA_ERR_DEVICE_ALLOC; + return *info; + } + + if (refine == MagmaTrue) { + if (MAGMA_SUCCESS != magma_zmalloc_async( &dAo, nn*nn, queue ) || + MAGMA_SUCCESS != magma_zmalloc_async( &dwork, nn*nrhs, queue ) || + MAGMA_SUCCESS != magma_zmalloc_async( &dBo, nn*nrhs, queue )) + { + *info = MAGMA_ERR_DEVICE_ALLOC; + goto cleanup; + } + } + + magmablas_zlaset( MagmaFull, nn, nn, c_zero, c_one, dA, nn, queue ); + + magma_zcopymatrix_async( n, n, dA_, lda, dA, nn, queue ); + magma_zcopymatrix_async( n, nrhs, dB_, ldb, dB, nn, queue ); + + *info = magma_zgerbt_gpu_async( MagmaTrue, nn, nrhs, dA, nn, dB, nn, dU, dV, info, queue ); + if (*info != MAGMA_SUCCESS) { + return *info; + } + + if (refine == MagmaTrue) { + magma_zcopymatrix_async( nn, nn, dA, nn, dAo, nn, queue ); + magma_zcopymatrix_async( nn, nrhs, dB, nn, dBo, nn, queue ); + } + /* Solve the system U^TAV.y = U^T.b on the GPU */ + magma_zgesv_nopiv_gpu_async( nn, nrhs, dA, nn, dB, nn, 
info, queue ); + + /* Iterative refinement */ + if (refine == MagmaTrue) { + magma_zgerfs_nopiv_gpu_async( MagmaNoTrans, nn, nrhs, dAo, nn, dBo, nn, dB, nn, dwork, dA, &iter, info, iter_max, bwdmax, queue ); + } + + magmablas_zprbt_mv(nn, nrhs, dV, dB, nn, queue); + + magma_zcopymatrix_async( n, nrhs, dB, nn, dB_, ldb, queue ); + + cleanup: + magma_free_async( dA, queue ); + magma_free_async( dB, queue ); + magma_free_async( dU, queue ); + magma_free_async( dV, queue ); + + if (refine == MagmaTrue) { + magma_free_async( dAo, queue ); + magma_free_async( dBo, queue ); + magma_free_async( dwork, queue ); + } + + return *info; +} + +extern "C" magma_int_t +magma_zgesv_rbt_refine_async( + const magma_int_t n, const magma_int_t nrhs, + const magmaDoubleComplex *const dA_, const magma_int_t lda, + magmaDoubleComplex *const dB_, const magma_int_t ldb, + magma_int_t *info, + const magma_int_t iter_max, const double bwdmax, + magma_queue_t queue) +{ + /* Constants */ + const magmaDoubleComplex c_zero = MAGMA_Z_ZERO; + const magmaDoubleComplex c_one = MAGMA_Z_ONE; + + /* Local variables */ + magma_int_t nn = n;//magma_roundup( n, 4 ); + magmaDoubleComplex_ptr dA=NULL, dB=NULL, dAo=NULL, dBo=NULL, dwork=NULL; + magma_int_t iter; + + /* Function Body */ + *info = 0; + if (n < 0) { + *info = -2; + } else if (nrhs < 0) { + *info = -3; + } else if (lda < max(1,n)) { + *info = -5; + } else if (ldb < max(1,n)) { + *info = -7; + } + if (*info != 0) { + magma_xerbla( __func__, -(*info) ); + return *info; + } + + /* Quick return if possible */ + if (nrhs == 0 || n == 0) + return *info; + + // TODO: investigate failures on AMD GPUs + // For now ignore refine and always set it to False + // there is probably a bug in the refinement code for the HIP backend + #ifdef MAGMA_HAVE_HIP + *info = -8; + return *info; + #endif + + if (MAGMA_SUCCESS != magma_zmalloc_async( &dA, nn*nn, queue ) || + MAGMA_SUCCESS != magma_zmalloc_async( &dB, nn*nrhs, queue )) + { + *info = MAGMA_ERR_DEVICE_ALLOC; + 
goto cleanup; + } + + if (MAGMA_SUCCESS != magma_zmalloc_async( &dAo, nn*nn, queue ) || + MAGMA_SUCCESS != magma_zmalloc_async( &dwork, nn*nrhs, queue ) || + MAGMA_SUCCESS != magma_zmalloc_async( &dBo, nn*nrhs, queue )) + { + *info = MAGMA_ERR_DEVICE_ALLOC; + goto cleanup; + } + + magmablas_zlaset( MagmaFull, nn, nn, c_zero, c_one, dA, nn, queue ); + + magma_zcopymatrix_async( n, n, dA_, lda, dA, nn, queue ); + magma_zcopymatrix_async( nn, nn, dA_, nn, dAo, nn, queue ); + magma_zcopymatrix_async( n, nrhs, dB_, ldb, dB, nn, queue ); + magma_zcopymatrix_async( nn, nrhs, dB_, nn, dBo, nn, queue ); + + magma_zgerfs_nopiv_gpu_async( MagmaNoTrans, nn, nrhs, dAo, nn, dBo, nn, dB, nn, dwork, dA, &iter, info, iter_max, bwdmax, queue ); + + magma_zcopymatrix_async( n, nrhs, dB, nn, dB_, ldb, queue ); + + cleanup: + magma_free_async( dA, queue ); + magma_free_async( dB, queue ); + magma_free_async( dAo, queue ); + magma_free_async( dBo, queue ); + magma_free_async( dwork, queue ); + + return *info; +} diff --git a/src/zgetrf_nopiv_gpu.cpp b/src/zgetrf_nopiv_gpu.cpp index 472491e2f..7d20f3264 100644 --- a/src/zgetrf_nopiv_gpu.cpp +++ b/src/zgetrf_nopiv_gpu.cpp @@ -209,3 +209,164 @@ magma_zgetrf_nopiv_gpu( return *info; } /* magma_zgetrf_nopiv_gpu */ + + +/***************************************************************************//** + + @copydoc magma_zgetrf_nopiv_gpu + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. 
queue != nullptr + +*******************************************************************************/ +extern "C" magma_int_t +magma_zgetrf_nopiv_gpu_async( + magma_int_t m, magma_int_t n, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magma_int_t *info, + magma_queue_t queue) +{ +#ifdef MAGMA_HAVE_OPENCL +#define dA(i_, j_) dA, (dA_offset + (i_)*nb + (j_)*nb*ldda) +#else +#define dA(i_, j_) (dA + (i_)*nb + (j_)*nb*ldda) +#endif + + const magmaDoubleComplex c_one = MAGMA_Z_ONE; + const magmaDoubleComplex c_neg_one = MAGMA_Z_NEG_ONE; + + magma_int_t iinfo, nb; + magma_int_t maxm, mindim; + magma_int_t j, rows, s, ldwork; + magmaDoubleComplex *work; + + /* Check arguments */ + *info = 0; + if (m < 0) + *info = -1; + else if (n < 0) + *info = -2; + else if (ldda < max(1,m)) + *info = -4; + + if (*info != 0) { + magma_xerbla( __func__, -(*info) ); + return *info; + } + + /* Quick return if possible */ + if (m == 0 || n == 0) + return *info; + + /* Function Body */ + mindim = min( m, n ); + nb = magma_get_zgetrf_nb( m, n ); + s = mindim / nb; + + magma_queue_t queues[2]; + queues[0] = queue; + magma_queue_create( queue->device(), &queues[1] ); + + if (nb <= 1 || nb >= min(m,n)) { + /* Use CPU code. */ + if ( MAGMA_SUCCESS != magma_zmalloc_cpu( &work, m*n )) { + *info = MAGMA_ERR_HOST_ALLOC; + return *info; + } + magma_zgetmatrix( m, n, dA(0,0), ldda, work, m, queues[0] ); + magma_zgetrf_nopiv( m, n, work, m, info ); + magma_zsetmatrix( m, n, work, m, dA(0,0), ldda, queues[0] ); + magma_free_cpu( work ); + } + else { + /* Use hybrid blocked code. 
*/ + maxm = magma_roundup( m, 32 ); + + ldwork = maxm; + if (MAGMA_SUCCESS != magma_zmalloc_pinned( &work, ldwork*nb )) { + *info = MAGMA_ERR_HOST_ALLOC; + return *info; + } + + for( j=0; j < s; j++ ) { + // get j-th panel from device + magma_queue_sync( queues[1] ); + magma_zgetmatrix_async( m-j*nb, nb, dA(j,j), ldda, work, ldwork, queues[0] ); + + if ( j > 0 ) { + magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, + nb, n - (j+1)*nb, + c_one, dA(j-1,j-1), ldda, + dA(j-1,j+1), ldda, queues[1] ); + magma_zgemm( MagmaNoTrans, MagmaNoTrans, + m-j*nb, n-(j+1)*nb, nb, + c_neg_one, dA(j, j-1), ldda, + dA(j-1,j+1), ldda, + c_one, dA(j, j+1), ldda, queues[1] ); + } + + // do the cpu part + rows = m - j*nb; + magma_queue_sync( queues[0] ); + magma_zgetrf_nopiv( rows, nb, work, ldwork, &iinfo ); + if ( *info == 0 && iinfo > 0 ) + *info = iinfo + j*nb; + + // send j-th panel to device + magma_zsetmatrix_async( m-j*nb, nb, work, ldwork, dA(j, j), ldda, queues[1] ); + + // do the small non-parallel computations (next panel update) + if ( s > j+1 ) { + magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, + nb, nb, + c_one, dA(j, j), ldda, + dA(j, j+1), ldda, queues[1] ); + magma_zgemm( MagmaNoTrans, MagmaNoTrans, + m-(j+1)*nb, nb, nb, + c_neg_one, dA(j+1, j), ldda, + dA(j, j+1), ldda, + c_one, dA(j+1, j+1), ldda, queues[1] ); + } + else { + magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, + nb, n-s*nb, + c_one, dA(j, j ), ldda, + dA(j, j+1), ldda, queues[1] ); + magma_zgemm( MagmaNoTrans, MagmaNoTrans, + m-(j+1)*nb, n-(j+1)*nb, nb, + c_neg_one, dA(j+1, j ), ldda, + dA(j, j+1), ldda, + c_one, dA(j+1, j+1), ldda, queues[1] ); + } + } + + magma_int_t nb0 = min( m - s*nb, n - s*nb ); + if ( nb0 > 0 ) { + rows = m - s*nb; + + magma_zgetmatrix( rows, nb0, dA(s,s), ldda, work, ldwork, queues[1] ); + + // do the cpu part + magma_zgetrf_nopiv( rows, nb0, work, ldwork, &iinfo ); + if ( *info == 0 && iinfo > 0 ) + *info = iinfo + s*nb; + + // send j-th panel 
to device + magma_zsetmatrix( rows, nb0, work, ldwork, dA(s,s), ldda, queues[1] ); + + magmablas_ztrsm_async( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, + nb0, n-s*nb-nb0, + c_one, dA(s,s), ldda, + dA(s,s)+nb0, ldda, queues[1] ); + } + magma_queue_sync( queues[1] ); + + magma_free_pinned( work ); + } + + magma_queue_destroy( queues[1] ); + + return *info; +} /* magma_zgetrf_nopiv_gpu_async */ diff --git a/src/zgetrs_nopiv_gpu.cpp b/src/zgetrs_nopiv_gpu.cpp index bd9dea284..9eef098df 100644 --- a/src/zgetrs_nopiv_gpu.cpp +++ b/src/zgetrs_nopiv_gpu.cpp @@ -130,3 +130,73 @@ magma_zgetrs_nopiv_gpu( return *info; } + +/***************************************************************************//** + + @copydoc magma_zgetrs_nopiv_gpu + + @param[in] + queue magma_queue_t + A pointer to a magma_queue structure that will be used for the + execution of this method, and all methods it calls. queue != nullptr + +*******************************************************************************/ +extern "C" magma_int_t +magma_zgetrs_nopiv_gpu_async( + magma_trans_t trans, magma_int_t n, magma_int_t nrhs, + magmaDoubleComplex_ptr dA, magma_int_t ldda, + magmaDoubleComplex_ptr dB, magma_int_t lddb, + magma_int_t *info, magma_queue_t queue) +{ + // Constants + const magmaDoubleComplex c_one = MAGMA_Z_ONE; + + // Local variables + const bool notran = trans == MagmaNoTrans; + + *info = 0; + if ( (! notran) && + (trans != MagmaTrans) && + (trans != MagmaConjTrans) ) { + *info = -1; + } else if (n < 0) { + *info = -2; + } else if (nrhs < 0) { + *info = -3; + } else if (ldda < max(1,n)) { + *info = -5; + } else if (lddb < max(1,n)) { + *info = -7; + } + if (*info != 0) { + magma_xerbla( __func__, -(*info) ); + return *info; + } + + /* Quick return if possible */ + if (n == 0 || nrhs == 0) { + return *info; + } + + if (notran) { + /* Solve A * X = B. 
*/ + if ( nrhs == 1) { + magma_ztrsv( MagmaLower, MagmaNoTrans, MagmaUnit, n, dA, ldda, dB, 1, queue ); + magma_ztrsv( MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); + } else { + magma_ztrsm( MagmaLeft, MagmaLower, MagmaNoTrans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); + magma_ztrsm( MagmaLeft, MagmaUpper, MagmaNoTrans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); + } + } else { + /* Solve A**T * X = B or A**H * X = B. */ + if ( nrhs == 1) { + magma_ztrsv( MagmaUpper, trans, MagmaNonUnit, n, dA, ldda, dB, 1, queue ); + magma_ztrsv( MagmaLower, trans, MagmaUnit, n, dA, ldda, dB, 1, queue ); + } else { + magma_ztrsm( MagmaLeft, MagmaUpper, trans, MagmaNonUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); + magma_ztrsm( MagmaLeft, MagmaLower, trans, MagmaUnit, n, nrhs, c_one, dA, ldda, dB, lddb, queue ); + } + } + + return *info; +}