diff --git a/Source/jit_kernels/include/GB_jit_kernel_proto.h b/Source/jit_kernels/include/GB_jit_kernel_proto.h index 003e9a578f..37dbe10f33 100644 --- a/Source/jit_kernels/include/GB_jit_kernel_proto.h +++ b/Source/jit_kernels/include/GB_jit_kernel_proto.h @@ -727,7 +727,12 @@ GrB_Info GB_jit_kernel_kroner \ ( \ GrB_Matrix C, \ const GrB_Matrix A, \ + const bool A_transpose, \ const GrB_Matrix B, \ + const bool B_transpose, \ + const GrB_Matrix Mask, \ + const bool Mask_struct, \ + const bool Mask_comp, \ const int nthreads, \ const void *theta, \ const GB_callback_struct *restrict my_callback \ diff --git a/Source/jit_wrappers/GB_kroner_jit.c b/Source/jit_wrappers/GB_kroner_jit.c index 174ec0e37e..1919b47f7d 100644 --- a/Source/jit_wrappers/GB_kroner_jit.c +++ b/Source/jit_wrappers/GB_kroner_jit.c @@ -20,7 +20,12 @@ GrB_Info GB_kroner_jit const GrB_BinaryOp binaryop, const bool flipij, const GrB_Matrix A, + const bool A_transpose, const GrB_Matrix B, + const bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_struct, + const bool Mask_comp, const int nthreads ) { @@ -41,7 +46,7 @@ GrB_Info GB_kroner_jit GB_JIT_KERNEL_KRONER, /* is_ewisemult: */ false, /* C_iso: */ C->iso, /* C_in_iso: */ false, C_sparsity, C->type, C->p_is_32, C->j_is_32, C->i_is_32, - /* M: */ NULL, true, false, binaryop, flipij, false, A, B) ; + /* M: */ Mask, Mask_struct, Mask_comp, binaryop, flipij, false, A, B) ; //-------------------------------------------------------------------------- // get the kernel function pointer, loading or compiling it if needed @@ -60,6 +65,6 @@ GrB_Info GB_kroner_jit #include "include/GB_pedantic_disable.h" GB_jit_dl_function GB_jit_kernel = (GB_jit_dl_function) dl_function ; - return (GB_jit_kernel (C, A, B, nthreads, binaryop->theta, &GB_callback)) ; + return (GB_jit_kernel (C, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads, binaryop->theta, &GB_callback)) ; } diff --git a/Source/jitifyer/GB_stringify.h b/Source/jitifyer/GB_stringify.h index df349a87ca..e2b0a0b46a 100644 --- a/Source/jitifyer/GB_stringify.h +++ b/Source/jitifyer/GB_stringify.h @@ -1855,7 +1855,12 @@ GrB_Info GB_kroner_jit const GrB_BinaryOp binaryop, const bool flipij, const GrB_Matrix A, + const bool A_transpose, const GrB_Matrix B, + const bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_struct, + const bool Mask_comp, const int nthreads ) ; diff --git a/Source/kronecker/GB_kron.c b/Source/kronecker/GB_kron.c index 5c462da512..d3f61a6d5b 100644 --- a/Source/kronecker/GB_kron.c +++ b/Source/kronecker/GB_kron.c @@ -23,16 +23,76 @@ GB_Matrix_free (&T) ; \ } +#define GBI(Ai,p,avlen) ((Ai == NULL) ? ((p) % (avlen)) : Ai [p]) + +#define GBB(Ab,p) ((Ab == NULL) ? 1 : Ab [p]) + +#define GBP(Ap,k,avlen) ((Ap == NULL) ? ((k) * (avlen)) : Ap [k]) + +#define GBH(Ah,k) ((Ah == NULL) ? (k) : Ah [k]) + #include "kronecker/GB_kron.h" #include "mxm/GB_mxm.h" #include "transpose/GB_transpose.h" #include "mask/GB_accum_mask.h" +static bool GB_lookup_xoffset ( + GrB_Index *p, + GrB_Matrix A, + GrB_Index row, + GrB_Index col +) +{ + GrB_Index vector = A->is_csc ? col : row ; + GrB_Index coord = A->is_csc ? row : col ; + + if (A->p == NULL) + { + GrB_Index offset = vector * A->vlen + coord ; + if (A->b == NULL || ((int8_t *)A->b)[offset]) + { + *p = A->iso ? 0 : offset ; + return true ; + } + return false ; + } + + int64_t start, end ; + bool res ; + + if (A->h == NULL) + { + start = A->p_is_32 ? ((uint32_t *)A->p)[vector] : ((uint64_t *)A->p)[vector] ; + end = A->p_is_32 ? ((uint32_t *)A->p)[vector + 1] : ((uint64_t *)A->p)[vector + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } + else + { + start = 0 ; end = A->plen - 1 ; + res = GB_binary_search(vector, A->h, A->j_is_32, &start, &end) ; + if (!res) return false ; + int64_t k = start ; + start = A->p_is_32 ? ((uint32_t *)A->p)[k] : ((uint64_t *)A->p)[k] ; + end = A->p_is_32 ? ((uint32_t *)A->p)[k+1] : ((uint64_t *)A->p)[k+1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search(coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } +} + +#include "emult/GB_emult.h" + GrB_Info GB_kron // C = accum (C, kron(A,B)) ( GrB_Matrix C, // input/output matrix for results const bool C_replace, // if true, clear C before writing to it - const GrB_Matrix M, // optional mask for C, unused if NULL + const GrB_Matrix Mask, // optional mask for C, unused if NULL const bool Mask_comp, // if true, use !M const bool Mask_struct, // if true, use the only structure of M const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) @@ -51,6 +111,8 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) // C may be aliased with M, A, and/or B + GrB_Matrix M = Mask ; + GrB_Info info ; struct GB_Matrix_opaque T_header, AT_header, BT_header ; GrB_Matrix T = NULL, AT = NULL, BT = NULL ; @@ -104,6 +166,80 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) // quick return if an empty mask is complemented GB_RETURN_IF_QUICK_MASK (C, C_replace, M, Mask_comp, Mask_struct) ; + // check if it's possible to apply mask immediately in kron + + bool Mask_is_applicable = M != NULL && !Mask_comp ; + if (Mask_is_applicable) { + bool MT_hypersparse = (A->h != NULL) || (B->h != NULL) ; + size_t allocated = 0 ; + + GrB_Matrix MT = NULL ; struct GB_Matrix_opaque MT_header ; + GB_CLEAR_MATRIX_HEADER (MT, &MT_header) ; + + bool A_is_pattern, B_is_pattern ; + GB_binop_pattern (&A_is_pattern, &B_is_pattern, false, op->opcode) ; + + GrB_Info masked_kroner_info = GB_kroner (MT, C->is_csc, op, false, A, A_is_pattern, A_transpose, B, B_is_pattern, B_transpose, + M, Mask_comp, Mask_struct, Werk) ; + if (masked_kroner_info != GrB_SUCCESS) + { + return masked_kroner_info ; + } + + if (MT->is_csc != C->is_csc) { + GrB_Info MTtranspose = GB_transpose_in_place (MT, true, Werk) ; + if (MTtranspose != GrB_SUCCESS) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return MTtranspose ; + } + } + + if (MT_hypersparse) + { + uint32_t *MTh32 = NULL ; uint64_t *MTh64 = NULL ; + if (MT->j_is_32) + { + MTh32 = GB_malloc_memory (MT->vdim, sizeof(uint32_t), &allocated) ; + } + else + { + MTh64 = GB_malloc_memory (MT->vdim, sizeof(uint64_t), &allocated) ; + } + + if (MTh32 == NULL && MTh64 == NULL) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return GrB_OUT_OF_MEMORY ; + } + + double work = M->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int masked_hyper_threads = GB_nthreads (work, chunk, nthreads_max) ; + + #pragma omp parallel for num_threads(masked_hyper_threads) schedule(static) + for (GrB_Index i = 0; i < MT->vdim; i++) + { + if (MT->j_is_32) { MTh32[i] = i ; } else { MTh64[i] = i ; } + } + + MT->h = MTh32 ? (void *)MTh32 : (void *)MTh64 ; + + GrB_Info MThyperprune = GB_hyper_prune (MT, Werk) ; + if (MThyperprune != GrB_SUCCESS) + { + GB_FREE_WORKSPACE ; + GB_Matrix_free (&MT) ; + return MThyperprune ; + } + } + + return (GB_accum_mask (C, NULL, NULL, accum, &MT, C_replace, Mask_comp, Mask_struct, Werk)) ; + } + //-------------------------------------------------------------------------- // transpose A and B if requested //-------------------------------------------------------------------------- @@ -152,8 +288,8 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) GB_CLEAR_MATRIX_HEADER (T, &T_header) ; GB_OK (GB_kroner (T, T_is_csc, op, flipij, - A_transpose ? AT : A, A_is_pattern, - B_transpose ? BT : B, B_is_pattern, Werk)) ; + A_transpose ? AT : A, A_is_pattern, A_transpose, + B_transpose ? BT : B, B_is_pattern, B_transpose, M, Mask_comp, Mask_struct, Werk)) ; GB_FREE_WORKSPACE ; ASSERT_MATRIX_OK (T, "T = kron(A,B)", GB0) ; diff --git a/Source/kronecker/GB_kron.h b/Source/kronecker/GB_kron.h index a6c1a3f3ae..a25b0cfa15 100644 --- a/Source/kronecker/GB_kron.h +++ b/Source/kronecker/GB_kron.h @@ -15,7 +15,7 @@ GrB_Info GB_kron // C = accum (C, kron(A,B)) ( GrB_Matrix C, // input/output matrix for results const bool C_replace, // if true, clear C before writing to it - const GrB_Matrix M, // optional mask for C, unused if NULL + const GrB_Matrix Mask, // optional mask for C, unused if NULL const bool Mask_comp, // if true, use !M const bool Mask_struct, // if true, use the only structure of M const GrB_BinaryOp accum, // optional accum for Z=accum(C,T) @@ -35,8 +35,13 @@ GrB_Info GB_kroner // C = kron (A,B) const bool flipij, // if true, i and j are flipped: z=(x,y,j,i) const GrB_Matrix A, // input matrix bool A_is_pattern, // true if values of A are not used + bool A_transpose, const GrB_Matrix B, // input matrix bool B_is_pattern, // true if values of B are not used + bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, GB_Werk Werk ) ; diff --git a/Source/kronecker/GB_kroner.c b/Source/kronecker/GB_kroner.c index afbb655ee4..4843bb09ff 100644 --- a/Source/kronecker/GB_kroner.c +++ b/Source/kronecker/GB_kroner.c @@ -24,12 +24,187 @@ GB_phybix_free (C) ; \ } +#define GBI(Ai,p,avlen) ((Ai == NULL) ? ((p) % (avlen)) : Ai [p]) +#define GBB(Ab,p) ((Ab == NULL) ? 1 : Ab [p]) +#define GBP(Ap,k,avlen) ((Ap == NULL) ? ((k) * (avlen)) : Ap [k]) +#define GBH(Ah,k) ((Ah == NULL) ? (k) : Ah [k]) + +#include "mxm/GB_mxm.h" #include "kronecker/GB_kron.h" #include "emult/GB_emult.h" #include "slice/include/GB_search_for_vector.h" #include "jitifyer/GB_stringify.h" -GrB_Info GB_kroner // C = kron (A,B) +//------------------------------------------------------------------------------ +// GB_lookup_xoffset: find the offset of (row,col) in a Ax if present +//------------------------------------------------------------------------------ + +static bool GB_lookup_xoffset +( + GrB_Index *p, + GrB_Matrix A, + GrB_Index row, + GrB_Index col +) +{ + GrB_Index vector = A->is_csc ? col : row ; + GrB_Index coord = A->is_csc ? row : col ; + + if (A->p == NULL) + { + GrB_Index offset = vector * A->vlen + coord ; + if (A->b == NULL || ((int8_t *) A->b) [offset]) + { + *p = A->iso ? 0 : offset ; + return true ; + } + return false ; + } + + int64_t start, end ; + bool res ; + + if (A->h == NULL) + { + start = A->p_is_32 ? ((uint32_t *) A->p) [vector] + : ((uint64_t *) A->p) [vector] ; + end = A->p_is_32 ? ((uint32_t *) A->p) [vector + 1] + : ((uint64_t *) A->p) [vector + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search (coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } + else + { + start = 0 ; end = A->plen - 1 ; + res = GB_binary_search (vector, A->h, A->j_is_32, &start, &end) ; + if (!res) return false ; + int64_t k = start ; + start = A->p_is_32 ? ((uint32_t *) A->p) [k] + : ((uint64_t *) A->p) [k] ; + end = A->p_is_32 ? ((uint32_t *) A->p) [k + 1] + : ((uint64_t *) A->p) [k + 1] ; + end-- ; + if (start > end) return false ; + res = GB_binary_search (coord, A->i, A->i_is_32, &start, &end) ; + if (res) { *p = A->iso ? 0 : start ; } + return res ; + } +} + +//------------------------------------------------------------------------------ +// GB_kroner_generic: generic kernel for masked and default paths +//------------------------------------------------------------------------------ + +static GrB_Info GB_kroner_generic +( + GrB_Matrix C, + const GrB_BinaryOp op, + const bool flipij, + const GrB_Matrix A, + const GrB_Matrix B, + bool A_is_pattern, + bool B_is_pattern, + bool A_transpose, + bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, + const bool C_is_full, + const int nthreads +) +{ + GrB_Type ctype = op->ztype ; + const size_t csize = ctype->size ; + + GB_Ap_DECLARE (Ap, const) ; GB_Ap_PTR (Ap, A) ; + GB_Ah_DECLARE (Ah, const) ; GB_Ah_PTR (Ah, A) ; + const int64_t avlen = A->vlen ; + + GB_Bp_DECLARE (Bp, const) ; GB_Bp_PTR (Bp, B) ; + GB_Bh_DECLARE (Bh, const) ; GB_Bh_PTR (Bh, B) ; + const int64_t bvlen = B->vlen ; + const int64_t bnvec = B->nvec ; + + GB_Cp_DECLARE (Cp, ) ; GB_Cp_PTR (Cp, C) ; + const int64_t cnvec = C->nvec ; + const int64_t cvlen = C->vlen ; + const int64_t cnz = C->nvals ; + + #define GB_NO_MASK (Mask == NULL) + #define GB_MASK_STRUCT Mask_struct + #define GB_MASK_COMP Mask_comp + #define GB_Cp_IS_32 C->p_is_32 + #define GB_A_TYPE GB_void + #define GB_B_TYPE GB_void + #define GB_C_TYPE GB_void + #define GB_A_ISO A_iso + #define GB_B_ISO B_iso + #define GB_C_ISO C_iso + #define GB_C_IS_FULL C_is_full + + const bool A_iso = A->iso ; + const bool B_iso = B->iso ; + const bool C_iso = C->iso ; + const int64_t asize = A->type->size ; + const int64_t bsize = B->type->size ; + + GxB_binary_function fmult = op->binop_function ; + GxB_index_binary_function fmult_idx = op->idxbinop_function ; + const void *theta = op->theta ; + + GB_cast_function cast_A = NULL, cast_B = NULL ; + if (!A_is_pattern) + cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; + if (!B_is_pattern) + cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; + + #define GB_DECLAREA(a) GB_void a [GB_VLA(asize)] + #define GB_DECLAREB(b) GB_void b [GB_VLA(bsize)] + + #define GB_GETA(a,Ax,p,iso) \ + { \ + if (!A_is_pattern) \ + cast_A (a, Ax + (p)*asize, asize) ; \ + } + + #define GB_GETB(b,Bx,p,iso) \ + { \ + if (!B_is_pattern) \ + cast_B (b, Bx + (p)*bsize, bsize) ; \ + } + + #define GB_KRONECKER_OP(Cx,pC,a,ix,jx,b,iy,jy) \ + { \ + if (fmult != NULL) \ + { \ + fmult (Cx + (pC)*csize, a, b) ; \ + } \ + else \ + { \ + if (flipij) \ + fmult_idx (Cx + (pC)*csize, \ + a, jx, ix, b, jy, iy, theta) ; \ + else \ + fmult_idx (Cx + (pC)*csize, \ + a, ix, jx, b, iy, jy, theta) ; \ + } \ + } + + #define GB_GENERIC + #include "ewise/include/GB_ewise_shared_definitions.h" + #include "kronecker/template/GB_kroner_template.c" + + return GrB_SUCCESS ; +} + +//------------------------------------------------------------------------------ +// GB_kroner: Kronecker product, C = kron (A,B) +//------------------------------------------------------------------------------ + +GrB_Info GB_kroner ( GrB_Matrix C, // output matrix const bool C_is_csc, // desired format of C @@ -37,8 +212,13 @@ GrB_Info GB_kroner // C = kron (A,B) const bool flipij, // if true, i and j are flipped: z=(x,y,j,i) const GrB_Matrix A_in, // input matrix bool A_is_pattern, // true if values of A are not used + bool A_transpose, const GrB_Matrix B_in, // input matrix bool B_is_pattern, // true if values of B are not used + bool B_transpose, + const GrB_Matrix Mask, + const bool Mask_comp, + const bool Mask_struct, GB_Werk Werk ) { @@ -65,12 +245,304 @@ GrB_Info GB_kroner // C = kron (A,B) GB_MATRIX_WAIT (B_in) ; //-------------------------------------------------------------------------- - // bitmap case: create sparse copies of A and B if they are bitmap + // common definitions for masked and default //-------------------------------------------------------------------------- + GrB_Type ctype = op->ztype ; + const size_t csize = ctype->size ; + GB_void cscalar [GB_VLA(csize)] ; + bool C_iso = GB_emult_iso (cscalar, ctype, A_in, B_in, op) ; + GrB_Matrix A = A_in ; + GrB_Matrix B = B_in ; + + //-------------------------------------------------------------------------- + // apply mask immediately if possible + //-------------------------------------------------------------------------- + + if (Mask != NULL && !Mask_comp) + { + GB_MATRIX_WAIT (Mask) ; + + int64_t bnrows = B_transpose ? GB_NCOLS (B) : GB_NROWS (B) ; + int64_t bncols = B_transpose ? GB_NROWS (B) : GB_NCOLS (B) ; + size_t allocated = 0 ; + int64_t centries = 0 ; + uint64_t nvecs = 0 ; + + //---------------------------------------------------------------------- + // allocate Cp + //---------------------------------------------------------------------- + + int64_t mnzmax = GB_nnz_max (Mask) ; + bool Cp_is_32, Cj_is_32, Ci_is_32; + GB_determine_pji_is_32 (&Cp_is_32, &Cj_is_32, &Ci_is_32, + GxB_SPARSE, mnzmax, (int64_t) Mask->vlen, (int64_t) Mask->vdim, Werk) ; + + uint32_t *Cp32 = NULL ; uint64_t *Cp64 = NULL ; + if (Cp_is_32) + Cp32 = GB_calloc_memory (Mask->vdim + 1, sizeof (uint32_t), + &allocated) ; + else + Cp64 = GB_calloc_memory (Mask->vdim + 1, sizeof (uint64_t), + &allocated) ; + + if (Cp32 == NULL && Cp64 == NULL) + { + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + + GB_Mp_DECLARE (Mp, ) ; GB_Mp_PTR (Mp, Mask) ; + GB_Mh_DECLARE (Mh, ) ; GB_Mh_PTR (Mh, Mask) ; + GB_Mi_DECLARE (Mi, ) ; GB_Mi_PTR (Mi, Mask) ; + + GB_cast_function cast_A = GB_cast_factory (op->xtype->code, + A->type->code) ; + GB_cast_function cast_B = GB_cast_factory (op->ytype->code, + B->type->code) ; + + double work = Mask->vdim ; + int nthreads_max = GB_Context_nthreads_max ( ) ; + double chunk = GB_Context_chunk ( ) ; + int nthreads = GB_nthreads (work, chunk, nthreads_max) ; + + int64_t vlen = Mask->vlen ; + + //---------------------------------------------------------------------- + // count entries per vector + //---------------------------------------------------------------------- + + #pragma omp parallel num_threads(nthreads) + { + GrB_Index offset ; + + #pragma omp for reduction(+:nvecs) schedule(static) + for (GrB_Index k = 0 ; k < Mask->nvec ; k++) + { + GrB_Index j = Mh32 ? GBH (Mh32, k) : GBH (Mh64, k) ; + + int64_t pM_start = Mp32 ? GBP (Mp32, k, vlen) + : GBP (Mp64, k, vlen) ; + int64_t pM_end = Mp32 ? GBP (Mp32, k+1, vlen) + : GBP (Mp64, k+1, vlen) ; + bool nonempty = false ; + + for (GrB_Index p = pM_start ; p < pM_end ; p++) + { + if (!GBB (Mask->b, p)) continue ; + + int64_t i = Mi32 ? GBI (Mi32, p, vlen) + : GBI (Mi64, p, vlen) ; + GrB_Index Mrow = Mask->is_csc ? i : j ; + GrB_Index Mcol = Mask->is_csc ? j : i ; + + if (Mask_struct || (Mask->iso ? ((bool *) Mask->x) [0] + : ((bool *) Mask->x) [p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) + : (Mrow / bnrows) ; + GrB_Index acol = A_transpose ? (Mrow / bnrows) + : (Mcol / bncols) ; + GrB_Index brow = B_transpose ? (Mcol % bncols) + : (Mrow % bnrows) ; + GrB_Index bcol = B_transpose ? (Mrow % bnrows) + : (Mcol % bncols) ; + + if (!GB_lookup_xoffset (&offset, A, arow, acol)) + continue ; + if (!GB_lookup_xoffset (&offset, B, brow, bcol)) + continue ; + + if (Cp_is_32) + { + (Cp32 [j])++ ; + } + else + { + (Cp64 [j])++ ; + } + nonempty = true ; + } + } + if (nonempty) nvecs++ ; + } + } + + //---------------------------------------------------------------------- + // prefix sum to get centries + //---------------------------------------------------------------------- + + if (Cp_is_32) + GB_cumsum (Cp32, Cp_is_32, Mask->vdim, NULL, nthreads, Werk) ; + else + GB_cumsum (Cp64, Cp_is_32, Mask->vdim, NULL, nthreads, Werk) ; + + centries = Cp_is_32 ? (int64_t) Cp32 [Mask->vdim] + : (int64_t) Cp64 [Mask->vdim] ; + + //---------------------------------------------------------------------- + // allocate Ci + //---------------------------------------------------------------------- + + uint32_t *Ci32 = NULL ; uint64_t *Ci64 = NULL ; + if (Ci_is_32) + Ci32 = GB_malloc_memory (centries, sizeof (uint32_t), &allocated) ; + else + Ci64 = GB_malloc_memory (centries, sizeof (uint64_t), &allocated) ; + + if (centries > 0 && Ci32 == NULL && Ci64 == NULL) + { + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + + //---------------------------------------------------------------------- + // allocate Cx + //---------------------------------------------------------------------- + + void *Cx = NULL ; + if (C_iso) + { + Cx = GB_malloc_memory (1, op->ztype->size, &allocated) ; + if (Cx == NULL) + { + if (Ci_is_32) + { + GB_free_memory (&Ci32, centries * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Ci64, centries * sizeof (uint64_t)) ; + } + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + memcpy (Cx, cscalar, csize) ; + } + else + { + Cx = GB_malloc_memory (centries, op->ztype->size, &allocated) ; + if (centries > 0 && Cx == NULL) + { + if (Ci_is_32) + { + GB_free_memory (&Ci32, centries * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Ci64, centries * sizeof (uint64_t)) ; + } + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } + GB_FREE_WORKSPACE ; + return GrB_OUT_OF_MEMORY ; + } + } + + //---------------------------------------------------------------------- + // allocate and initialize C matrix header + //---------------------------------------------------------------------- + + GrB_Info Calloc = GB_new_bix (&C, op->ztype, vlen, Mask->vdim, + GB_ph_null, Mask->is_csc, GxB_SPARSE, false, Mask->hyper_switch, + Mask->vdim, centries, false, C_iso, + Cp_is_32, Cj_is_32, Ci_is_32) ; + if (Calloc != GrB_SUCCESS) + { + if (C_iso) + { + GB_free_memory (&Cx, op->ztype->size) ; + } + else + { + GB_free_memory (&Cx, centries * op->ztype->size) ; + } + if (Ci_is_32) + { + GB_free_memory (&Ci32, centries * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Ci64, centries * sizeof (uint64_t)) ; + } + if (Cp_is_32) + { + GB_free_memory (&Cp32, (Mask->vdim + 1) * sizeof (uint32_t)) ; + } + else + { + GB_free_memory (&Cp64, (Mask->vdim + 1) * sizeof (uint64_t)) ; + } + GB_FREE_WORKSPACE ; + return Calloc ; + } + + GB_free_memory (&C->i, C->i_size) ; + GB_free_memory (&C->x, C->x_size) ; + + C->p = Cp_is_32 ? (void *) Cp32 : (void *) Cp64 ; + C->i = Ci_is_32 ? (void *) Ci32 : (void *) Ci64 ; + C->x = Cx ; + C->p_size = (Cp_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) * (Mask->vdim + 1) ; + C->i_size = (Ci_is_32 ? sizeof (uint32_t) + : sizeof (uint64_t)) * centries ; + C->x_size = C->iso ? op->ztype->size + : op->ztype->size * centries ; + C->magic = GB_MAGIC ; + C->nvals = centries ; + C->nvec_nonempty = (int64_t) nvecs ; + + //---------------------------------------------------------------------- + // evaluate + //---------------------------------------------------------------------- + + info = GB_kroner_jit (C, op, false, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads) ; + + if (info != GrB_SUCCESS) + { + info = GB_kroner_generic (C, op, false, A, B, + A_is_pattern, B_is_pattern, A_transpose, B_transpose, + Mask, Mask_comp, Mask_struct, false, nthreads) ; + } + + GB_FREE_WORKSPACE ; + return info ; + } + + //-------------------------------------------------------------------------- + // default case (no mask, or complemented mask) + //-------------------------------------------------------------------------- + + //-------------------------------------------------------------------------- + // bitmap case: create sparse copies of A and B if they are bitmap + //-------------------------------------------------------------------------- + if (GB_IS_BITMAP (A)) - { + { GBURBLE ("A:") ; GB_CLEAR_MATRIX_HEADER (Awork, &Awork_header) ; GB_OK (GB_dup_worker (&Awork, A->iso, A, true, NULL)) ; @@ -80,9 +552,8 @@ GrB_Info GB_kroner // C = kron (A,B) A = Awork ; } - GrB_Matrix B = B_in ; if (GB_IS_BITMAP (B)) - { + { GBURBLE ("B:") ; GB_CLEAR_MATRIX_HEADER (Bwork, &Bwork_header) ; GB_OK (GB_dup_worker (&Bwork, B->iso, B, true, NULL)) ; @@ -102,7 +573,7 @@ GrB_Info GB_kroner // C = kron (A,B) const int64_t avlen = A->vlen ; const int64_t avdim = A->vdim ; const int64_t anvec = A->nvec ; - const int64_t anz = GB_nnz (A) ; + const int64_t anz = GB_nnz (A) ; GB_Bp_DECLARE (Bp, const) ; GB_Bp_PTR (Bp, B) ; GB_Bh_DECLARE (Bh, const) ; GB_Bh_PTR (Bh, B) ; @@ -110,7 +581,7 @@ GrB_Info GB_kroner // C = kron (A,B) const int64_t bvlen = B->vlen ; const int64_t bvdim = B->vdim ; const int64_t bnvec = B->nvec ; - const int64_t bnz = GB_nnz (B) ; + const int64_t bnz = GB_nnz (B) ; //-------------------------------------------------------------------------- // determine the number of threads to use @@ -118,57 +589,40 @@ GrB_Info GB_kroner // C = kron (A,B) double work = ((double) anz) * ((double) bnz) + (((double) anvec) * ((double) bnvec)) ; - int nthreads_max = GB_Context_nthreads_max ( ) ; double chunk = GB_Context_chunk ( ) ; int nthreads = GB_nthreads (work, chunk, nthreads_max) ; - //-------------------------------------------------------------------------- - // check if C is iso and compute its iso value if it is - //-------------------------------------------------------------------------- - - GrB_Type ctype = op->ztype ; - const size_t csize = ctype->size ; - GB_void cscalar [GB_VLA(csize)] ; - bool C_iso = GB_emult_iso (cscalar, ctype, A, B, op) ; - //-------------------------------------------------------------------------- // allocate the output matrix C //-------------------------------------------------------------------------- - // C has the same type as z for the multiply operator, z=op(x,y) - uint64_t cvlen, cvdim, cnzmax, cnvec ; - bool ok = GB_int64_multiply (&cvlen, avlen, bvlen) ; - ok = ok & GB_int64_multiply (&cvdim, avdim, bvdim) ; - ok = ok & GB_int64_multiply (&cnzmax, anz, bnz) ; - ok = ok & GB_int64_multiply (&cnvec, anvec, bnvec) ; + bool ok = GB_int64_multiply (&cvlen, avlen, bvlen) ; + ok = ok & GB_int64_multiply (&cvdim, avdim, bvdim) ; + ok = ok & GB_int64_multiply (&cnzmax, anz, bnz) ; + ok = ok & GB_int64_multiply (&cnvec, anvec, bnvec) ; ASSERT (ok) ; if (C_iso) - { - // the values of A and B are no longer needed if C is iso + { GBURBLE ("(iso kron) ") ; A_is_pattern = true ; B_is_pattern = true ; } - // C is hypersparse if either A or B are hypersparse. It is never bitmap. bool C_is_hyper = (cvdim > 1) && (Ah != NULL || Bh != NULL) ; - bool C_is_full = GB_as_if_full (A) && GB_as_if_full (B) ; - int C_sparsity = C_is_full ? GxB_FULL : - ((C_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE) ; - - // determine the p_is_32, j_is_32, and i_is_32 settings for the new matrix + bool C_is_full = GB_as_if_full (A) && GB_as_if_full (B) ; + int C_sparsity = C_is_full ? GxB_FULL : + C_is_hyper ? GxB_HYPERSPARSE : GxB_SPARSE ; bool Cp_is_32, Cj_is_32, Ci_is_32 ; GB_determine_pji_is_32 (&Cp_is_32, &Cj_is_32, &Ci_is_32, C_sparsity, cnzmax, (int64_t) cvlen, (int64_t) cvdim, Werk) ; - GB_OK (GB_new_bix (&C, // full, sparse, or hyper; existing header - ctype, (int64_t) cvlen, (int64_t) cvdim, GB_ph_malloc, C_is_csc, - C_sparsity, true, B->hyper_switch, cnvec, cnzmax, true, C_iso, - Cp_is_32, Cj_is_32, Ci_is_32)) ; + GB_OK (GB_new_bix (&C, ctype, (int64_t) cvlen, (int64_t) cvdim, + GB_ph_malloc, C_is_csc, C_sparsity, true, B->hyper_switch, + cnvec, cnzmax, true, C_iso, Cp_is_32, Cj_is_32, Ci_is_32)) ; //-------------------------------------------------------------------------- // compute the column counts of C: Cp and Ch if C is hypersparse @@ -176,34 +630,25 @@ GrB_Info GB_kroner // C = kron (A,B) GB_Cp_DECLARE (Cp, ) ; GB_Cp_PTR (Cp, C) ; GB_Ch_DECLARE (Ch, ) ; GB_Ch_PTR (Ch, C) ; - #define GB_Cp_IS_32 Cp_is_32 + //#define GB_Cp_IS_32 Cp_is_32 if (!C_is_full) - { - // C is sparse or hypersparse + { int64_t kC ; #pragma omp parallel for num_threads(nthreads) schedule(static) - for (kC = 0 ; kC < cnvec ; kC++) + for (kC = 0 ; kC < (int64_t) cnvec ; kC++) { const int64_t kA = kC / bnvec ; const int64_t kB = kC % bnvec ; - // get A(:,jA), the (kA)th vector of A const int64_t jA = GBh_A (Ah, kA) ; const int64_t aknz = (Ap == NULL) ? avlen : (GB_IGET (Ap, kA+1) - GB_IGET (Ap, kA)) ; - // get B(:,jB), the (kB)th vector of B const int64_t jB = GBh_B (Bh, kB) ; const int64_t bknz = (Bp == NULL) ? bvlen : (GB_IGET (Bp, kB+1) - GB_IGET (Bp, kB)) ; - // determine # entries in C(:,jC), the (kC)th vector of C - // int64_t kC = kA * bnvec + kB ; - // Cp [kC] = aknz * bknz ; GB_ISET (Cp, kC, aknz * bknz) ; if (C_is_hyper) - { - // Ch [kC] = jA * bvdim + jB ; GB_ISET (Ch, kC, jA * bvdim + jB) ; - } } int64_t nvec_nonempty ; @@ -216,19 +661,17 @@ GrB_Info GB_kroner // C = kron (A,B) C->magic = GB_MAGIC ; //-------------------------------------------------------------------------- - // C = kron (A,B) where C is iso and/or full full + // C = kron (A,B) where C is iso and/or full //-------------------------------------------------------------------------- if (C_iso) - { - // C->x [0] = cscalar = op (A,B) + { memcpy (C->x, cscalar, csize) ; if (C_is_full) - { - // no more work to do if C is iso and full + { ASSERT_MATRIX_OK (C, "C=kron(A,B), iso full", GB0) ; GB_FREE_WORKSPACE ; - return (GrB_SUCCESS) ; + return GrB_SUCCESS ; } } @@ -238,93 +681,22 @@ GrB_Info GB_kroner // C = kron (A,B) int64_t cnz = GB_nnz (C) ; if (cnz == 0) - { + { GB_FREE_WORKSPACE ; - return (GrB_SUCCESS) ; + return GrB_SUCCESS ; } //-------------------------------------------------------------------------- - // C = kron (A,B) + // evaluate: JIT or generic //-------------------------------------------------------------------------- - // via the JIT kernel - info = GB_kroner_jit (C, op, flipij, A, B, nthreads) ; - - if (info == GrB_NO_VALUE) - { - // via the generic kernel - #define GB_A_TYPE GB_void - #define GB_B_TYPE GB_void - #define GB_C_TYPE GB_void - #define GB_A_ISO A_iso - #define GB_B_ISO B_iso - #define GB_C_ISO C_iso - const bool A_iso = A->iso ; - const bool B_iso = B->iso ; - const int64_t asize = A->type->size ; - const int64_t bsize = B->type->size ; - - GxB_binary_function fmult = op->binop_function ; - GxB_index_binary_function fmult_idx = op->idxbinop_function ; - const void *theta = op->theta ; - GB_cast_function cast_A = NULL, cast_B = NULL ; - if (!A_is_pattern) - { - cast_A = GB_cast_factory (op->xtype->code, A->type->code) ; - } - if (!B_is_pattern) - { - cast_B = GB_cast_factory (op->ytype->code, B->type->code) ; - } - - #define GB_C_IS_FULL C_is_full - - #define GB_DECLAREA(a) GB_void a [GB_VLA(asize)] - #define GB_DECLAREB(b) GB_void b [GB_VLA(bsize)] - - #define GB_GETA(a,Ax,p,iso) \ - { \ - if (!A_is_pattern) \ - { \ - cast_A (a, Ax + (p)*asize, asize) ; \ - } \ - } + info = GB_kroner_jit (C, op, flipij, A, A_transpose, B, B_transpose, Mask, Mask_struct, Mask_comp, nthreads) ; - #define GB_GETB(b,Bx,p,iso) \ - { \ - if (!B_is_pattern) \ - { \ - cast_B (b, Bx + (p)*bsize, bsize) ; \ - } \ - } - - #define GB_KRONECKER_OP(Cx,pC,a,ix,jx,b,iy,jy) \ - { \ - if (fmult != NULL) \ - { \ - /* standard binary operator */ \ - fmult (Cx +(pC)*csize, a, b) ; \ - } \ - else \ - { \ - /* index binary operator */ \ - if (flipij) \ - { \ - fmult_idx (Cx +(pC)*csize, \ - a, jx, ix, b, jy, iy, theta) ; \ - } \ - else \ - { \ - fmult_idx (Cx +(pC)*csize, \ - a, ix, jx, b, iy, jy, theta) ; \ - } \ - } \ - } - - #define GB_GENERIC - #include "ewise/include/GB_ewise_shared_definitions.h" - #include "kronecker/template/GB_kroner_template.c" - info = GrB_SUCCESS ; + if (info != GrB_SUCCESS) + { + info = GB_kroner_generic (C, op, flipij, A, B, + A_is_pattern, B_is_pattern, A_transpose, B_transpose, + Mask, Mask_comp, Mask_struct, C_is_full, nthreads) ; } //-------------------------------------------------------------------------- @@ -332,16 +704,11 @@ GrB_Info GB_kroner // C = kron (A,B) //-------------------------------------------------------------------------- if (info == GrB_SUCCESS) - { + { GB_OK (GB_hyper_prune (C, Werk)) ; ASSERT_MATRIX_OK (C, "C=kron(A,B)", GB0) ; } - //-------------------------------------------------------------------------- - // return result - //-------------------------------------------------------------------------- - GB_FREE_WORKSPACE ; - return (info) ; + return info ; } - diff --git a/Source/kronecker/template/GB_kroner_template.c b/Source/kronecker/template/GB_kroner_template.c index bdc76990c0..1520564a69 100644 --- a/Source/kronecker/template/GB_kroner_template.c +++ b/Source/kronecker/template/GB_kroner_template.c @@ -47,135 +47,327 @@ // C = kron (A,B) //-------------------------------------------------------------------------- - int tid ; - #pragma omp parallel for num_threads(nthreads) schedule(static) - for (tid = 0 ; tid < nthreads ; tid++) + if (!GB_NO_MASK && !GB_MASK_COMP) { + #define GB_GETVECTOR(A, row, col) GrB_Index vector_ = (A)->is_csc ? (col) : (row) + + #define GB_GETCOORD(A, row, col) GrB_Index coord_ = (A)->is_csc ? (row) : (col) + + #define GB_LOOKUP_XOFFSET_BITMAP_FULL(offset, A) \ + { \ + GrB_Index offset_ = vector_ * (A)->vlen + \ + coord_ ; \ + if ((A)->b == NULL || \ + (((int8_t *) (A)->b)[offset_])) \ + { \ + offset = (A)->iso ? 0 : offset_ ; \ + } \ + else \ + { \ + offset = -1 ; \ + } \ + } - //---------------------------------------------------------------------- - // get the iso values of A and B - //---------------------------------------------------------------------- + #define GB_LOOKUP_XOFFSET_SPARSE(offset, A) \ + { \ + start_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [vector_] : \ + ((int64_t *)(A)->p) [vector_] ; \ + end_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [vector_ + 1] : \ + ((int64_t *)(A)->p) [vector_ + 1] ; \ + end_-- ; \ + if (start_ > end_) \ + { \ + offset = -1 ; \ + } \ + else \ + { \ + if (GB_binary_search (coord_, (A)->i, \ + (A)->i_is_32, &start_, &end_)) \ + { \ + offset = (A)->iso ? 0 : start_ ; \ + } \ + else \ + { \ + offset = - 1 ; \ + } \ + } \ + } - GB_DECLAREA (a) ; - if (GB_A_ISO) - { - GB_GETA (a, Ax, 0, true) ; + #define GB_LOOKUP_XOFFSET_HYPER(offset, A) \ + { \ + start_ = 0 ; \ + end_ = (A)->plen - 1 ; \ + if (!GB_binary_search (vector_, (A)->h, \ + (A)->j_is_32, &start_, &end_)) \ + { \ + offset = - 1; \ + } \ + else \ + { \ + int64_t k_ = start_ ; \ + start_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [k_] : \ + ((int64_t *)(A)->p) [k_] ; \ + end_ = (A)->p_is_32 ? \ + ((int32_t *)(A)->p) [k_ + 1] : \ + ((int64_t *)(A)->p) [k_ + 1] ; \ + end_-- ; \ + if (start_ > end_) \ + { \ + offset = -1 ; \ + } \ + else \ + { \ + if (GB_binary_search (coord_, (A)->i, \ + (A)->i_is_32, &start_, &end_)) \ + { \ + offset = (A)->iso ? 0 : start_ ; \ + } \ + else \ + { \ + offset = -1 ; \ + } \ + } \ + } \ } - GB_DECLAREB (b) ; - if (GB_B_ISO) - { - GB_GETB (b, Bx, 0, true) ; + + #define GB_LOOKUP_XOFFSET(offset, A, row, col) \ + { \ + GB_GETVECTOR(A, row, col) ; \ + GB_GETCOORD(A, row, col) ; \ + if ((A)->p == NULL) \ + { \ + GB_LOOKUP_XOFFSET_BITMAP_FULL (offset, A) ; \ + } \ + else \ + { \ + int64_t start_, end_ ; \ + if ((A)->h == NULL) \ + { \ + GB_LOOKUP_XOFFSET_SPARSE(offset, A) ; \ + } \ + else \ + { \ + GB_LOOKUP_XOFFSET_HYPER(offset, A) ; \ + } \ + } \ } - //---------------------------------------------------------------------- - // construct the task to compute Ci,Cx [pC:pC_end-1] - //---------------------------------------------------------------------- + #define GB_NROWS(A) ((A)->is_csc ? (A)->vlen : (A)->vdim) + #define GB_NCOLS(A) ((A)->is_csc ? (A)->vdim : (A)->vlen) + - int64_t pC, pC_end ; - GB_PARTITION (pC, pC_end, cnz, tid, nthreads) ; + GB_Mp_DECLARE(Mp, ) ; + GB_Mp_PTR(Mp, Mask) ; - // find where this task starts in C - int64_t kC_task = GB_search_for_vector (Cp, GB_Cp_IS_32, pC, 0, cnvec, - cvlen) ; - int64_t pC_delta = pC - GBp_C (Cp, kC_task, cvlen) ; + GB_Mh_DECLARE(Mh, ) ; + GB_Mh_PTR(Mh, Mask) ; - //---------------------------------------------------------------------- - // compute C(:,kC) for all vectors kC in this task - //---------------------------------------------------------------------- + GB_Mi_DECLARE(Mi, ) ; + GB_Mi_PTR(Mi, Mask) ; - for (int64_t kC = kC_task ; kC < cnvec && pC < pC_end ; kC++) + + int64_t bnrows = (B_transpose) ? GB_NCOLS (B) : GB_NROWS (B) ; + int64_t bncols = (B_transpose) ? GB_NROWS (B) : GB_NCOLS (B) ; + + #pragma omp parallel num_threads(nthreads) { + GrB_Index offset = -1 ; + GB_DECLAREA (a_elem) ; + GB_DECLAREB (b_elem) ; + int64_t vlen = Mask->vlen ; - //------------------------------------------------------------------ - // get the vectors C(:,jC), A(:,jA), and B(:,jB) - //------------------------------------------------------------------ - - // C(:,jC) = kron (A(:,jA), B(:,jB), the (kC)th vector of C, - // where jC = GBh_C (Ch, kC) - int64_t kA = kC / bnvec ; - int64_t kB = kC % bnvec ; - - // get A(:,jA), the (kA)th vector of A - int64_t jA = GBh_A (Ah, kA) ; - int64_t pA_start = GBp_A (Ap, kA, avlen) ; - int64_t pA_end = GBp_A (Ap, kA+1, avlen) ; - - // get B(:,jB), the (kB)th vector of B - int64_t jB = GBh_B (Bh, kB) ; - int64_t pB_start = GBp_B (Bp, kB, bvlen) ; - int64_t pB_end = GBp_B (Bp, kB+1, bvlen) ; - int64_t bknz = pB_end - pB_start ; - - // shift into the middle of A(:,jA) and B(:,jB) for the first - // vector of C for this task. - int64_t pA_delta = 0 ; - int64_t pB_delta = 0 ; - if (kC == kC_task && bknz > 0) + #pragma omp for schedule(static) + for (GrB_Index k = 0 ; k < Mask->nvec ; k++) + { + GrB_Index j = GBh_M (Mh, k) ; + + int64_t pA_start = GBp_M (Mp, k, vlen) ; + int64_t pA_end = GBp_M (Mp, k+1, vlen) ; + GrB_Index pos = GB_IGET(Cp, j); + for (GrB_Index p = pA_start; p < pA_end; p++) + { + if (!GBb_M (Mask->b, p)) continue ; + + int64_t i = GBi_M (Mi, p, vlen) ; + GrB_Index Mrow = Mask->is_csc ? i : j ; GrB_Index Mcol = Mask->is_csc ? j : i ; + + // extract elements from A and B, + // initialize offset in MTi and MTx, + // get result of op, place it in MTx + + if (GB_MASK_STRUCT || (Mask->iso ? ((bool *)Mask->x)[0] : ((bool *)Mask->x)[p])) + { + GrB_Index arow = A_transpose ? (Mcol / bncols) : (Mrow / bnrows); + GrB_Index acol = A_transpose ? (Mrow / bnrows) : (Mcol / bncols); + + GrB_Index brow = B_transpose ? (Mcol % bncols) : (Mrow % bnrows); + GrB_Index bcol = B_transpose ? (Mrow % bnrows) : (Mcol % bncols); + + GB_LOOKUP_XOFFSET (offset, A, arow, acol) ; + if (offset == -1) + { + continue; + } + // iso of A or of C (?) + GB_GETA (a_elem, Ax, offset, GB_A_ISO) ; + + GB_LOOKUP_XOFFSET (offset, B, brow, bcol) ; + if (offset == -1) + { + continue; + } + GB_GETB (b_elem, Bx, offset, GB_B_ISO) ; + + GrB_Index ix, jx, iy, jy ; + ix = A_transpose ? acol : arow ; + jx = A_transpose ? arow : acol ; + iy = B_transpose ? bcol : brow ; + jy = B_transpose ? brow : bcol ; + + GB_ISET (Ci, pos, i) ; + + GB_KRONECKER_OP (Cx, pos, a_elem, ix, jx, b_elem, iy, jy) ; + + pos++ ; + } + } + } + } + } + else + { + int tid ; + #pragma omp parallel for num_threads(nthreads) schedule(static) + for (tid = 0 ; tid < nthreads ; tid++) + { + + //---------------------------------------------------------------------- + // get the iso values of A and B + //---------------------------------------------------------------------- + + GB_DECLAREA (a) ; + if (GB_A_ISO) + { + GB_GETA (a, Ax, 0, true) ; + } + GB_DECLAREB (b) ; + if (GB_B_ISO) { - pA_delta = pC_delta / bknz ; - pB_delta = pC_delta % bknz ; + GB_GETB (b, Bx, 0, true) ; } - //------------------------------------------------------------------ - // for all entries in A(:,jA), skipping entries for first vector - //------------------------------------------------------------------ + //---------------------------------------------------------------------- + // construct the task to compute Ci,Cx [pC:pC_end-1] + //---------------------------------------------------------------------- - int64_t pA = pA_start + pA_delta ; - pA_delta = 0 ; - for ( ; pA < pA_end && pC < pC_end ; pA++) - { + int64_t pC, pC_end ; + GB_PARTITION (pC, pC_end, cnz, tid, nthreads) ; + + // find where this task starts in C + int64_t kC_task = GB_search_for_vector (Cp, GB_Cp_IS_32, pC, 0, cnvec, + cvlen) ; + int64_t pC_delta = pC - GBp_C (Cp, kC_task, cvlen) ; - //-------------------------------------------------------------- - // a = A(iA,jA), typecasted to op->xtype - //-------------------------------------------------------------- + //---------------------------------------------------------------------- + // compute C(:,kC) for all vectors kC in this task + //---------------------------------------------------------------------- - int64_t iA = GBi_A (Ai, pA, avlen) ; - int64_t iAblock = iA * bvlen ; - if (!GB_A_ISO) + for (int64_t kC = kC_task ; kC < cnvec && pC < pC_end ; kC++) + { + + //------------------------------------------------------------------ + // get the vectors C(:,jC), A(:,jA), and B(:,jB) + //------------------------------------------------------------------ + + // C(:,jC) = kron (A(:,jA), B(:,jB), the (kC)th vector of C, + // where jC = GBh_C (Ch, kC) + int64_t kA = kC / bnvec ; + int64_t kB = kC % bnvec ; + + // get A(:,jA), the (kA)th vector of A + int64_t jA = GBh_A (Ah, kA) ; + int64_t pA_start = GBp_A (Ap, kA, avlen) ; + int64_t pA_end = GBp_A (Ap, kA+1, avlen) ; + + // get B(:,jB), the (kB)th vector of B + int64_t jB = GBh_B (Bh, kB) ; + int64_t pB_start = GBp_B (Bp, kB, bvlen) ; + int64_t pB_end = GBp_B (Bp, kB+1, bvlen) ; + int64_t bknz = pB_end - pB_start ; + + // shift into the middle of A(:,jA) and B(:,jB) for the first + // vector of C for this task. + int64_t pA_delta = 0 ; + int64_t pB_delta = 0 ; + if (kC == kC_task && bknz > 0) { - GB_GETA (a, Ax, pA, false) ; + pA_delta = pC_delta / bknz ; + pB_delta = pC_delta % bknz ; } - //-------------------------------------------------------------- - // for all entries in B(:,jB), skipping entries for 1st vector - //-------------------------------------------------------------- + //------------------------------------------------------------------ + // for all entries in A(:,jA), skipping entries for first vector + //------------------------------------------------------------------ - // scan B(:,jB), skipping to the first entry of C if this is - // the first time B is accessed in this task - int64_t pB = pB_start + pB_delta ; - pB_delta = 0 ; - for ( ; pB < pB_end && pC < pC_end ; pB++) - { + int64_t pA = pA_start + pA_delta ; + pA_delta = 0 ; + for ( ; pA < pA_end && pC < pC_end ; pA++) + { - //---------------------------------------------------------- - // b = B(iB,jB), typecasted to op->ytype - //---------------------------------------------------------- + //-------------------------------------------------------------- + // a = A(iA,jA), typecasted to op->xtype + //-------------------------------------------------------------- - int64_t iB = GBi_B (Bi, pB, bvlen) ; - if (!GB_B_ISO) + int64_t iA = GBi_A (Ai, pA, avlen) ; + int64_t iAblock = iA * bvlen ; + if (!GB_A_ISO) { - GB_GETB (b, Bx, pB, false) ; + GB_GETA (a, Ax, pA, false) ; } - //---------------------------------------------------------- - // C(iC,jC) = A(iA,jA) * B(iB,jB) - //---------------------------------------------------------- + //-------------------------------------------------------------- + // for all entries in B(:,jB), skipping entries for 1st vector + //-------------------------------------------------------------- - if (!GB_C_IS_FULL) - { - // save the row index iC - // Ci [pC] = iAblock + iB ; - GB_ISET (Ci, pC, iAblock + iB) ; - } - // Cx [pC] = op (a, b) - if (!GB_C_ISO) + // scan B(:,jB), skipping to the first entry of C if this is + // the first time B is accessed in this task + int64_t pB = pB_start + pB_delta ; + pB_delta = 0 ; + for ( ; pB < pB_end && pC < pC_end ; pB++) { - GB_KRONECKER_OP (Cx, pC, a, iA, jA, b, iB, jB) ; + + //---------------------------------------------------------- + // b = B(iB,jB), typecasted to op->ytype + //---------------------------------------------------------- + + int64_t iB = GBi_B (Bi, pB, bvlen) ; + if (!GB_B_ISO) + { + GB_GETB (b, Bx, pB, false) ; + } + + //---------------------------------------------------------- + // C(iC,jC) = A(iA,jA) * B(iB,jB) + //---------------------------------------------------------- + + if (!GB_C_IS_FULL) + { + // save the row index iC + // Ci [pC] = iAblock + iB ; + GB_ISET (Ci, pC, iAblock + iB) ; + } + // Cx [pC] = op (a, b) + if (!GB_C_ISO) + { + GB_KRONECKER_OP (Cx, pC, a, iA, jA, b, iB, jB) ; + } + pC++ ; } - pC++ ; } } } } } - diff --git a/Test/test226.m b/Test/test226.m index 41f77fd3d3..7e3759128a 100644 --- a/Test/test226.m +++ b/Test/test226.m @@ -9,6 +9,8 @@ A.matrix = sprand (5, 10, 0.4) ; B.matrix = ones (3, 2) ; B.iso = true ; +M.matrix = sprand (15, 20, 0.2) ~= 0 ; +MT.matrix = sprand (9, 4, 20, 0.2) ~= 0 ; mult.opname = 'times' ; mult.optype = 'double' ; @@ -18,14 +20,25 @@ C2 = GB_spec_kron (Cin, [ ], [ ], mult, A, B, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, M, [ ], mult, A, B, [ ]) ; +C2 = GB_spec_kron (Cin, M, [ ], mult, A, B, [ ]) ; +GB_spec_compare (C1, C2) ; + C1 = GB_mex_kron (Cin, [ ], [ ], mult, B, A, [ ]) ; C2 = GB_spec_kron (Cin, [ ], [ ], mult, B, A, [ ]) ; GB_spec_compare (C1, C2) ; +C1 = GB_mex_kron (Cin, M, [ ], mult, B, A, [ ]) ; +C2 = GB_spec_kron (Cin, M, [ ], mult, B, A, [ ]) ; +GB_spec_compare (C1, C2) ; + Cin = sparse (9, 4) ; C1 = GB_mex_kron (Cin, [ ], [ ], mult, B, B, [ ]) ; C2 = GB_spec_kron (Cin, [ ], [ ], mult, B, B, [ ]) ; GB_spec_compare (C1, C2) ; -fprintf ('\ntest226: all tests passed\n') ; +C1 = GB_mex_kron (Cin, MT, [ ], mult, B, B, [ ]) ; +C2 = GB_spec_kron (Cin, MT, [ ], mult, B, B, [ ]) ; +GB_spec_compare (C1, C2) ; +fprintf ('\ntest226: all tests passed\n') ; diff --git a/Test/test227.m b/Test/test227.m index 98e75da2e2..c45653c91e 100644 --- a/Test/test227.m +++ b/Test/test227.m @@ -16,6 +16,16 @@ dnt = struct ( 'inp1', 'tran' ) ; dtt = struct ( 'inp0', 'tran', 'inp1', 'tran' ) ; +dnn.mask = 'default' ; +dtn.mask = 'default' ; +dnt.mask = 'default' ; +dtt.mask = 'default' ; + +dnn.outp = 'default' ; +dtn.outp = 'default' ; +dnt.outp = 'default' ; +dtt.outp = 'default' ; + types = { 'int32', 'int64', 'single', 'double' } ; am = 5 ; @@ -23,11 +33,13 @@ bm = 4 ; bn = 2 ; -Ax = sparse (100 * sprandn (am,an, 0.5)) ; -Bx = sparse (100 * sprandn (bm,bn, 0.5)) ; +Ax = sparse (100 * sprandn (am, an, 0.5)) ; +Bx = sparse (100 * sprandn (bm, bn, 0.5)) ; + cm = am * bm ; cn = an * bn ; -Cx = sparse (cm,cn) ; +Cx = sparse (cm, cn) ; +Maskmat = sprandn (cm, cn, 0.2) ~= 0 ; AT = Ax' ; BT = Bx' ; @@ -56,16 +68,16 @@ clear A A.matrix = Ax ; A.is_hyper = A_is_hyper ; - A.is_csc = A_is_csc ; + A.is_csc = A_is_csc ; clear B B.matrix = Bx ; B.is_hyper = B_is_hyper ; - B.is_csc = B_is_csc ; + B.is_csc = B_is_csc ; clear C C.matrix = Cx ; - C.is_csc = C_is_csc ; + C.is_csc = C_is_csc ; %--------------------------------------- % kron(A,B) @@ -103,6 +115,42 @@ C1 = GB_mex_kron (C, [ ], [ ], op, AT, BT, dtt) ; GB_spec_compare (C0, C1) ; + % tests with Mask + for Mask_is_hyper = 0:1 + for Mask_is_csc = 0:1 + + A.is_csc = A_is_csc ; + B.is_csc = B_is_csc ; + A.is_hyper = A_is_hyper ; + B.is_hyper = B_is_hyper ; + + clear M + M.matrix = Maskmat ; + M.is_hyper = Mask_is_hyper ; + M.is_csc = Mask_is_csc; + C.is_csc = C_is_csc ; + + % kron(A, B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, B, dnn) ; + C1 = GB_mex_kron (C, M, [ ], op, A, B, dnn) ; + GB_spec_compare(C0, C1) ; + + % kron(A', B) with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, B, dtn) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, B, dtn) ; + GB_spec_compare (C0, C1) ; + + % kron(A, B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, A, BT, dnt) ; + C1 = GB_mex_kron (C, M, [ ], op, A, BT, dnt) ; + GB_spec_compare (C0, C1) ; + + % kron(A', B') with Mask + C0 = GB_spec_kron (C, M, [ ], op, AT, BT, dtt) ; + C1 = GB_mex_kron (C, M, [ ], op, AT, BT, dtt) ; + GB_spec_compare (C0, C1) ; + end + end end end end @@ -126,4 +174,3 @@ GB_spec_compare (C0, C1) ; fprintf ('\ntest227: all tests passed\n') ; -