Add support for multiple multiplication strategies

mfasi · mfasi · commit a14e98eb6eb3 · 2024-12-29T02:21:26.000Z
diff --git a/include/gemmi.hpp b/include/gemmi.hpp
@@ -24,16 +24,21 @@ enum class normalisationDimension {
     byCols  // Matrix on the right of the product.
 };
 
-enum class splittingStrategy{
+enum class splittingStrategy {
     roundToNearest,
     bitMasking
 };
 
-enum class accumulationStrategy{
+enum class accumulationStrategy {
     floatingPoint,
     integer
 };
 
+enum class multiplicationStrategy {
+    full,    // All products are computed.
+    reduced  // Products below the main anti-diagonal are ignored.
+};
+
 template <typename splitint_t, typename fp_t>
 struct MatrixSplit {
     size_t m;
@@ -263,14 +268,13 @@ fp_t computeScalingConstantforUsingSplittingStrategy(const MatrixSplit<splitint_
 template <typename splitint_t, typename accumulator_t, typename fp_t>
 std::vector<fp_t> computeProductsWithFloatingPointAccumulation(const MatrixSplit<splitint_t, fp_t> &A,
                                   const MatrixSplit<splitint_t, fp_t> &B,
-                                  const size_t bitsPerSlice) {
+                                  const size_t bitsPerSlice,
+                                  const size_t numDiagonals) {
 
     std::vector<fp_t > C (A.m * B.n);
 
     auto scalingConstant = computeScalingConstantforUsingSplittingStrategy(A, B);
 
-    // Products below the main anti-diagonal are ignored.
-    size_t numDiagonals = std::max(A.numSplits, B.numSplits) - 1;
     for (size_t diagonal = 0; diagonal <= numDiagonals; diagonal++) {
         int Aindex = diagonal < A.numSplits - 1 ? diagonal : A.numSplits - 1;
         size_t Bindex = diagonal > A.numSplits - 1 ? diagonal - A.numSplits + 1 : 0;
@@ -304,14 +308,13 @@ std::vector<fp_t> computeProductsWithFloatingPointAccumulation(const MatrixSplit
 template <typename splitint_t, typename accumulator_t, typename fp_t>
 std::vector<fp_t> computeProductsWithIntegerAccumulation(const MatrixSplit<splitint_t, fp_t> &A,
                                   const MatrixSplit<splitint_t, fp_t> &B,
-                                  const size_t bitsPerSlice) {
+                                  const size_t bitsPerSlice,
+                                  const size_t numDiagonals) {
 
     std::vector<fp_t > C (A.m * B.n);
 
     auto scalingConstant = computeScalingConstantforUsingSplittingStrategy(A, B);
 
-    // Products below the main anti-diagonal are ignored.
-    size_t numDiagonals = std::max(A.numSplits, B.numSplits) - 1;
     for (size_t diagonal = 0; diagonal <= numDiagonals; diagonal++) {
         int Aindex = diagonal < A.numSplits - 1 ? diagonal : A.numSplits - 1;
         size_t Bindex = diagonal > A.numSplits - 1 ? diagonal - A.numSplits + 1 : 0;
@@ -343,6 +346,7 @@ std::vector<fp_t> gemmi (const std::vector<fp_t> &A, const std::vector<fp_t> &B,
                          const size_t m, const size_t p, const size_t n,
                          const size_t numSplitsA, const size_t numSplitsB,
                          const splittingStrategy splitType = splittingStrategy::roundToNearest,
+                         const multiplicationStrategy multType = multiplicationStrategy::reduced,
                          const accumulationStrategy accType = accumulationStrategy::floatingPoint) {
 
     const size_t bitsInAccumulator = std::numeric_limits<accumulator_t>::digits;
@@ -354,11 +358,26 @@ std::vector<fp_t> gemmi (const std::vector<fp_t> &A, const std::vector<fp_t> &B,
     auto splitA = MatrixSplit<splitint_t, fp_t>(A, m, p, splitType, numSplitsA, bitsPerSlice, normalisationDimension::byRows);
     auto splitB = MatrixSplit<splitint_t, fp_t>(B, p, n, splitType, numSplitsB, bitsPerSlice, normalisationDimension::byCols);
 
+    size_t numDiagonals;
+    switch (multType) {
+        case multiplicationStrategy::reduced:
+            // Products below the main anti-diagonal are ignored.
+            numDiagonals = std::max(splitA.numSplits, splitB.numSplits) - 1;
+            break;
+        case multiplicationStrategy::full:
+            // All products are computed.
+            numDiagonals = splitA.numSplits + splitB.numSplits - 1;
+            break;
+        default:
+            std::cerr << "Unknown multiplication strategy requested.";
+            exit(1);
+    }
+
     switch (accType) {
         case accumulationStrategy::floatingPoint:
-            return computeProductsWithFloatingPointAccumulation<splitint_t, accumulator_t, fp_t>(splitA, splitB, bitsPerSlice);
+            return computeProductsWithFloatingPointAccumulation<splitint_t, accumulator_t, fp_t>(splitA, splitB, bitsPerSlice, numDiagonals);
         case accumulationStrategy::integer:
-            return computeProductsWithIntegerAccumulation<splitint_t, accumulator_t, fp_t>(splitA, splitB, bitsPerSlice);
+            return computeProductsWithIntegerAccumulation<splitint_t, accumulator_t, fp_t>(splitA, splitB, bitsPerSlice, numDiagonals);
         default:
             std::cerr << "Unknown accumulation strategy requested.";
             exit(1);
diff --git a/mex/gemmi.cpp b/mex/gemmi.cpp
@@ -4,6 +4,7 @@
 
 typedef struct {
     splittingStrategy splitType;
+    multiplicationStrategy multType;
     accumulationStrategy accType;
 } algorithmOptions;
 static std::unique_ptr<algorithmOptions> options = nullptr;
@@ -45,6 +46,7 @@ class MexFunction : public matlab::mex::Function {
             matlab::data::ArrayFactory factory;
             matlab::data::StructArray S = factory.createStructArray({1, 1}, {"split", "acc"});
             S[0]["split"] = factory.createCharArray(options->splitType == splittingStrategy::roundToNearest ? "n" : "b");
+            S[0]["mult"] = factory.createCharArray(options->multType == multiplicationStrategy::full ? "f" : "r");
             S[0]["acc"] = factory.createCharArray(options->accType == accumulationStrategy::floatingPoint ? "f" : "i");
             outputs[1] = std::move(S);
         }
@@ -59,8 +61,8 @@ class MexFunction : public matlab::mex::Function {
         auto A_size = Amatlab.getDimensions();
         auto B_size = Bmatlab.getDimensions();
 
-        auto C = gemmi<double, int8_t, int32_t>(A, B, A_size[0], A_size[1], B_size[1],
-                                                numSplitsA, numSplitsB, options->splitType, options->accType);
+        auto C = gemmi<double, int8_t, int32_t>(A, B, A_size[0], A_size[1], B_size[1], numSplitsA, numSplitsB,
+                                                options->splitType, options->multType, options->accType);
 
         matlab::data::ArrayFactory factory;
         return factory.createArray({A_size[0], B_size[1]}, C.begin(), C.end());;
@@ -74,8 +76,8 @@ class MexFunction : public matlab::mex::Function {
         auto A_size = Amatlab.getDimensions();
         auto B_size = Bmatlab.getDimensions();
 
-        auto C = gemmi<float, int8_t, int32_t>(A, B, A_size[0], A_size[1], B_size[1],
-                                               numSplitsA, numSplitsB, options->splitType, options->accType);
+        auto C = gemmi<float, int8_t, int32_t>(A, B, A_size[0], A_size[1], B_size[1], numSplitsA, numSplitsB,
+                                               options->splitType, options->multType, options->accType);
 
         matlab::data::ArrayFactory factory;
         return factory.createArray({A_size[0], B_size[1]}, C.begin(), C.end());;
@@ -145,22 +147,22 @@ class MexFunction : public matlab::mex::Function {
                     0, std::vector<matlab::data::Array>({ factory.createScalar("The fifth input must be a struct.") }));
             }
             matlab::data::StructArray inStruct(inputs[4]);
-            if (inStruct.getNumberOfFields() > 2) {
+            if (inStruct.getNumberOfFields() > 3) {
                 matlabPtr->feval(u"error",
-                    0, std::vector<matlab::data::Array>({ factory.createScalar("The fifth input must have at most two fields.") }));
+                    0, std::vector<matlab::data::Array>({ factory.createScalar("The fifth input must have at most three fields.") }));
             }
             auto fields = inStruct.getFieldNames();
             std::vector<matlab::data::MATLABFieldIdentifier> fieldNames(fields.begin(), fields.end());
             for (auto field : fieldNames) {
-                if (std::string(field) != "split" && std::string(field) != "acc") {
+                if (std::string(field) != "split" && std::string(field) != "mult" && std::string(field) != "acc") {
                     matlabPtr->feval(u"error",
-                        0, std::vector<matlab::data::Array>({ factory.createScalar("The fifth input's fields can only be named 'split' or 'acc'.") }));
+                        0, std::vector<matlab::data::Array>({ factory.createScalar("The fifth input's fields can only be named 'split', 'mult', or 'acc'.") }));
                 } else {
                     if (inStruct[0][field].getNumberOfElements() != 1 || inStruct[0][field].getType() != matlab::data::ArrayType::CHAR) {
                         matlabPtr->feval(u"error",
-                            0, std::vector<matlab::data::Array>({ factory.createScalar("The field of the struct should be single characters.") }));
+                            0, std::vector<matlab::data::Array>({ factory.createScalar("Each field of the struct should be a single character.") }));
                     }
-                    const matlab::data::TypedArray<char16_t> data = inStruct[0][field];
+                    const matlab::data::TypedArrayRef<char16_t> data = inStruct[0][field];
                     if (std::string(field) == "split") {
                         switch ((char)data[0]) {
                             case 'n':
@@ -174,6 +176,19 @@ class MexFunction : public matlab::mex::Function {
                                     0, std::vector<matlab::data::Array>({ factory.createScalar("Specified 'split' is invalid.") }));
                                 break;
                         }
+                    } else if (std::string(field) == "mult") {
+                        switch ((char)(data[0])) {
+                            case 'f':
+                                options->multType = multiplicationStrategy::full;
+                                break;
+                            case 'r':
+                                options->multType = multiplicationStrategy::reduced;
+                                break;
+                            default:
+                                matlabPtr->feval(u"error",
+                                    0, std::vector<matlab::data::Array>({ factory.createScalar("Specified 'mult' is invalid.") }));
+                                break;
+                        }
                     } else if (std::string(field) == "acc") {
                         switch ((char)data[0]) {
                             case 'f':
diff --git a/mex/gemmi.m b/mex/gemmi.m
@@ -1,27 +1,31 @@
 %  GEMMI    Compute matrix product using integer Ozaki scheme.
 %   [C, ALGOUT] = GEMMI(A,B,ASPLITS,BSPLITS,ALGIN) computes the matrix
-%   C = A*B using the Ozaki scheme with ASPLITS and BSPLITS slices 
-%   for the matrices A and B, respectively. The ALGIN parameter
+%   C = A*B using the Ozaki scheme with ASPLITS and BSPLITS slices
+%   for the matrices A and B, respectively. The ALGIN parameter
 %   must be a struct, with the following fields currently supported.
 %   'split' - selects the stragegy to be used to split A and B into
 %             slices. Possible values are 'b' for bitmasking and 'n'
-%             for round-to-nearest (default). 
+%             for round-to-nearest (default).
+%   'mult'  - selects how many integer multiplications the algorithm
+%             will perform in order to compute the result. Possible
+%             values are 'f' for all ASPLITS * BSPLITS products and 'r'
+%             for a reduced number (default).
 %   'acc'   - selects how the exact integer matrix products are
 %             accumulated. Possible values are 'f' for floating-point
 %             arithmetic and 'i' for integer accumulation (default).
 %   The output paramater ALGOUT is a struct with the same fields as
 %   ALGIN, which contains the values used in the computation.
-%    
+%
 %   [...] = GEMMI(A,B,ASPLITS,BSPLITS) uses the ALGIN parameter passed
 %   the most recent call to GEMMI, or the default values if no previous
 %   call was made.
 %
 %   [...] = GEMMI(A,B,SPLITS) uses SPLITS slices for both A and B.
-% 
+%
 %   The splits are stored as 8-bit signed integer, the dot products are
 %   performed using 32-bit signed arithmetic, and the final accumulation
 %   uses either the same format as the matrices A and B (if 'acc' is 'f')
 %   or 32-bit arithmetic (if 'acc' is 'i').
 %
 %   The matrices A and B must be conformable, and multiplication by a
-%   scalar is not supported.
+%   scalar is not supported.
diff --git a/tests/tests.cpp b/tests/tests.cpp
@@ -13,31 +13,32 @@ template <> double tolerance<double>() {return 1e-15;}
 
 template <typename fp_t>
 void runTest() {
-    // Test different sizes
-    for (auto splitType : {splittingStrategy::bitMasking,splittingStrategy::roundToNearest}) {
-        for (auto accumulationType : {accumulationStrategy::floatingPoint, accumulationStrategy::integer}) {
-            for (size_t numSplitA : { 1, 2, 10 }) {
-                for (size_t numSplitB : { 1, 2, 10 }) {
-                    for (size_t m = 10; m <= 50; m += 10) {
-                        for (size_t p = 10; p <= 50; p += 10) {
-                            for (size_t n = 10; n <= 50; n += 10) {
-                                std::vector<fp_t> A(m * p);
-                                std::vector<fp_t> B(p * n);
-
-                                // Initalize matrix with random values.
-                                std::default_random_engine generator(std::random_device{}());
-                                std::uniform_real_distribution<fp_t> distribution(-100000.0, 100000.0);
-                                for (auto & element : A)
-                                    element = numSplitA < 10 ? ldexp(1.0, 2 * numSplitA) - 1 : distribution(generator);
-                                for (auto & element : B)
-                                    element = numSplitB < 10 ? ldexp(1.0, 2 * numSplitB) - 1 : distribution(generator);
-
-                                auto C = gemmi<fp_t, int8_t, int32_t>(A, B, m, p, n, numSplitA, numSplitB, splitType, accumulationType);
-                                auto C_ref = reference_gemm(A, B, m, p, n);
-
-                                double relative_error = frobenius_norm<fp_t, double>(C - C_ref) / frobenius_norm<fp_t, double>(C);
-
-                                REQUIRE(relative_error < tolerance<fp_t>());
+    for (auto splitType : {splittingStrategy::bitMasking, splittingStrategy::roundToNearest}) {
+        for (auto multiplicationType : {multiplicationStrategy::reduced, multiplicationStrategy::full}) {
+            for (auto accumulationType : {accumulationStrategy::floatingPoint, accumulationStrategy::integer}) {
+                for (size_t numSplitA : { 1, 2, 10 }) {
+                    for (size_t numSplitB : { 1, 2, 10 }) {
+                        for (size_t m = 10; m <= 50; m += 10) {
+                            for (size_t p = 10; p <= 50; p += 10) {
+                                for (size_t n = 10; n <= 50; n += 10) {
+                                    std::vector<fp_t> A(m * p);
+                                    std::vector<fp_t> B(p * n);
+
+                                    // Initialize matrices with random values.
+                                    std::default_random_engine generator(std::random_device{}());
+                                    std::uniform_real_distribution<fp_t> distribution(-100000.0, 100000.0);
+                                    for (auto & element : A)
+                                        element = numSplitA < 10 ? ldexp(1.0, 2 * numSplitA) - 1 : distribution(generator);
+                                    for (auto & element : B)
+                                        element = numSplitB < 10 ? ldexp(1.0, 2 * numSplitB) - 1 : distribution(generator);
+
+                                    auto C = gemmi<fp_t, int8_t, int32_t>(A, B, m, p, n, numSplitA, numSplitB, splitType, multiplicationType, accumulationType);
+                                    auto C_ref = reference_gemm(A, B, m, p, n);
+
+                                    double relative_error = frobenius_norm<fp_t, double>(C - C_ref) / frobenius_norm<fp_t, double>(C);
+
+                                    REQUIRE(relative_error < tolerance<fp_t>());
+                                }
                             }
                         }
                     }