dalcinl
diff --git a/‎CHANGES‎
Lines changed: 5 additions & 0 deletions b/‎CHANGES‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎configure.ac‎
Lines changed: 179 additions & 145 deletions b/‎configure.ac‎
Lines changed: 179 additions & 145 deletions
diff --git a/‎maint/gen_abi.py‎
Lines changed: 0 additions & 2 deletions b/‎maint/gen_abi.py‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎src/include/mpi.h.in‎
Lines changed: 8 additions & 0 deletions b/‎src/include/mpi.h.in‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/include/mpir_datatype.h‎
Lines changed: 1 addition & 0 deletions b/‎src/include/mpir_datatype.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/include/mpir_objects.h‎
Lines changed: 1 addition & 1 deletion b/‎src/include/mpir_objects.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/mpi/coll/op/op_fns.c‎
Lines changed: 83 additions & 0 deletions b/‎src/mpi/coll/op/op_fns.c‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎src/mpi/datatype/typerep/src/typerep_yaksa_init.c‎
Lines changed: 6 additions & 0 deletions b/‎src/mpi/datatype/typerep/src/typerep_yaksa_init.c‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎src/mpi/datatype/typerep/src/typerep_yaksa_pack_external.c‎
Lines changed: 2 additions & 2 deletions b/‎src/mpi/datatype/typerep/src/typerep_yaksa_pack_external.c‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/mpi/datatype/typeutil.c‎
Lines changed: 44 additions & 11 deletions b/‎src/mpi/datatype/typeutil.c‎
Lines changed: 44 additions & 11 deletions
@@ -22,6 +22,11 @@ of MPI_INT. Commonly used types include MPI_BYTE, MPI_CHAR, MPI_AINT, use
 MPIR_BYTE_INTERNAL, MPIR_CHAR_INTERNAL, MPIR_AINT_INTERNAL instead. There is no
 impact to users.
 
+# Added MPI_LOGICAL1, MPI_LOGICAL2, MPI_LOGICAL4, MPI_LOGICAL8, and MPI_LOGICAL16.
+
+# Added MPIX_BFLOAT16, and added software reduction support for MPIX_BFLOAT16
+  and MPIX_C_FLOAT16.
+
 ===============================================================================
                                Changes in 4.3
 ===============================================================================
 
@@ -60,8 +60,6 @@ def gen_mpi_abi_internal_h(out):
                 elif T == "MPI_Datatype":
                     idx = int(val, 0) & G.datatype_mask
                     G.abi_datatypes[idx] = name
-                    if re.match(r'MPI_LOGICAL\d+', name):
-                        G.abi_datatypes[idx] = "MPI_DATATYPE_NULL"
                 elif T == "MPI_Op":
                     idx = int(val, 0) & G.op_mask
                     G.abi_ops[idx] = name
 
@@ -253,6 +253,14 @@ typedef int MPI_Datatype;
 #define MPI_COUNT         ((MPI_Datatype)0x4c000845)
 /* other extension types */
 #define MPIX_C_FLOAT16    ((MPI_Datatype)0x4c000246)
+/* Fortran fixed-width logicals */
+#define MPI_LOGICAL1      ((MPI_Datatype)0x4c000147)
+#define MPI_LOGICAL2      ((MPI_Datatype)0x4c000248)
+#define MPI_LOGICAL4      ((MPI_Datatype)0x4c000449)
+#define MPI_LOGICAL8      ((MPI_Datatype)0x4c00084a)
+#define MPI_LOGICAL16     ((MPI_Datatype)0x4c00104b)
+/* other */
+#define MPIX_BFLOAT16     ((MPI_Datatype)0x4c00024c)
 
 /* Communicators */
 typedef int MPI_Comm;
 
@@ -72,6 +72,7 @@
 #define MPIR_COMPLEX32          ((MPI_Datatype)0x4c840800)
 #define MPIR_COMPLEX64          ((MPI_Datatype)0x4c841000)
 #define MPIR_COMPLEX128         ((MPI_Datatype)0x4c842000)
+#define MPIR_BFLOAT16           ((MPI_Datatype)0x4c850200)      /* bfloat16, use MPIR_TYPE_ALT_FLOAT */
 #define MPIR_ALT_FLOAT96        ((MPI_Datatype)0x4c850c00)      /* long double (80-bit extended precision) on i386 */
 #define MPIR_ALT_FLOAT128       ((MPI_Datatype)0x4c851000)      /* long double (80-bit extended precision) on x86-64 */
 #define MPIR_ALT_COMPLEX96      ((MPI_Datatype)0x4c861800)      /* long double complex on i386 */
 
@@ -217,7 +217,7 @@ const char *MPIR_Handle_get_kind_str(int kind);
 #define MPIR_GROUP_PREALLOC 8
 #endif
 
-#define MPIR_DATATYPE_N_BUILTIN 71
+#define MPIR_DATATYPE_N_BUILTIN 77      /* 0x4d - must be in sync with mpi.h.in */
 #ifdef MPID_DATATYPE_PREALLOC
 #define MPIR_DATATYPE_PREALLOC MPID_DATATYPE_PREALLOC
 #else
 
@@ -11,6 +11,9 @@
 
 #define MPIR_LSUM(a,b) ((a)+(b))
 
+static void bfloat16_sum(void *invec, void *inoutvec, MPI_Aint len);
+static void f16_sum(void *invec, void *inoutvec, MPI_Aint len);
+
 void MPIR_SUM(void *invec, void *inoutvec, MPI_Aint * Len, MPI_Datatype * type)
 {
     MPI_Aint i, len = *Len;
@@ -35,6 +38,14 @@ void MPIR_SUM(void *invec, void *inoutvec, MPI_Aint * Len, MPI_Datatype * type)
             break;                                         \
         }
                 MPIR_OP_TYPE_GROUP(COMPLEX)
+        case MPIR_BFLOAT16:
+            bfloat16_sum(invec, inoutvec, len);
+            break;
+#ifndef MPIR_FLOAT16_CTYPE
+        case MPIR_FLOAT16:
+            f16_sum(invec, inoutvec, len);
+            break;
+#endif
         default:
             MPIR_Assert(0);
             break;
@@ -442,3 +453,75 @@ void MPIR_REPLACE(void *invec, void *inoutvec, MPI_Aint * Len, MPI_Datatype * ty
   fn_fail:
     goto fn_exit;
 }
+
+/* -- internal static routines -- */
+
+/* BFloat16 - software arithmetic
+ * TODO: add hardware support, e.g. via AVX512 intrinsics
+ */
+/* Widen the 16-bit value stored at p to a float.
+ * The stored bits become the upper half of the 32-bit float pattern and the
+ * lower half is zero-filled; the pattern is then reinterpreted via memcpy. */
+static float bfloat16_load(void *p)
+{
+    const uint16_t *src = (uint16_t *) p;
+    uint32_t bits = (uint32_t) *src;
+    float result;
+
+    bits <<= 16;
+    memcpy(&result, &bits, sizeof(float));
+    return result;
+}
+
+/* Narrow the float v to 16 bits (the upper half of its 32-bit pattern) and
+ * store the result at p.
+ *
+ * Rounding is round-to-nearest, ties-to-even. A carry out of the kept
+ * mantissa bits propagates into the exponent, which yields the correct
+ * next-larger value (or infinity on overflow).
+ *
+ * NaN inputs are handled separately: blindly rounding or truncating a NaN
+ * could clear every kept mantissa bit (turning it into infinity) or wrap
+ * around to zero. The sign and upper payload bits are kept and the quiet
+ * bit is forced so the stored value is still a NaN.
+ */
+static void bfloat16_store(void *p, float v)
+{
+    uint32_t u;
+    uint16_t h;
+
+    memcpy(&u, &v, sizeof(float));
+    if ((u & 0x7fffffff) > 0x7f800000) {
+        /* NaN: keep it a NaN after dropping the low 16 bits */
+        h = (uint16_t) ((u >> 16) | 0x0040);
+    } else {
+        /* add 0x7fff plus the lowest kept bit: rounds up when the dropped
+         * part is above one half, or exactly one half with an odd kept value */
+        uint32_t bias = 0x7fff + ((u >> 16) & 1);
+        h = (uint16_t) ((u + bias) >> 16);
+    }
+    /* memcpy avoids assuming p is suitably aligned for uint16_t */
+    memcpy(p, &h, sizeof(h));
+}
+
+/* Element-wise sum over len 2-byte elements: for each k in [0, len) the
+ * element of inoutvec and the element of invec are widened with
+ * bfloat16_load, added as floats, and the result is written back into
+ * inoutvec with bfloat16_store. */
+static void bfloat16_sum(void *invec, void *inoutvec, MPI_Aint len)
+{
+    char *src = (char *) invec;
+    char *dst = (char *) inoutvec;
+
+    for (MPI_Aint k = 0; k < len; k++) {
+        MPI_Aint off = k * 2;   /* byte offset of element k */
+        float acc = bfloat16_load(dst + off);
+        float addend = bfloat16_load(src + off);
+        bfloat16_store(dst + off, acc + addend);
+    }
+}
+
+/* IEEE half-precision 16-bit float - software arithmetic
+ */
+/* Decode the IEEE half-precision (binary16) value stored at p into a float.
+ *
+ * binary16 layout: 1 sign bit, 5 exponent bits (bias 15), 10 fraction bits.
+ * binary32 layout: 1 sign bit, 8 exponent bits (bias 127), 23 fraction bits.
+ *
+ * All encodings are handled:
+ *   - exponent 0x1f        -> infinity or NaN (payload kept in the top bits)
+ *   - exponent 1..0x1e     -> normal number, exponent rebiased by 127 - 15
+ *   - exponent 0, frac !=0 -> subnormal, normalized into a float normal
+ *   - exponent 0, frac ==0 -> signed zero
+ * Every binary16 value is exactly representable as a float, so the
+ * conversion is exact.
+ */
+static float f16_load(void *p)
+{
+    uint16_t a;
+    /* memcpy avoids assuming p is suitably aligned for uint16_t */
+    memcpy(&a, p, sizeof(a));
+
+    uint32_t sign = (uint32_t) (a & 0x8000) << 16;
+    uint32_t exp = (a >> 10) & 0x1f;    /* all 5 exponent bits */
+    uint32_t frac = a & 0x3ff;
+    uint32_t u;
+
+    if (exp == 0x1f) {
+        /* infinity (frac == 0) or NaN (frac != 0) */
+        u = sign | 0x7f800000 | (frac << 13);
+    } else if (exp != 0) {
+        /* normal: rebias exponent (127 - 15 = 0x70), widen fraction */
+        u = sign | ((exp + 0x70) << 23) | (frac << 13);
+    } else if (frac != 0) {
+        /* subnormal: value is frac * 2^-24; shift until the implicit
+         * leading one appears, starting from the float exponent of 2^-14 */
+        exp = 0x71;
+        while (!(frac & 0x400)) {
+            frac <<= 1;
+            exp--;
+        }
+        u = sign | (exp << 23) | ((frac & 0x3ff) << 13);
+    } else {
+        /* signed zero */
+        u = sign;
+    }
+
+    float v;
+    memcpy(&v, &u, sizeof(float));
+    return v;
+}
+
+/* Encode the float v as IEEE half-precision (binary16) and store it at p.
+ *
+ * The 8-bit exponent (bias 127) is rebiased to 5 bits (bias 15) and the
+ * 23-bit fraction is narrowed to 10 bits by dropping the low 13 bits, using
+ * round-to-nearest, ties-to-even. Special ranges:
+ *   - NaN                      -> quiet NaN, top payload bits kept
+ *   - |v| >= 2^16 or infinity  -> infinity
+ *   - 2^-14 <= |v| < 2^16      -> normal (rounding may carry up to infinity)
+ *   - 2^-25 <= |v| < 2^-14     -> subnormal (or smallest normal after rounding)
+ *   - |v| < 2^-25              -> signed zero
+ */
+static void f16_store(void *p, float v)
+{
+    uint32_t u;
+    memcpy(&u, &v, sizeof(float));
+
+    uint16_t sign = (uint16_t) ((u >> 16) & 0x8000);
+    uint32_t mag = u & 0x7fffffff;
+    uint16_t a;
+
+    if (mag > 0x7f800000) {
+        /* NaN: force the quiet bit so the result cannot collapse to infinity */
+        a = (uint16_t) (0x7e00 | ((mag >> 13) & 0x3ff));
+    } else if (mag >= 0x47800000) {
+        /* infinity, or too large for binary16 */
+        a = 0x7c00;
+    } else if (mag >= 0x38800000) {
+        /* normal range: rebias exponent by -(127 - 15), then round the
+         * 13 dropped bits to nearest-even; a carry bumps the exponent */
+        uint32_t r = mag - 0x38000000;
+        r += 0xfff + ((r >> 13) & 1);
+        a = (uint16_t) (r >> 13);
+    } else if (mag >= 0x33000000) {
+        /* subnormal result: result = (1.frac * 2^23) >> (126 - exponent) */
+        uint32_t shift = 126 - (mag >> 23);     /* 14 .. 24 */
+        uint32_t m = (mag & 0x7fffff) | 0x800000;
+        uint32_t q = m >> shift;
+        uint32_t rem = m & ((1u << shift) - 1);
+        uint32_t half = 1u << (shift - 1);
+        if (rem > half || (rem == half && (q & 1))) {
+            q++;
+        }
+        a = (uint16_t) q;
+    } else {
+        /* underflow to zero */
+        a = 0;
+    }
+
+    a |= sign;
+    /* memcpy avoids assuming p is suitably aligned for uint16_t */
+    memcpy(p, &a, sizeof(a));
+}
+
+/* Element-wise sum over len 2-byte elements: for each k in [0, len) the
+ * element of inoutvec and the element of invec are decoded with f16_load,
+ * added as floats, and the result is encoded back into inoutvec with
+ * f16_store. */
+static void f16_sum(void *invec, void *inoutvec, MPI_Aint len)
+{
+    char *src = (char *) invec;
+    char *dst = (char *) inoutvec;
+
+    for (MPI_Aint k = 0; k < len; k++) {
+        MPI_Aint off = k * 2;   /* byte offset of element k */
+        float acc = f16_load(dst + off);
+        float addend = f16_load(src + off);
+        f16_store(dst + off, acc + addend);
+    }
+}
@@ -30,6 +30,7 @@ yaksa_type_t MPII_Typerep_get_yaksa_type(MPI_Datatype type)
 
     switch (MPIR_DATATYPE_GET_RAW_INTERNAL(type)) {
         case MPIR_INT8:
+        case MPIR_FORTRAN_LOGICAL8:
             yaksa_type = YAKSA_TYPE__INT8_T;
             break;
 
@@ -43,6 +44,7 @@ yaksa_type_t MPII_Typerep_get_yaksa_type(MPI_Datatype type)
             break;
 
         case MPIR_INT16:
+        case MPIR_FORTRAN_LOGICAL16:
             yaksa_type = YAKSA_TYPE__INT16_T;
             break;
 
@@ -52,11 +54,13 @@ yaksa_type_t MPII_Typerep_get_yaksa_type(MPI_Datatype type)
 
         case MPIR_FIXED16:
         case MPIR_FLOAT16:
+        case MPIR_BFLOAT16:
         case MPIR_COMPLEX8:
             yaksa_type = TYPEREP_YAKSA_TYPE__FIXED2;
             break;
 
         case MPIR_INT32:
+        case MPIR_FORTRAN_LOGICAL32:
             yaksa_type = YAKSA_TYPE__INT32_T;
             break;
 
@@ -70,6 +74,7 @@ yaksa_type_t MPII_Typerep_get_yaksa_type(MPI_Datatype type)
             break;
 
         case MPIR_INT64:
+        case MPIR_FORTRAN_LOGICAL64:
             yaksa_type = YAKSA_TYPE__INT64_T;
             break;
 
@@ -111,6 +116,7 @@ yaksa_type_t MPII_Typerep_get_yaksa_type(MPI_Datatype type)
         case MPIR_INT128:
         case MPIR_UINT128:
         case MPIR_FLOAT128:
+        case MPIR_FORTRAN_LOGICAL128:
             yaksa_type = TYPEREP_YAKSA_TYPE__FIXED16;
             break;
 
 
@@ -75,8 +75,8 @@ typedef struct {
     } while (0)
 
 /* long double */
-#ifdef HAVE_FLOAT128
-#define EXTERNAL_LONG_DOUBLE_TYPE __float128
+#ifdef MPIR_FLOAT128_CTYPE
+#define EXTERNAL_LONG_DOUBLE_TYPE MPIR_FLOAT128_CTYPE
 #else
 #define EXTERNAL_LONG_DOUBLE_TYPE long double
 #endif
 
@@ -98,6 +98,12 @@ struct MPIR_Datatype_builtin_entry MPIR_Internal_types[] = {
     type_name_entry(OFFSET,             MULTI),           /* 0x44 */
     type_name_entry(COUNT,              MULTI),           /* 0x45 */
     type_name_x(C_FLOAT16,              FLOATING_POINT),  /* 0x46 */
+    type_name_entry(LOGICAL1,           LOGICAL),         /* 0x47 */
+    type_name_entry(LOGICAL2,           LOGICAL),         /* 0x48 */
+    type_name_entry(LOGICAL4,           LOGICAL),         /* 0x49 */
+    type_name_entry(LOGICAL8,           LOGICAL),         /* 0x4a */
+    type_name_entry(LOGICAL16,          LOGICAL),         /* 0x4b */
+    type_name_x(BFLOAT16,               FLOATING_POINT),  /* 0x4c */
     /* *INDENT-ON* */
 };
 
@@ -158,34 +164,61 @@ int MPIR_Datatype_builtintype_alignment(MPI_Datatype type)
         case MPIR_FIXED8:
         case MPIR_INT8:
         case MPIR_UINT8:
-        case MPIR_FLOAT8:
-            return ALIGNOF_INT8_T;
+        case MPIR_COMPLEX8:
+        case MPIR_FORTRAN_LOGICAL8:
+            return MPIR_INT8_ALIGN;
         case MPIR_FIXED16:
         case MPIR_INT16:
         case MPIR_UINT16:
-        case MPIR_FLOAT16:
-            return ALIGNOF_INT16_T;
+        case MPIR_FORTRAN_LOGICAL16:
+            return MPIR_INT16_ALIGN;
         case MPIR_FIXED32:
         case MPIR_INT32:
         case MPIR_UINT32:
-            return ALIGNOF_INT32_T;
+        case MPIR_FORTRAN_LOGICAL32:
+            return MPIR_INT32_ALIGN;
         case MPIR_FIXED64:
         case MPIR_INT64:
         case MPIR_UINT64:
-            return ALIGNOF_INT64_T;
+        case MPIR_FORTRAN_LOGICAL64:
+            return MPIR_INT64_ALIGN;
+#ifdef MPIR_INT128_ALIGN
+        case MPIR_FIXED128:
+        case MPIR_INT128:
+        case MPIR_UINT128:
+        case MPIR_FORTRAN_LOGICAL128:
+            return MPIR_INT128_ALIGN;
+#endif
+#ifdef MPIR_FLOAT16_ALIGN
+        case MPIR_FLOAT16:
+        case MPIR_COMPLEX16:
+        case MPIR_BFLOAT16:
+            return MPIR_FLOAT16_ALIGN;
+#endif
         case MPIR_FLOAT32:
         case MPIR_COMPLEX32:
-            return ALIGNOF_FLOAT;
+            return MPIR_FLOAT32_ALIGN;
         case MPIR_FLOAT64:
         case MPIR_COMPLEX64:
-            return ALIGNOF_DOUBLE;
+            return MPIR_FLOAT64_ALIGN;
+#ifdef MPIR_FLOAT128_ALIGN
+        case MPIR_FLOAT128:
+        case MPIR_COMPLEX128:
+            return MPIR_FLOAT128_ALIGN;
+#endif
+#ifdef MPIR_ALT_FLOAT96_ALIGN
         case MPIR_ALT_FLOAT96:
-        case MPIR_ALT_FLOAT128:
         case MPIR_ALT_COMPLEX96:
+            return MPIR_ALT_FLOAT96_ALIGN;
+#endif
+#ifdef MPIR_ALT_FLOAT128_ALIGN
         case MPIR_ALT_COMPLEX128:
-            return ALIGNOF_LONG_DOUBLE;
+        case MPIR_ALT_FLOAT128:
+            return MPIR_ALT_FLOAT128_ALIGN;
+#endif
         default:
-            /* handle error cases? */
+            /* FIXME: throw error */
+            MPIR_Assert(0);
             return 1;
     }
 }