raspberrypi · kilograham · Feb 4, 2025 · Jan 23, 2025 · Jan 24, 2025 · Jan 24, 2025
diff --git a/src/rp2_common/pico_double/double_aeabi_dcp.S b/src/rp2_common/pico_double/double_aeabi_dcp.S
@@ -128,53 +128,131 @@ saving_func wrapper sqrt
   dcp_dsqrt_m r0,r1,r0,r1,r0,r1,r2,r3,r12
   saving_func_return
 
-// todo not a real thing
-double_wrapper_section __aeabi_dclassify
-saving_func wrapper __aeabi_dclassify
+double_section dclassify
+saving_func regular dclassify // plain symbol; takes over from the placeholder __aeabi_dclassify wrapper
 @ with correct rounding
   dcp_dclassify_m apsr_nzcv,r0,r1
   saving_func_return
 
 // ============== CONVERSION FUNCTIONS ===============
 
 double_wrapper_section __aeabi_d2f
+regular_func double2float // NOTE(review): label is emitted ahead of the saving_func expansion; if that macro places a state-save preamble before its entry label, this alias enters through the preamble - confirm
 saving_func wrapper __aeabi_d2f
 @ with rounding
   dcp_double2float_m r0,r0,r1
   saving_func_return
 
 double_wrapper_section __aeabi_i2d
+regular_func int2double // NOTE(review): emitted ahead of the saving_func expansion - confirm this alias resolves to the same entry point as __aeabi_i2d
 saving_func wrapper __aeabi_i2d
   dcp_int2double_m r0,r1,r0
   saving_func_return
 
 double_wrapper_section __aeabi_ui2d
+regular_func uint2double // NOTE(review): emitted ahead of the saving_func expansion - confirm this alias resolves to the same entry point as __aeabi_ui2d
 saving_func wrapper __aeabi_ui2d
   dcp_uint2double_m r0,r1,r0
   saving_func_return
 
+double_section double2fix_z
+saving_func regular double2fix_z
+  ubfx r3, r1, #20, #11 // r3 = 11-bit exponent field taken from the high word (r1)
+  adds r3, r2 // add r2 to the exponent field (presumably the fixed-point scale e, i.e. multiply by 2^e - confirm against the C prototype)
+  beq 1f // very small; we don't care that we might make a denormal
+  asrs ip, r3, #11 // zero iff the adjusted exponent still fits in the 11-bit field
+  beq 1f
+  ite pl
+  movpl r3, #0x7ff // too large for the field: use the all-ones exponent
+  movsmi r3, #0 // went negative: use exponent field 0
+1:
+  bfi r1, r3, #20, #11 // write the adjusted exponent field back into r1
+  b double2int_z_entry // continue in the truncating double->int conversion
+
+double_section double2ufix
+regular_func double2ufix // NOTE(review): emitted ahead of the saving_func expansion - confirm it resolves to the same entry point as double2ufix_z
+saving_func regular double2ufix_z
+double2ufix_z_entry:
+  ubfx r3, r1, #20, #11 // r3 = 11-bit exponent field taken from the high word (r1)
+  adds r3, r2 // add r2 to the exponent field (presumably the fixed-point scale e - confirm against the C prototype)
+  beq 1f // very small; we don't care that we might make a denormal
+  asrs ip, r3, #11 // zero iff the adjusted exponent still fits in the 11-bit field
+  beq 1f
+  ite pl
+  movpl r3, #0x7ff // too large: saturate the exponent field, matching double2fix/double2fix_z (was a shift of r1 that re-inserted the unscaled exponent)
+  movsmi r3, #0 // went negative: use exponent field 0
+1:
+  bfi r1, r3, #20, #11 // write the adjusted exponent field back into r1
+  b double2uint_z_entry // continue in the truncating double->uint conversion
+
+double_section double2fix
+saving_func regular double2fix
+  ubfx r3, r1, #20, #11 // r3 = 11-bit exponent field taken from the high word (r1)
+  adds r3, r2 // add r2 to the exponent field (presumably the fixed-point scale e, i.e. multiply by 2^e - confirm against the C prototype)
+  beq 1f // very small; we don't care that we might make a denormal
+  asrs ip, r3, #11 // zero iff the adjusted exponent still fits in the 11-bit field
+  beq 1f
+  ite pl
+  movpl r3, #0x7ff // too large for the field: use the all-ones exponent
+  movsmi r3, #0 // went negative: use exponent field 0
+1:
+  bfi r1, r3, #20, #11 // write the adjusted exponent field back into r1
+  b double2int_entry // continue in the body of double2int with the rescaled value
+
+double_section double2int
+saving_func regular double2int
+double2int_entry:
+  // double (r0 = low word, r1 = high word) -> int, rounding towards minus infinity:
+  // only a negative input with a non-zero fraction differs from truncation, by exactly one
+  lsls r2, r1, #1 // C = sign bit, r2 = exponent field followed by the high mantissa bits
+  bcc double2int_z_entry // positive is ok for int_z
+  orrs r3, r2, r0
+  beq double2int_z_entry // -0 is ok for int_z
+
+  lsrs r2, #21 // r2 = biased exponent
+  adds r2, #1
+  subs r2, r2, #0x400 // r2 = unbiased exponent, borrow if it is negative
+  bcc 1f // <1 means subtract 1
+  cmp r2, #31
+  bge double2int_z_entry // |x| >= 2^31, inf or nan: truncating path only (NOTE(review): assumes it clamps; subtracting 1 afterwards could wrap)
+  lsls r3, r1, #12
+  adds r3, r3, r0, lsr #20
+  // r3 now has highest 32 mantissa bits
+  lsls r3, r2 // shift out the integer bits
+  bne 1f // not integer as non zero fractional bits remain
+  lsls r3, r0, #12 // lowest 20 mantissa bits: all fractional because the exponent is below 31
+  beq double2int_z_entry // integer
+1:
+  dcp_double2int_m r0,r0,r1
+  subs r0, #1
+saving_func_return
+
 double_wrapper_section __aeabi_d2iz
+regular_func double2int_z // NOTE(review): emitted ahead of the saving_func expansion - confirm this alias resolves to the same entry point as __aeabi_d2iz
 saving_func wrapper __aeabi_d2iz
+double2int_z_entry: // shared body: the double2fix_z and double2int code in this file branches here
 @ with truncation towards 0
   dcp_double2int_m r0,r0,r1
+  // note: this works with either saved or not saved call as it is just a `bx lr`
   saving_func_return
 
 double_wrapper_section __aeabi_d2uiz
+regular_func double2uint_z // NOTE(review): emitted ahead of the saving_func expansion - confirm this alias resolves to the same entry point as __aeabi_d2uiz
+regular_func double2uint // second name for the same code
 saving_func wrapper __aeabi_d2uiz
+double2uint_z_entry: // shared body: the double2ufix/double2ufix_z code in this file branches here
 @ with truncation towards 0
   dcp_double2uint_m r0,r0,r1
   saving_func_return
 
-// todo not a real thing
-double_wrapper_section __aeabi_d2i_r
-saving_func wrapper __aeabi_d2i_r
+double_section double2int_r
+saving_func regular double2int_r // plain symbol; takes over from the placeholder __aeabi_d2i_r wrapper
 @ with rounding
   dcp_double2int_r_m r0,r0,r1
   saving_func_return
 
-// todo not a real thing
-double_wrapper_section __aeabi_d2ui_r
-saving_func wrapper __aeabi_d2ui_r
+double_section double2uint_r
+saving_func regular double2uint_r // plain symbol; takes over from the placeholder __aeabi_d2ui_r wrapper
 @ with rounding
   dcp_double2uint_r_m r0,r0,r1
   saving_func_return

diff --git a/src/rp2_common/pico_double/double_aeabi_rp2040.S b/src/rp2_common/pico_double/double_aeabi_rp2040.S
@@ -425,13 +425,15 @@ double_wrapper_section __aeabi_ui2d
 double_wrapper_section __aeabi_i2d
 
 wrapper_func __aeabi_ui2d
+regular_func uint2double
     movs r1, #0
     cmp r0, #0
     bne 2f
 1:
     bx lr
 // double FUNC_NAME(__aeabi_i2d)(int)                     integer to double (double precision) conversion
 wrapper_func __aeabi_i2d
+regular_func int2double
     asrs r1, r0, #31
     eors r0, r1
     subs r0, r1
@@ -506,6 +508,7 @@ regular_func double2int
 // unsigned FUNC_NAME(__aeabi_d2uiz)(double)             double (double precision) to unsigned C-style conversion [3]
 double_wrapper_section __aeabi_d2uiz
 wrapper_func __aeabi_d2uiz
+regular_func double2uint_z
 regular_func double2uint
     shimmable_table_tail_call SF_TABLE_FLOAT2UINT double2uint_shim
 
@@ -528,11 +531,13 @@ regular_func ufix642double
 // double FUNC_NAME(__aeabi_l2d)(long long)             long long to double (double precision) conversion
 double_wrapper_section __aeabi_l2d
 wrapper_func __aeabi_l2d
+regular_func int642double
     shimmable_table_tail_call SF_TABLE_INT642FLOAT int642double_shim
 
 // double FUNC_NAME(__aeabi_l2f)(long long)             long long to double (double precision) conversion
 double_wrapper_section __aeabi_ul2d
 wrapper_func __aeabi_ul2d
+regular_func uint642double
     shimmable_table_tail_call SF_TABLE_UINT642FLOAT uint642double_shim
 
 // long long FUNC_NAME(__aeabi_d2lz)(double)             double (double precision) to long long C-style conversion [3]
@@ -566,14 +571,55 @@ regular_func double2int64
 // unsigned long long FUNC_NAME(__aeabi_d2ulz)(double)     double to unsigned long long C-style conversion [3]
 double_wrapper_section __aeabi_d2ulz
 wrapper_func __aeabi_d2ulz
+regular_func double2uint64
+regular_func double2uint64_z
     shimmable_table_tail_call SF_TABLE_FLOAT2UINT64 double2uint64_shim
 
+double_section double2fix64_z
+regular_func double2fix64_z
+  lsls r3, r1, #1 // C = sign bit, r3 = exponent field followed by the high mantissa bits
+  bcc double2fix64 // positive is ok for fix64
+  orrs r3, r0
+  beq double2fix64 // -0 is ok for fix64 (it must not take the "add 1" path)
+  push {r1, r2} // r1/r2 double as scratch below; every exit after this point pops them
+  lsls r3, r1, #1
+  lsrs r3, #21 // r3 = biased exponent
+  adds r3, r2 // scale by 2^e (r2 = e)
+  movs r2, #1
+  negs r2, r2
+  lsrs r2, #22 // r2 = 0x3ff
+  subs r3, r2 // r3 = unbiased exponent of the scaled value
+  blt 1f // magnitude < 1: non-integer (signed test, so a negative sum is handled too)
+  cmp r3, #52
+  bge 2f // must be an integer
+  lsls r1, r1, #12 // drop sign and exponent: high-word mantissa bits now at the top of r1
+  lsls r1, r3 // shift out the integer bits (result is 0 once the exponent reaches 20)
+  bne 1f // not integer as non zero fractional bits remain
+  subs r3, #20
+  asrs r2, r3, #31
+  bics r3, r2 // r3 = max(exponent - 20, 0) = number of integer bits held in r0
+  movs r2, r0
+  lsls r2, r3 // fractional bits held in the low word
+  bne 1f
+2: // integer
+  pop {r1, r2}
+  b double2fix64
+1: // negative non-integer: presumably double2fix64 rounds towards minus infinity, so add 1 to truncate
+  pop {r1, r2}
+  push {lr}
+  bl double2fix64
+  movs r2, #0
+  adds r0, #1
+  adcs r1, r2
+  pop {pc}
+
 double_section double2fix64
 regular_func double2fix64
     shimmable_table_tail_call SF_TABLE_FLOAT2FIX64 double2fix64_shim
 
 double_section double2ufix64
 regular_func double2ufix64
+regular_func double2ufix64_z
     shimmable_table_tail_call SF_TABLE_FLOAT2UFIX64 double2ufix64_shim
 
 double_section double2fix
@@ -582,6 +628,7 @@ regular_func double2fix
 
 double_section double2ufix
 regular_func double2ufix
+regular_func double2ufix_z
     shimmable_table_tail_call SF_TABLE_FLOAT2UFIX double2ufix_shim
 
 double_wrapper_section __aeabi_d2f

diff --git a/src/rp2_common/pico_double/double_conv_m33.S b/src/rp2_common/pico_double/double_conv_m33.S
@@ -249,7 +249,61 @@ regular_func ufix2double
  movs r1,#0
  bx r14
 
-double_wrapper_section conv_dtoi64
+double_section conv_dtoi64
+regular_func double2int64
+  lsls r2, r1, #1 // C = sign bit, r2 = exponent field followed by the high mantissa bits
+  bcc double2int64_z // positive is ok for int64_z
+  orrs r3, r2, r0
+  beq double2int64_z // 0 or -0 is ok for int64_z
+
+  lsrs r2, #21 // r2 = biased exponent
+  adds r2, #1
+  subs r2, r2, #0x400 // r2 = unbiased exponent, borrow if it is negative
+  bcc 1f // <1 means subtract 1
+  cmp r2, #52
+  bge double2int64_z // must be an integer
+  // the value is an integer iff every mantissa bit below 2^0 is zero: test the high word's bits, then the low word's
+  add r3, r2, #12
+  lsls r3, r1, r3 // shift out sign, exponent and integer bits (result is 0 once the exponent reaches 20)
+  bne 1f // not integer as non zero fractional bits remain
+  subs r2, #20
+  bic r2, r2, r2, asr #31 // r2 = max(exponent - 20, 0) = number of integer bits held in r0
+  lsls r3, r0, r2 // fractional bits held in the low word
+  beq double2int64_z // integer
+1:
+  push {lr}
+  bl double2int64_z
+  subs r0, #1
+  sbcs r1, r1, #0
+  pop {pc}
+
+double_section conv_dtofix64
+regular_func double2fix64
+  lsls r3, r1, #1 // C = sign bit, r3 = exponent field followed by the high mantissa bits
+  bcc double2fix64_z // positive is ok for fix64_z
+  orrs ip, r3, r0
+  beq double2fix64_z // -0 is ok for fix64_z (it must not take the "subtract 1" path)
+  lsrs r3, #21 // r3 = biased exponent
+  subw ip, r2, #0x3ff // ip = e - bias (r2 = e is left intact for double2fix64_z)
+  adds r3, r3, ip // r3 = unbiased exponent of the scaled value
+  bmi 1f // <1 means subtract 1
+  cmp r3, #52
+  bge double2fix64_z // must be an integer
+  add ip, r3, #12
+  lsls ip, r1, ip // shift out sign, exponent and integer bits (result is 0 once the exponent reaches 20)
+  bne 1f // not integer as non zero fractional bits remain
+  subs r3, #20
+  bic r3, r3, r3, asr #31 // r3 = max(exponent - 20, 0) = number of integer bits held in r0
+  lsls ip, r0, r3 // fractional bits held in the low word
+  beq double2fix64_z // integer
+1:
+  push {lr}
+  bl double2fix64_z
+  subs r0, #1
+  sbcs r1, r1, #0
+  pop {pc}
+
+double_wrapper_section conv_dtoi64_z
 
 @ convert double to signed int64, rounding towards 0, clamping
 wrapper_func __aeabi_d2lz

diff --git a/src/rp2_common/pico_double/double_fma_dcp.S b/src/rp2_common/pico_double/double_fma_dcp.S
@@ -582,7 +582,7 @@ wrapper_func fma
  saving_func_return
 
 
-double_wrapper_section __dmla
+double_section fma_fast
 @ cf saving_func macro: but here we need to record the SP before the state save possibly changes it
 1:
  push {lr}              // 16-bit instruction
@@ -592,6 +592,7 @@ double_wrapper_section __dmla
 @ r0:r1 m
 @ r2:r3 n
 @ [r13,#0] a
+regular_func fma_fast
 regular_func mla
  mov r12,sp                  @ save the SP
  PCMP apsr_nzcv              @ test the engaged flag