Skip to content

Commit 68ad926

Browse files
committed
target/mips: Use 8-byte memory ops for msa load/store
Rather than use 4-16 separate operations, use 2 operations plus some byte reordering as necessary. Reviewed-by: Philippe Mathieu-Daudé <[email protected]> Signed-off-by: Richard Henderson <[email protected]>
1 parent 948f886 commit 68ad926

File tree

1 file changed

+71
-130
lines changed

1 file changed

+71
-130
lines changed

target/mips/tcg/msa_helper.c

Lines changed: 71 additions & 130 deletions
Original file line numberDiff line numberDiff line change
@@ -8218,103 +8218,86 @@ void helper_msa_ffint_u_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
82188218
#define MEMOP_IDX(DF)
82198219
#endif
82208220

8221+
#ifdef TARGET_WORDS_BIGENDIAN
8222+
/*
 * Swap the two bytes inside each of the four 16-bit lanes of x,
 * leaving the lanes themselves in place.
 * Only compiled when TARGET_WORDS_BIGENDIAN is defined.
 */
static inline uint64_t bswap16x4(uint64_t x)
8223+
{
8224+
/* Mask selecting the low byte of every 16-bit lane. */
uint64_t m = 0x00ff00ff00ff00ffull;
8225+
/* Low bytes move up by 8 bits, high bytes move down by 8 bits. */
return ((x & m) << 8) | ((x >> 8) & m);
8226+
}
8227+
8228+
/*
 * Byte-swap each of the two 32-bit words of x independently.
 * NOTE(review): relies on bswap64() reversing all eight bytes (which
 * also exchanges the two words) and ror64(.., 32) rotating by 32 bits
 * to put the words back in their original positions -- both helpers
 * are defined elsewhere; confirm against their definitions.
 * Only compiled when TARGET_WORDS_BIGENDIAN is defined.
 */
static inline uint64_t bswap32x2(uint64_t x)
8229+
{
8230+
return ror64(bswap64(x), 32);
8231+
}
8232+
#endif
8233+
82218234
/*
 * MSA byte-format vector load: fill register wd with the 16 bytes
 * starting at addr.  The new code issues two 8-byte little-endian
 * loads (addr + 0 and addr + 8) and writes them to d[0] and d[1];
 * no further byte reordering is applied for this format.
 * ra is the value of GETPC() and is passed to each load.
 */
void helper_msa_ld_b(CPUMIPSState *env, uint32_t wd,
82228235
target_ulong addr)
82238236
{
82248237
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
82258238
uintptr_t ra = GETPC();
8239+
uint64_t d0, d1;
82268240

8227-
#if !defined(HOST_WORDS_BIGENDIAN)
8228-
pwd->b[0] = cpu_ldub_data_ra(env, addr + (0 << DF_BYTE), ra);
8229-
pwd->b[1] = cpu_ldub_data_ra(env, addr + (1 << DF_BYTE), ra);
8230-
pwd->b[2] = cpu_ldub_data_ra(env, addr + (2 << DF_BYTE), ra);
8231-
pwd->b[3] = cpu_ldub_data_ra(env, addr + (3 << DF_BYTE), ra);
8232-
pwd->b[4] = cpu_ldub_data_ra(env, addr + (4 << DF_BYTE), ra);
8233-
pwd->b[5] = cpu_ldub_data_ra(env, addr + (5 << DF_BYTE), ra);
8234-
pwd->b[6] = cpu_ldub_data_ra(env, addr + (6 << DF_BYTE), ra);
8235-
pwd->b[7] = cpu_ldub_data_ra(env, addr + (7 << DF_BYTE), ra);
8236-
pwd->b[8] = cpu_ldub_data_ra(env, addr + (8 << DF_BYTE), ra);
8237-
pwd->b[9] = cpu_ldub_data_ra(env, addr + (9 << DF_BYTE), ra);
8238-
pwd->b[10] = cpu_ldub_data_ra(env, addr + (10 << DF_BYTE), ra);
8239-
pwd->b[11] = cpu_ldub_data_ra(env, addr + (11 << DF_BYTE), ra);
8240-
pwd->b[12] = cpu_ldub_data_ra(env, addr + (12 << DF_BYTE), ra);
8241-
pwd->b[13] = cpu_ldub_data_ra(env, addr + (13 << DF_BYTE), ra);
8242-
pwd->b[14] = cpu_ldub_data_ra(env, addr + (14 << DF_BYTE), ra);
8243-
pwd->b[15] = cpu_ldub_data_ra(env, addr + (15 << DF_BYTE), ra);
8244-
#else
8245-
pwd->b[0] = cpu_ldub_data_ra(env, addr + (7 << DF_BYTE), ra);
8246-
pwd->b[1] = cpu_ldub_data_ra(env, addr + (6 << DF_BYTE), ra);
8247-
pwd->b[2] = cpu_ldub_data_ra(env, addr + (5 << DF_BYTE), ra);
8248-
pwd->b[3] = cpu_ldub_data_ra(env, addr + (4 << DF_BYTE), ra);
8249-
pwd->b[4] = cpu_ldub_data_ra(env, addr + (3 << DF_BYTE), ra);
8250-
pwd->b[5] = cpu_ldub_data_ra(env, addr + (2 << DF_BYTE), ra);
8251-
pwd->b[6] = cpu_ldub_data_ra(env, addr + (1 << DF_BYTE), ra);
8252-
pwd->b[7] = cpu_ldub_data_ra(env, addr + (0 << DF_BYTE), ra);
8253-
pwd->b[8] = cpu_ldub_data_ra(env, addr + (15 << DF_BYTE), ra);
8254-
pwd->b[9] = cpu_ldub_data_ra(env, addr + (14 << DF_BYTE), ra);
8255-
pwd->b[10] = cpu_ldub_data_ra(env, addr + (13 << DF_BYTE), ra);
8256-
pwd->b[11] = cpu_ldub_data_ra(env, addr + (12 << DF_BYTE), ra);
8257-
pwd->b[12] = cpu_ldub_data_ra(env, addr + (11 << DF_BYTE), ra);
8258-
pwd->b[13] = cpu_ldub_data_ra(env, addr + (10 << DF_BYTE), ra);
8259-
pwd->b[14] = cpu_ldub_data_ra(env, addr + (9 << DF_BYTE), ra);
8260-
pwd->b[15] = cpu_ldub_data_ra(env, addr + (8 << DF_BYTE), ra);
8261-
#endif
8241+
/* Load 8 bytes at a time. Vector element ordering makes this LE. */
8242+
d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
8243+
d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
8244+
pwd->d[0] = d0;
8245+
pwd->d[1] = d1;
82628246
}
82638247

82648248
/*
 * MSA halfword-format vector load: fill register wd with the 16 bytes
 * starting at addr.  The new code issues two 8-byte little-endian
 * loads; when TARGET_WORDS_BIGENDIAN is defined each result is passed
 * through bswap16x4() before being written to d[0] and d[1].
 * ra is the value of GETPC() and is passed to each load.
 */
void helper_msa_ld_h(CPUMIPSState *env, uint32_t wd,
82658249
target_ulong addr)
82668250
{
82678251
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
82688252
uintptr_t ra = GETPC();
8253+
uint64_t d0, d1;
82698254

8270-
#if !defined(HOST_WORDS_BIGENDIAN)
8271-
pwd->h[0] = cpu_lduw_data_ra(env, addr + (0 << DF_HALF), ra);
8272-
pwd->h[1] = cpu_lduw_data_ra(env, addr + (1 << DF_HALF), ra);
8273-
pwd->h[2] = cpu_lduw_data_ra(env, addr + (2 << DF_HALF), ra);
8274-
pwd->h[3] = cpu_lduw_data_ra(env, addr + (3 << DF_HALF), ra);
8275-
pwd->h[4] = cpu_lduw_data_ra(env, addr + (4 << DF_HALF), ra);
8276-
pwd->h[5] = cpu_lduw_data_ra(env, addr + (5 << DF_HALF), ra);
8277-
pwd->h[6] = cpu_lduw_data_ra(env, addr + (6 << DF_HALF), ra);
8278-
pwd->h[7] = cpu_lduw_data_ra(env, addr + (7 << DF_HALF), ra);
8279-
#else
8280-
pwd->h[0] = cpu_lduw_data_ra(env, addr + (3 << DF_HALF), ra);
8281-
pwd->h[1] = cpu_lduw_data_ra(env, addr + (2 << DF_HALF), ra);
8282-
pwd->h[2] = cpu_lduw_data_ra(env, addr + (1 << DF_HALF), ra);
8283-
pwd->h[3] = cpu_lduw_data_ra(env, addr + (0 << DF_HALF), ra);
8284-
pwd->h[4] = cpu_lduw_data_ra(env, addr + (7 << DF_HALF), ra);
8285-
pwd->h[5] = cpu_lduw_data_ra(env, addr + (6 << DF_HALF), ra);
8286-
pwd->h[6] = cpu_lduw_data_ra(env, addr + (5 << DF_HALF), ra);
8287-
pwd->h[7] = cpu_lduw_data_ra(env, addr + (4 << DF_HALF), ra);
8255+
/*
8256+
* Load 8 bytes at a time. Use little-endian load, then for
8257+
* big-endian target, we must then swap the four halfwords.
8258+
*/
8259+
d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
8260+
d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
8261+
#ifdef TARGET_WORDS_BIGENDIAN
8262+
d0 = bswap16x4(d0);
8263+
d1 = bswap16x4(d1);
82888264
#endif
8265+
pwd->d[0] = d0;
8266+
pwd->d[1] = d1;
82898267
}
82908268

82918269
/*
 * MSA word-format vector load: fill register wd with the 16 bytes
 * starting at addr.  The new code issues two 8-byte little-endian
 * loads; when TARGET_WORDS_BIGENDIAN is defined each result is passed
 * through bswap32x2() before being written to d[0] and d[1].
 * ra is the value of GETPC() and is passed to each load.
 */
void helper_msa_ld_w(CPUMIPSState *env, uint32_t wd,
82928270
target_ulong addr)
82938271
{
82948272
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
82958273
uintptr_t ra = GETPC();
8274+
uint64_t d0, d1;
82968275

8297-
#if !defined(HOST_WORDS_BIGENDIAN)
8298-
pwd->w[0] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
8299-
pwd->w[1] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
8300-
pwd->w[2] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
8301-
pwd->w[3] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
8302-
#else
8303-
pwd->w[0] = cpu_ldl_data_ra(env, addr + (1 << DF_WORD), ra);
8304-
pwd->w[1] = cpu_ldl_data_ra(env, addr + (0 << DF_WORD), ra);
8305-
pwd->w[2] = cpu_ldl_data_ra(env, addr + (3 << DF_WORD), ra);
8306-
pwd->w[3] = cpu_ldl_data_ra(env, addr + (2 << DF_WORD), ra);
8276+
/*
8277+
* Load 8 bytes at a time. Use little-endian load, then for
8278+
* big-endian target, we must then bswap the two words.
8279+
*/
8280+
d0 = cpu_ldq_le_data_ra(env, addr + 0, ra);
8281+
d1 = cpu_ldq_le_data_ra(env, addr + 8, ra);
8282+
#ifdef TARGET_WORDS_BIGENDIAN
8283+
d0 = bswap32x2(d0);
8284+
d1 = bswap32x2(d1);
83078285
#endif
8286+
pwd->d[0] = d0;
8287+
pwd->d[1] = d1;
83088288
}
83098289

83108290
/*
 * MSA doubleword-format vector load: fill register wd with the 16
 * bytes starting at addr as two 8-byte loads (addr + 0 and addr + 8).
 * Unlike the narrower formats this uses cpu_ldq_data_ra() rather than
 * the explicit _le variant and applies no byte reordering afterwards.
 * Both loads are issued before either half of the register is written.
 */
void helper_msa_ld_d(CPUMIPSState *env, uint32_t wd,
83118291
target_ulong addr)
83128292
{
83138293
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
83148294
uintptr_t ra = GETPC();
8295+
uint64_t d0, d1;
83158296

8316-
pwd->d[0] = cpu_ldq_data_ra(env, addr + (0 << DF_DOUBLE), ra);
8317-
pwd->d[1] = cpu_ldq_data_ra(env, addr + (1 << DF_DOUBLE), ra);
8297+
d0 = cpu_ldq_data_ra(env, addr + 0, ra);
8298+
d1 = cpu_ldq_data_ra(env, addr + 8, ra);
8299+
pwd->d[0] = d0;
8300+
pwd->d[1] = d1;
83188301
}
83198302

83208303
#define MSA_PAGESPAN(x) \
@@ -8344,41 +8327,9 @@ void helper_msa_st_b(CPUMIPSState *env, uint32_t wd,
83448327

83458328
ensure_writable_pages(env, addr, mmu_idx, ra);
83468329

8347-
#if !defined(HOST_WORDS_BIGENDIAN)
8348-
cpu_stb_data_ra(env, addr + (0 << DF_BYTE), pwd->b[0], ra);
8349-
cpu_stb_data_ra(env, addr + (1 << DF_BYTE), pwd->b[1], ra);
8350-
cpu_stb_data_ra(env, addr + (2 << DF_BYTE), pwd->b[2], ra);
8351-
cpu_stb_data_ra(env, addr + (3 << DF_BYTE), pwd->b[3], ra);
8352-
cpu_stb_data_ra(env, addr + (4 << DF_BYTE), pwd->b[4], ra);
8353-
cpu_stb_data_ra(env, addr + (5 << DF_BYTE), pwd->b[5], ra);
8354-
cpu_stb_data_ra(env, addr + (6 << DF_BYTE), pwd->b[6], ra);
8355-
cpu_stb_data_ra(env, addr + (7 << DF_BYTE), pwd->b[7], ra);
8356-
cpu_stb_data_ra(env, addr + (8 << DF_BYTE), pwd->b[8], ra);
8357-
cpu_stb_data_ra(env, addr + (9 << DF_BYTE), pwd->b[9], ra);
8358-
cpu_stb_data_ra(env, addr + (10 << DF_BYTE), pwd->b[10], ra);
8359-
cpu_stb_data_ra(env, addr + (11 << DF_BYTE), pwd->b[11], ra);
8360-
cpu_stb_data_ra(env, addr + (12 << DF_BYTE), pwd->b[12], ra);
8361-
cpu_stb_data_ra(env, addr + (13 << DF_BYTE), pwd->b[13], ra);
8362-
cpu_stb_data_ra(env, addr + (14 << DF_BYTE), pwd->b[14], ra);
8363-
cpu_stb_data_ra(env, addr + (15 << DF_BYTE), pwd->b[15], ra);
8364-
#else
8365-
cpu_stb_data_ra(env, addr + (7 << DF_BYTE), pwd->b[0], ra);
8366-
cpu_stb_data_ra(env, addr + (6 << DF_BYTE), pwd->b[1], ra);
8367-
cpu_stb_data_ra(env, addr + (5 << DF_BYTE), pwd->b[2], ra);
8368-
cpu_stb_data_ra(env, addr + (4 << DF_BYTE), pwd->b[3], ra);
8369-
cpu_stb_data_ra(env, addr + (3 << DF_BYTE), pwd->b[4], ra);
8370-
cpu_stb_data_ra(env, addr + (2 << DF_BYTE), pwd->b[5], ra);
8371-
cpu_stb_data_ra(env, addr + (1 << DF_BYTE), pwd->b[6], ra);
8372-
cpu_stb_data_ra(env, addr + (0 << DF_BYTE), pwd->b[7], ra);
8373-
cpu_stb_data_ra(env, addr + (15 << DF_BYTE), pwd->b[8], ra);
8374-
cpu_stb_data_ra(env, addr + (14 << DF_BYTE), pwd->b[9], ra);
8375-
cpu_stb_data_ra(env, addr + (13 << DF_BYTE), pwd->b[10], ra);
8376-
cpu_stb_data_ra(env, addr + (12 << DF_BYTE), pwd->b[11], ra);
8377-
cpu_stb_data_ra(env, addr + (11 << DF_BYTE), pwd->b[12], ra);
8378-
cpu_stb_data_ra(env, addr + (10 << DF_BYTE), pwd->b[13], ra);
8379-
cpu_stb_data_ra(env, addr + (9 << DF_BYTE), pwd->b[14], ra);
8380-
cpu_stb_data_ra(env, addr + (8 << DF_BYTE), pwd->b[15], ra);
8381-
#endif
8330+
/*
 * Store 8 bytes at a time. Vector element ordering makes this LE.
 * d[0] covers bytes 0..7 at addr + 0 and d[1] covers bytes 8..15 at
 * addr + 8, mirroring the two loads in helper_msa_ld_b.
 */
8331+
cpu_stq_le_data_ra(env, addr + 0, pwd->d[0], ra);
8332+
cpu_stq_le_data_ra(env, addr + 8, pwd->d[1], ra);
83828333
}
83838334

83848335
void helper_msa_st_h(CPUMIPSState *env, uint32_t wd,
@@ -8387,28 +8338,19 @@ void helper_msa_st_h(CPUMIPSState *env, uint32_t wd,
83878338
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
83888339
int mmu_idx = cpu_mmu_index(env, false);
83898340
uintptr_t ra = GETPC();
8341+
uint64_t d0, d1;
83908342

83918343
ensure_writable_pages(env, addr, mmu_idx, ra);
83928344

8393-
#if !defined(HOST_WORDS_BIGENDIAN)
8394-
cpu_stw_data_ra(env, addr + (0 << DF_HALF), pwd->h[0], ra);
8395-
cpu_stw_data_ra(env, addr + (1 << DF_HALF), pwd->h[1], ra);
8396-
cpu_stw_data_ra(env, addr + (2 << DF_HALF), pwd->h[2], ra);
8397-
cpu_stw_data_ra(env, addr + (3 << DF_HALF), pwd->h[3], ra);
8398-
cpu_stw_data_ra(env, addr + (4 << DF_HALF), pwd->h[4], ra);
8399-
cpu_stw_data_ra(env, addr + (5 << DF_HALF), pwd->h[5], ra);
8400-
cpu_stw_data_ra(env, addr + (6 << DF_HALF), pwd->h[6], ra);
8401-
cpu_stw_data_ra(env, addr + (7 << DF_HALF), pwd->h[7], ra);
8402-
#else
8403-
cpu_stw_data_ra(env, addr + (3 << DF_HALF), pwd->h[0], ra);
8404-
cpu_stw_data_ra(env, addr + (2 << DF_HALF), pwd->h[1], ra);
8405-
cpu_stw_data_ra(env, addr + (1 << DF_HALF), pwd->h[2], ra);
8406-
cpu_stw_data_ra(env, addr + (0 << DF_HALF), pwd->h[3], ra);
8407-
cpu_stw_data_ra(env, addr + (7 << DF_HALF), pwd->h[4], ra);
8408-
cpu_stw_data_ra(env, addr + (6 << DF_HALF), pwd->h[5], ra);
8409-
cpu_stw_data_ra(env, addr + (5 << DF_HALF), pwd->h[6], ra);
8410-
cpu_stw_data_ra(env, addr + (4 << DF_HALF), pwd->h[7], ra);
8345+
/* Store 8 bytes at a time. See helper_msa_ld_h. */
8346+
d0 = pwd->d[0];
8347+
d1 = pwd->d[1];
8348+
#ifdef TARGET_WORDS_BIGENDIAN
8349+
d0 = bswap16x4(d0);
8350+
d1 = bswap16x4(d1);
84118351
#endif
8352+
cpu_stq_le_data_ra(env, addr + 0, d0, ra);
8353+
cpu_stq_le_data_ra(env, addr + 8, d1, ra);
84128354
}
84138355

84148356
void helper_msa_st_w(CPUMIPSState *env, uint32_t wd,
@@ -8417,20 +8359,19 @@ void helper_msa_st_w(CPUMIPSState *env, uint32_t wd,
84178359
wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
84188360
int mmu_idx = cpu_mmu_index(env, false);
84198361
uintptr_t ra = GETPC();
8362+
uint64_t d0, d1;
84208363

84218364
ensure_writable_pages(env, addr, mmu_idx, ra);
84228365

8423-
#if !defined(HOST_WORDS_BIGENDIAN)
8424-
cpu_stl_data_ra(env, addr + (0 << DF_WORD), pwd->w[0], ra);
8425-
cpu_stl_data_ra(env, addr + (1 << DF_WORD), pwd->w[1], ra);
8426-
cpu_stl_data_ra(env, addr + (2 << DF_WORD), pwd->w[2], ra);
8427-
cpu_stl_data_ra(env, addr + (3 << DF_WORD), pwd->w[3], ra);
8428-
#else
8429-
cpu_stl_data_ra(env, addr + (1 << DF_WORD), pwd->w[0], ra);
8430-
cpu_stl_data_ra(env, addr + (0 << DF_WORD), pwd->w[1], ra);
8431-
cpu_stl_data_ra(env, addr + (3 << DF_WORD), pwd->w[2], ra);
8432-
cpu_stl_data_ra(env, addr + (2 << DF_WORD), pwd->w[3], ra);
8366+
/* Store 8 bytes at a time. See helper_msa_ld_w. */
8367+
d0 = pwd->d[0];
8368+
d1 = pwd->d[1];
8369+
#ifdef TARGET_WORDS_BIGENDIAN
8370+
d0 = bswap32x2(d0);
8371+
d1 = bswap32x2(d1);
84338372
#endif
8373+
cpu_stq_le_data_ra(env, addr + 0, d0, ra);
8374+
cpu_stq_le_data_ra(env, addr + 8, d1, ra);
84348375
}
84358376

84368377
void helper_msa_st_d(CPUMIPSState *env, uint32_t wd,
@@ -8442,6 +8383,6 @@ void helper_msa_st_d(CPUMIPSState *env, uint32_t wd,
84428383

84438384
ensure_writable_pages(env, addr, mmu_idx, GETPC());
84448385

8445-
cpu_stq_data_ra(env, addr + (0 << DF_DOUBLE), pwd->d[0], ra);
8446-
cpu_stq_data_ra(env, addr + (1 << DF_DOUBLE), pwd->d[1], ra);
8386+
cpu_stq_data_ra(env, addr + 0, pwd->d[0], ra);
8387+
cpu_stq_data_ra(env, addr + 8, pwd->d[1], ra);
84478388
}

0 commit comments

Comments
 (0)