@@ -30,6 +30,126 @@ namespace lsp
 {
     namespace asimd
     {
+        IF_ARCH_AARCH64(
+            static const uint32_t rgba32_to_bgra32_const[] __lsp_aligned16 =
+            {
+                LSP_DSP_VEC4(0x00ff00ff),
+                LSP_DSP_VEC4(0x00ff00ff),
+            };
+        );
+
+        void rgba32_to_bgra32(void *dst, const void *src, size_t count)
+        {
+            ARCH_AARCH64_ASM(
+                __ASM_EMIT("ldp         q16, q17, [%[XC]]")
+
+                // 32x blocks
+                __ASM_EMIT("subs        %[count], %[count], #32")
+                __ASM_EMIT("b.lo        2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("ldp         q0, q1, [%[src], 0x00]")        // v0 = R G B A
+                __ASM_EMIT("ldp         q2, q3, [%[src], 0x20]")
+                __ASM_EMIT("ldp         q4, q5, [%[src], 0x40]")
+                __ASM_EMIT("ldp         q6, q7, [%[src], 0x60]")
+                __ASM_EMIT("rev32       v8.8h, v0.8h")                  // v8 = B A R G
+                __ASM_EMIT("rev32       v9.8h, v1.8h")
+                __ASM_EMIT("rev32       v10.8h, v2.8h")
+                __ASM_EMIT("rev32       v11.8h, v3.8h")
+                __ASM_EMIT("rev32       v12.8h, v4.8h")
+                __ASM_EMIT("rev32       v13.8h, v5.8h")
+                __ASM_EMIT("rev32       v14.8h, v6.8h")
+                __ASM_EMIT("rev32       v15.8h, v7.8h")
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")       // v0 = B G R A
+                __ASM_EMIT("bit         v1.16b, v9.16b, v17.16b")
+                __ASM_EMIT("bit         v2.16b, v10.16b, v16.16b")
+                __ASM_EMIT("bit         v3.16b, v11.16b, v17.16b")
+                __ASM_EMIT("bit         v4.16b, v12.16b, v16.16b")
+                __ASM_EMIT("bit         v5.16b, v13.16b, v17.16b")
+                __ASM_EMIT("bit         v6.16b, v14.16b, v16.16b")
+                __ASM_EMIT("bit         v7.16b, v15.16b, v17.16b")
+                __ASM_EMIT("stp         q0, q1, [%[dst], 0x00]")
+                __ASM_EMIT("stp         q2, q3, [%[dst], 0x20]")
+                __ASM_EMIT("stp         q4, q5, [%[dst], 0x40]")
+                __ASM_EMIT("stp         q6, q7, [%[dst], 0x60]")
+                __ASM_EMIT("subs        %[count], %[count], #32")
+                __ASM_EMIT("add         %[src], %[src], 0x80")
+                __ASM_EMIT("add         %[dst], %[dst], 0x80")
+                __ASM_EMIT("b.hs        1b")
+
+                // 16x blocks
+                __ASM_EMIT("2:")
+                __ASM_EMIT("adds        %[count], %[count], #16")
+                __ASM_EMIT("b.lt        4f")
+                __ASM_EMIT("ldp         q0, q1, [%[src], 0x00]")        // v0 = R G B A
+                __ASM_EMIT("ldp         q2, q3, [%[src], 0x20]")
+                __ASM_EMIT("rev32       v8.8h, v0.8h")                  // v8 = B A R G
+                __ASM_EMIT("rev32       v9.8h, v1.8h")
+                __ASM_EMIT("rev32       v10.8h, v2.8h")
+                __ASM_EMIT("rev32       v11.8h, v3.8h")
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")       // v0 = B G R A
+                __ASM_EMIT("bit         v1.16b, v9.16b, v17.16b")
+                __ASM_EMIT("bit         v2.16b, v10.16b, v16.16b")
+                __ASM_EMIT("bit         v3.16b, v11.16b, v17.16b")
+                __ASM_EMIT("stp         q0, q1, [%[dst], 0x00]")
+                __ASM_EMIT("stp         q2, q3, [%[dst], 0x20]")
+                __ASM_EMIT("sub         %[count], %[count], #16")
+                __ASM_EMIT("add         %[src], %[src], 0x40")
+                __ASM_EMIT("add         %[dst], %[dst], 0x40")
+
+                // 8x blocks
+                __ASM_EMIT("4:")
+                __ASM_EMIT("adds        %[count], %[count], #8")
+                __ASM_EMIT("b.lt        6f")
+                __ASM_EMIT("ldp         q0, q1, [%[src], 0x00]")        // v0 = R G B A
+                __ASM_EMIT("rev32       v8.8h, v0.8h")                  // v8 = B A R G
+                __ASM_EMIT("rev32       v9.8h, v1.8h")
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")       // v0 = B G R A
+                __ASM_EMIT("bit         v1.16b, v9.16b, v17.16b")
+                __ASM_EMIT("stp         q0, q1, [%[dst], 0x00]")
+                __ASM_EMIT("sub         %[count], %[count], #8")
+                __ASM_EMIT("add         %[src], %[src], 0x20")
+                __ASM_EMIT("add         %[dst], %[dst], 0x20")
+
+                // 4x blocks
+                __ASM_EMIT("6:")
+                __ASM_EMIT("adds        %[count], %[count], #4")
+                __ASM_EMIT("b.lt        8f")
+                __ASM_EMIT("ldr         q0, [%[src], 0x00]")            // v0 = R G B A
+                __ASM_EMIT("rev32       v8.8h, v0.8h")                  // v8 = B A R G
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")       // v0 = B G R A
+                __ASM_EMIT("str         q0, [%[dst], 0x00]")
+                __ASM_EMIT("sub         %[count], %[count], #4")
+                __ASM_EMIT("add         %[src], %[src], 0x10")
+                __ASM_EMIT("add         %[dst], %[dst], 0x10")
+
+                // 1x blocks
+                __ASM_EMIT("8:")
+                __ASM_EMIT("adds        %[count], %[count], #3")
+                __ASM_EMIT("b.lt        10f")
+                __ASM_EMIT("9:")
+                __ASM_EMIT("ld1r        {v0.4s}, [%[src]]")             // v0 = R G B A
+                __ASM_EMIT("rev32       v8.8h, v0.8h")                  // v8 = B A R G
+                __ASM_EMIT("bit         v0.16b, v8.16b, v16.16b")       // v0 = B G R A
+                __ASM_EMIT("st1         {v0.s}[0], [%[dst]]")
+                __ASM_EMIT("add         %[src], %[src], 0x04")
+                __ASM_EMIT("add         %[dst], %[dst], 0x04")
+                __ASM_EMIT("subs        %[count], %[count], #1")
+                __ASM_EMIT("b.ge        9b")
+
+                // End
+                __ASM_EMIT("10:")
+                : [src] "+r" (src), [dst] "+r" (dst),
+                  [count] "+r" (count)
+                : [XC] "r" (&rgba32_to_bgra32_const[0])
+                : "cc", "memory",
+                  "v0", "v1", "v2", "v3",
+                  "v4", "v5", "v6", "v7",
+                  "v8", "v9", "v10", "v11",
+                  "v12", "v13", "v14", "v15",
+                  "v16", "v17"
+            );
+        }
+
         IF_ARCH_AARCH64(
             static const uint32_t abgr32_to_bgrff32_const[] __lsp_aligned32 =
             {
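For orientation (not part of the commit): the added routine swaps bytes 0 and 2 of every 32-bit pixel while leaving bytes 1 and 3 untouched, which is exactly what the rev32/bit pair does with the 0x00ff00ff byte mask held in q16/q17, processed 32, 16, 8, 4 or 1 pixels at a time. A minimal scalar sketch of the same transform (the name rgba32_to_bgra32_ref is hypothetical and only for illustration):

    #include <cstddef>
    #include <cstdint>

    static void rgba32_to_bgra32_ref(void *dst, const void *src, size_t count)
    {
        const uint8_t *s = static_cast<const uint8_t *>(src);
        uint8_t *d       = static_cast<uint8_t *>(dst);

        for (size_t i = 0; i < count; ++i, s += 4, d += 4)
        {
            // Read the whole pixel first so the loop also works in-place (dst == src)
            const uint8_t r = s[0], g = s[1], b = s[2], a = s[3];
            d[0] = b;   // bytes 0 and 2 (R and B) are exchanged
            d[1] = g;   // byte 1 (G) is preserved
            d[2] = r;
            d[3] = a;   // byte 3 (A) is preserved
        }
    }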