Skip to content

Commit 807a91b

Browse files
authored
Optimize Renderer::fillVerticesAndIndices() (#2065)
* Optimize `Renderer::fillVerticesAndIndices()` * Fix clobbered registers not being marked in inline assembly
1 parent 377f340 commit 807a91b

File tree

11 files changed

+559
-116
lines changed

11 files changed

+559
-116
lines changed

core/math/MathUtil.cpp

Lines changed: 46 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/**
22
Copyright 2013 BlackBerry Inc.
33
Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
4+
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
45
56
Licensed under the Apache License, Version 2.0 (the "License");
67
you may not use this file except in compliance with the License.
@@ -20,6 +21,7 @@ This file was modified to fit the cocos2d-x project
2021
*/
2122

2223
#include "math/MathUtil.h"
24+
#include "math/Mat4.h"
2325
#include "base/Macros.h"
2426

2527
#if (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
@@ -35,28 +37,29 @@ This file was modified to fit the cocos2d-x project
3537

3638
#if (AX_TARGET_PLATFORM == AX_PLATFORM_IOS)
3739
# if defined(__arm64__)
38-
# define USE_NEON64
39-
# define INCLUDE_NEON64
40+
# define USE_NEON64 1
41+
# define INCLUDE_NEON64 1
4042
# elif defined(__ARM_NEON__)
41-
# define USE_NEON32
42-
# define INCLUDE_NEON32
43-
# else
43+
# define USE_NEON32 1
44+
# define INCLUDE_NEON32 1
45+
# endif
46+
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_OSX)
47+
# if defined(__arm64__) || defined(__aarch64__)
48+
# define USE_NEON64 1
49+
# define INCLUDE_NEON64 1
4450
# endif
4551
#elif (AX_TARGET_PLATFORM == AX_PLATFORM_ANDROID)
4652
# if defined(__arm64__) || defined(__aarch64__)
47-
# define USE_NEON64
48-
# define INCLUDE_NEON64
53+
# define USE_NEON64 1
54+
# define INCLUDE_NEON64 1
4955
# elif defined(__ARM_NEON__)
50-
# define INCLUDE_NEON32
51-
# else
56+
# define INCLUDE_NEON32 1
5257
# endif
53-
#else
54-
5558
#endif
5659

5760
#if defined(AX_USE_SSE)
58-
# define USE_SSE
59-
# define INCLUDE_SSE
61+
# define USE_SSE 1
62+
# define INCLUDE_SSE 1
6063
#endif
6164

6265
#ifdef INCLUDE_NEON32
@@ -298,4 +301,34 @@ void MathUtil::crossVec3(const float* v1, const float* v2, float* dst)
298301
#endif
299302
}
300303

304+
/**
 * Dispatches a batch vertex transform to the implementation selected for this build.
 *
 * Selection order:
 *  - USE_NEON32 defined: MathUtilNeon::transformVertices
 *  - USE_NEON64 defined: MathUtilNeon64::transformVertices
 *  - only INCLUDE_NEON32 defined: MathUtilNeon::transformVertices when
 *    isNeon32Enabled() returns true, otherwise MathUtilC::transformVertices
 *  - none of the above: MathUtilC::transformVertices
 *
 * @param dst       Destination array, receives `count` vertices.
 * @param src       Source array of `count` vertices.
 * @param count     Number of vertices to process.
 * @param transform Matrix forwarded unchanged to the selected implementation.
 */
void MathUtil::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
    // Layout assumptions made by the optimized implementations; fail the build if they break.
    static_assert(sizeof(V3F_C4B_T2F) == 24);
    static_assert(offsetof(V3F_C4B_T2F, vertices) == 0);
    static_assert(offsetof(V3F_C4B_T2F, colors) == 12);
    static_assert(offsetof(V3F_C4B_T2F, texCoords) == 16);

#if defined(USE_NEON32)
    MathUtilNeon::transformVertices(dst, src, count, transform);
#elif defined(USE_NEON64)
    MathUtilNeon64::transformVertices(dst, src, count, transform);
#else
#    if defined(INCLUDE_NEON32)
    // NEON32 code is compiled in but not forced on: decide at run time.
    if (isNeon32Enabled())
    {
        MathUtilNeon::transformVertices(dst, src, count, transform);
        return;
    }
#    endif
    // Portable fallback.
    MathUtilC::transformVertices(dst, src, count, transform);
#endif
}
325+
326+
/**
 * Dispatches a batch index transform: MathUtilNeon64::transformIndices when
 * USE_NEON64 is defined, MathUtilC::transformIndices otherwise.
 *
 * @param dst    Destination array, receives `count` indices.
 * @param src    Source array of `count` indices.
 * @param count  Number of indices to process.
 * @param offset Value forwarded unchanged to the selected implementation.
 */
void MathUtil::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
#ifdef USE_NEON64
    MathUtilNeon64::transformIndices(dst, src, count, offset);
#else
    MathUtilC::transformIndices(dst, src, count, offset);
#endif
}
333+
301334
NS_AX_MATH_END

core/math/MathUtil.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Copyright 2013 BlackBerry Inc.
33
Copyright (c) 2014-2017 Chukong Technologies
44
Copyright (c) 2017-2018 Xiamen Yaji Software Co., Ltd.
5+
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
56
67
Licensed under the Apache License, Version 2.0 (the "License");
78
you may not use this file except in compliance with the License.
@@ -29,13 +30,20 @@
2930

3031
#include "math/MathBase.h"
3132

33+
34+
NS_AX_BEGIN
35+
struct V3F_C4B_T2F;
36+
NS_AX_END
37+
3238
/**
3339
* @addtogroup base
3440
* @{
3541
*/
3642

3743
NS_AX_MATH_BEGIN
3844

45+
class Mat4;
46+
3947
/**
4048
* Defines a math utility class.
4149
*
@@ -45,6 +53,7 @@ class AX_DLL MathUtil
4553
{
4654
friend class Mat4;
4755
friend class Vec3;
56+
friend class Renderer;
4857

4958
public:
5059
/**
@@ -130,6 +139,9 @@ class AX_DLL MathUtil
130139
static void transformVec4(const float* m, const float* v, float* dst);
131140

132141
static void crossVec3(const float* v1, const float* v2, float* dst);
142+
143+
static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
144+
static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
133145
};
134146

135147
NS_AX_MATH_END

core/math/MathUtil.inl

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/**
22
Copyright 2013 BlackBerry Inc.
3+
Copyright (c) 2019-present Axmol Engine contributors (see AUTHORS.md).
34
45
Licensed under the Apache License, Version 2.0 (the "License");
56
you may not use this file except in compliance with the License.
@@ -24,24 +25,20 @@ class MathUtilC
2425
{
2526
public:
2627
inline static void addMatrix(const float* m, float scalar, float* dst);
27-
2828
inline static void addMatrix(const float* m1, const float* m2, float* dst);
29-
3029
inline static void subtractMatrix(const float* m1, const float* m2, float* dst);
31-
3230
inline static void multiplyMatrix(const float* m, float scalar, float* dst);
33-
3431
inline static void multiplyMatrix(const float* m1, const float* m2, float* dst);
35-
32+
3633
inline static void negateMatrix(const float* m, float* dst);
37-
3834
inline static void transposeMatrix(const float* m, float* dst);
39-
35+
4036
inline static void transformVec4(const float* m, float x, float y, float z, float w, float* dst);
41-
4237
inline static void transformVec4(const float* m, const float* v, float* dst);
43-
4438
inline static void crossVec3(const float* v1, const float* v2, float* dst);
39+
40+
inline static void transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform);
41+
inline static void transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset);
4542
};
4643

4744
inline void MathUtilC::addMatrix(const float* m, float scalar, float* dst)
@@ -128,27 +125,27 @@ inline void MathUtilC::multiplyMatrix(const float* m1, const float* m2, float* d
128125
{
129126
// Support the case where m1 or m2 is the same array as dst.
130127
float product[16];
131-
128+
132129
product[0] = m1[0] * m2[0] + m1[4] * m2[1] + m1[8] * m2[2] + m1[12] * m2[3];
133130
product[1] = m1[1] * m2[0] + m1[5] * m2[1] + m1[9] * m2[2] + m1[13] * m2[3];
134131
product[2] = m1[2] * m2[0] + m1[6] * m2[1] + m1[10] * m2[2] + m1[14] * m2[3];
135132
product[3] = m1[3] * m2[0] + m1[7] * m2[1] + m1[11] * m2[2] + m1[15] * m2[3];
136-
133+
137134
product[4] = m1[0] * m2[4] + m1[4] * m2[5] + m1[8] * m2[6] + m1[12] * m2[7];
138135
product[5] = m1[1] * m2[4] + m1[5] * m2[5] + m1[9] * m2[6] + m1[13] * m2[7];
139136
product[6] = m1[2] * m2[4] + m1[6] * m2[5] + m1[10] * m2[6] + m1[14] * m2[7];
140137
product[7] = m1[3] * m2[4] + m1[7] * m2[5] + m1[11] * m2[6] + m1[15] * m2[7];
141-
138+
142139
product[8] = m1[0] * m2[8] + m1[4] * m2[9] + m1[8] * m2[10] + m1[12] * m2[11];
143140
product[9] = m1[1] * m2[8] + m1[5] * m2[9] + m1[9] * m2[10] + m1[13] * m2[11];
144141
product[10] = m1[2] * m2[8] + m1[6] * m2[9] + m1[10] * m2[10] + m1[14] * m2[11];
145142
product[11] = m1[3] * m2[8] + m1[7] * m2[9] + m1[11] * m2[10] + m1[15] * m2[11];
146-
143+
147144
product[12] = m1[0] * m2[12] + m1[4] * m2[13] + m1[8] * m2[14] + m1[12] * m2[15];
148145
product[13] = m1[1] * m2[12] + m1[5] * m2[13] + m1[9] * m2[14] + m1[13] * m2[15];
149146
product[14] = m1[2] * m2[12] + m1[6] * m2[13] + m1[10] * m2[14] + m1[14] * m2[15];
150147
product[15] = m1[3] * m2[12] + m1[7] * m2[13] + m1[11] * m2[14] + m1[15] * m2[15];
151-
148+
152149
memcpy(dst, product, MATRIX_SIZE);
153150
}
154151

@@ -197,7 +194,7 @@ inline void MathUtilC::transformVec4(const float* m, const float* v, float* dst)
197194
float y = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + v[3] * m[13];
198195
float z = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + v[3] * m[14];
199196
float w = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + v[3] * m[15];
200-
197+
201198
dst[0] = x;
202199
dst[1] = y;
203200
dst[2] = z;
@@ -209,10 +206,39 @@ inline void MathUtilC::crossVec3(const float* v1, const float* v2, float* dst)
209206
float x = (v1[1] * v2[2]) - (v1[2] * v2[1]);
210207
float y = (v1[2] * v2[0]) - (v1[0] * v2[2]);
211208
float z = (v1[0] * v2[1]) - (v1[1] * v2[0]);
212-
209+
213210
dst[0] = x;
214211
dst[1] = y;
215212
dst[2] = z;
216213
}
217214

215+
// Portable implementation: for each of the `count` vertices, writes the source position
// multiplied by the upper 3 rows of `transform` (with the translation column m[12..14] added)
// into dst, and copies colors and texCoords across unchanged.
// memcpy is used for the attribute copy, so dst and src elements must not overlap.
inline void MathUtilC::transformVertices(V3F_C4B_T2F* dst, const V3F_C4B_T2F* src, size_t count, const Mat4& transform)
{
    // Local copy of the matrix: helps the compiler prove that stores through dst cannot modify it.
    const Mat4 matrixCopy = transform;
    const float* mat      = matrixCopy.m;

    for (size_t i = 0; i < count; ++i)
    {
        const V3F_C4B_T2F& in = src[i];
        V3F_C4B_T2F& out      = dst[i];

        // Snapshot the source position by value before any destination component is written.
        const auto p = in.vertices;

        out.vertices.x = p.x * mat[0] + p.y * mat[4] + p.z * mat[8] + mat[12];
        out.vertices.y = p.x * mat[1] + p.y * mat[5] + p.z * mat[9] + mat[13];
        out.vertices.z = p.x * mat[2] + p.y * mat[6] + p.z * mat[10] + mat[14];

        // One copy spanning both trailing members, starting at colors; this assumes texCoords
        // directly follows colors with no padding in between.
        memcpy(&out.colors, &in.colors, sizeof(out.colors) + sizeof(out.texCoords));
    }
}
232+
233+
// Portable implementation: writes src[i] + offset to dst[i] for each of the `count` indices.
// The sum is computed in int and then narrowed back to uint16_t, i.e. it wraps modulo 65536.
inline void MathUtilC::transformIndices(uint16_t* dst, const uint16_t* src, size_t count, uint16_t offset)
{
    for (size_t i = 0; i < count; ++i)
    {
        dst[i] = static_cast<uint16_t>(src[i] + offset);
    }
}
243+
218244
NS_AX_MATH_END

0 commit comments

Comments
 (0)