Skip to content

Commit a2a9a3b

Browse files
authored
Simplify ForDeltaUtil's prefix sum. (#14979)
I remember benchmarking prefix sums quite extensively, and unrolled loops performed significantly better than their rolled counterpart (shown below), in both micro and macro benchmarks:

```java
private static void prefixSum(int[] arr, int len) {
  for (int i = 1; i < len; ++i) {
    arr[i] += arr[i-1];
  }
}
```

However, I recently discovered that rewriting the loop this way performs much better, and almost on par with the unrolled variant:

```java
private static void prefixSum(int[] arr, int len) {
  int sum = 0;
  for (int i = 0; i < len; ++i) {
    sum += arr[i];
    arr[i] = sum;
  }
}
```
1 parent 8a7b799 commit a2a9a3b

File tree

3 files changed

+20
-214
lines changed

3 files changed

+20
-214
lines changed
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
{
2-
"lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForDeltaUtil.java": "dc896d5df4b2a091918bfa14b30aad417feaffac",
3-
"lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForDeltaUtil.py": "d622cc9f9a13987a07c2472a1c5b0111a3a7bc62"
2+
"lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForDeltaUtil.java": "7e14917412c0f4e29453c5b604bf4dddf08d40d1",
3+
"lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForDeltaUtil.py": "82845dcc4b25af1b944bffcaf44d1fdf53e126a0"
44
}

lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForDeltaUtil.java

Lines changed: 9 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ private static void prefixSum8(int[] arr, int base) {
7575
// When the number of bits per value is 4 or less, we can sum up all values in a block without
7676
// risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4
7777
// values at once.
78-
innerPrefixSum8(arr);
78+
prefixSum(arr, ONE_BLOCK_SIZE_FOURTH, 0);
7979
expand8(arr);
8080
final int l0 = base;
8181
final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
@@ -94,7 +94,7 @@ private static void prefixSum16(int[] arr, int base) {
9494
// When the number of bits per value is 11 or less, we can sum up all values in a block without
9595
// risking overflowing an 16-bits integer. This allows computing the prefix sum by summing up 2
9696
// values at once.
97-
innerPrefixSum16(arr);
97+
prefixSum(arr, HALF_BLOCK_SIZE, 0);
9898
expand16(arr);
9999
final int l0 = base;
100100
final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
@@ -105,112 +105,15 @@ private static void prefixSum16(int[] arr, int base) {
105105
}
106106

107107
private static void prefixSum32(int[] arr, int base) {
108-
arr[0] += base;
109-
for (int i = 1; i < BLOCK_SIZE; ++i) {
110-
arr[i] += arr[i - 1];
111-
}
112-
}
113-
114-
// For some reason unrolling seems to help
115-
private static void innerPrefixSum8(int[] arr) {
116-
arr[1] += arr[0];
117-
arr[2] += arr[1];
118-
arr[3] += arr[2];
119-
arr[4] += arr[3];
120-
arr[5] += arr[4];
121-
arr[6] += arr[5];
122-
arr[7] += arr[6];
123-
arr[8] += arr[7];
124-
arr[9] += arr[8];
125-
arr[10] += arr[9];
126-
arr[11] += arr[10];
127-
arr[12] += arr[11];
128-
arr[13] += arr[12];
129-
arr[14] += arr[13];
130-
arr[15] += arr[14];
131-
arr[16] += arr[15];
132-
arr[17] += arr[16];
133-
arr[18] += arr[17];
134-
arr[19] += arr[18];
135-
arr[20] += arr[19];
136-
arr[21] += arr[20];
137-
arr[22] += arr[21];
138-
arr[23] += arr[22];
139-
arr[24] += arr[23];
140-
arr[25] += arr[24];
141-
arr[26] += arr[25];
142-
arr[27] += arr[26];
143-
arr[28] += arr[27];
144-
arr[29] += arr[28];
145-
arr[30] += arr[29];
146-
arr[31] += arr[30];
108+
prefixSum(arr, BLOCK_SIZE, base);
147109
}
148110

149-
// For some reason unrolling seems to help
150-
private static void innerPrefixSum16(int[] arr) {
151-
arr[1] += arr[0];
152-
arr[2] += arr[1];
153-
arr[3] += arr[2];
154-
arr[4] += arr[3];
155-
arr[5] += arr[4];
156-
arr[6] += arr[5];
157-
arr[7] += arr[6];
158-
arr[8] += arr[7];
159-
arr[9] += arr[8];
160-
arr[10] += arr[9];
161-
arr[11] += arr[10];
162-
arr[12] += arr[11];
163-
arr[13] += arr[12];
164-
arr[14] += arr[13];
165-
arr[15] += arr[14];
166-
arr[16] += arr[15];
167-
arr[17] += arr[16];
168-
arr[18] += arr[17];
169-
arr[19] += arr[18];
170-
arr[20] += arr[19];
171-
arr[21] += arr[20];
172-
arr[22] += arr[21];
173-
arr[23] += arr[22];
174-
arr[24] += arr[23];
175-
arr[25] += arr[24];
176-
arr[26] += arr[25];
177-
arr[27] += arr[26];
178-
arr[28] += arr[27];
179-
arr[29] += arr[28];
180-
arr[30] += arr[29];
181-
arr[31] += arr[30];
182-
arr[32] += arr[31];
183-
arr[33] += arr[32];
184-
arr[34] += arr[33];
185-
arr[35] += arr[34];
186-
arr[36] += arr[35];
187-
arr[37] += arr[36];
188-
arr[38] += arr[37];
189-
arr[39] += arr[38];
190-
arr[40] += arr[39];
191-
arr[41] += arr[40];
192-
arr[42] += arr[41];
193-
arr[43] += arr[42];
194-
arr[44] += arr[43];
195-
arr[45] += arr[44];
196-
arr[46] += arr[45];
197-
arr[47] += arr[46];
198-
arr[48] += arr[47];
199-
arr[49] += arr[48];
200-
arr[50] += arr[49];
201-
arr[51] += arr[50];
202-
arr[52] += arr[51];
203-
arr[53] += arr[52];
204-
arr[54] += arr[53];
205-
arr[55] += arr[54];
206-
arr[56] += arr[55];
207-
arr[57] += arr[56];
208-
arr[58] += arr[57];
209-
arr[59] += arr[58];
210-
arr[60] += arr[59];
211-
arr[61] += arr[60];
212-
arr[62] += arr[61];
213-
arr[63] += arr[62];
111+
private static void prefixSum(int[] arr, int len, int base) {
112+
int sum = base;
113+
for (int i = 0; i < len; ++i) {
114+
sum += arr[i];
115+
arr[i] = sum;
116+
}
214117
}
215118

216119
private final int[] tmp = new int[BLOCK_SIZE];

lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForDeltaUtil.py

Lines changed: 9 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@
101101
// When the number of bits per value is 4 or less, we can sum up all values in a block without
102102
// risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4
103103
// values at once.
104-
innerPrefixSum8(arr);
104+
prefixSum(arr, ONE_BLOCK_SIZE_FOURTH, 0);
105105
expand8(arr);
106106
final int l0 = base;
107107
final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1];
@@ -120,7 +120,7 @@
120120
// When the number of bits per value is 11 or less, we can sum up all values in a block without
121121
// risking overflowing an 16-bits integer. This allows computing the prefix sum by summing up 2
122122
// values at once.
123-
innerPrefixSum16(arr);
123+
prefixSum(arr, HALF_BLOCK_SIZE, 0);
124124
expand16(arr);
125125
final int l0 = base;
126126
final int l1 = base + arr[HALF_BLOCK_SIZE - 1];
@@ -131,112 +131,15 @@
131131
}
132132
133133
private static void prefixSum32(int[] arr, int base) {
134-
arr[0] += base;
135-
for (int i = 1; i < BLOCK_SIZE; ++i) {
136-
arr[i] += arr[i-1];
137-
}
138-
}
139-
140-
// For some reason unrolling seems to help
141-
private static void innerPrefixSum8(int[] arr) {
142-
arr[1] += arr[0];
143-
arr[2] += arr[1];
144-
arr[3] += arr[2];
145-
arr[4] += arr[3];
146-
arr[5] += arr[4];
147-
arr[6] += arr[5];
148-
arr[7] += arr[6];
149-
arr[8] += arr[7];
150-
arr[9] += arr[8];
151-
arr[10] += arr[9];
152-
arr[11] += arr[10];
153-
arr[12] += arr[11];
154-
arr[13] += arr[12];
155-
arr[14] += arr[13];
156-
arr[15] += arr[14];
157-
arr[16] += arr[15];
158-
arr[17] += arr[16];
159-
arr[18] += arr[17];
160-
arr[19] += arr[18];
161-
arr[20] += arr[19];
162-
arr[21] += arr[20];
163-
arr[22] += arr[21];
164-
arr[23] += arr[22];
165-
arr[24] += arr[23];
166-
arr[25] += arr[24];
167-
arr[26] += arr[25];
168-
arr[27] += arr[26];
169-
arr[28] += arr[27];
170-
arr[29] += arr[28];
171-
arr[30] += arr[29];
172-
arr[31] += arr[30];
134+
prefixSum(arr, BLOCK_SIZE, base);
173135
}
174136
175-
// For some reason unrolling seems to help
176-
private static void innerPrefixSum16(int[] arr) {
177-
arr[1] += arr[0];
178-
arr[2] += arr[1];
179-
arr[3] += arr[2];
180-
arr[4] += arr[3];
181-
arr[5] += arr[4];
182-
arr[6] += arr[5];
183-
arr[7] += arr[6];
184-
arr[8] += arr[7];
185-
arr[9] += arr[8];
186-
arr[10] += arr[9];
187-
arr[11] += arr[10];
188-
arr[12] += arr[11];
189-
arr[13] += arr[12];
190-
arr[14] += arr[13];
191-
arr[15] += arr[14];
192-
arr[16] += arr[15];
193-
arr[17] += arr[16];
194-
arr[18] += arr[17];
195-
arr[19] += arr[18];
196-
arr[20] += arr[19];
197-
arr[21] += arr[20];
198-
arr[22] += arr[21];
199-
arr[23] += arr[22];
200-
arr[24] += arr[23];
201-
arr[25] += arr[24];
202-
arr[26] += arr[25];
203-
arr[27] += arr[26];
204-
arr[28] += arr[27];
205-
arr[29] += arr[28];
206-
arr[30] += arr[29];
207-
arr[31] += arr[30];
208-
arr[32] += arr[31];
209-
arr[33] += arr[32];
210-
arr[34] += arr[33];
211-
arr[35] += arr[34];
212-
arr[36] += arr[35];
213-
arr[37] += arr[36];
214-
arr[38] += arr[37];
215-
arr[39] += arr[38];
216-
arr[40] += arr[39];
217-
arr[41] += arr[40];
218-
arr[42] += arr[41];
219-
arr[43] += arr[42];
220-
arr[44] += arr[43];
221-
arr[45] += arr[44];
222-
arr[46] += arr[45];
223-
arr[47] += arr[46];
224-
arr[48] += arr[47];
225-
arr[49] += arr[48];
226-
arr[50] += arr[49];
227-
arr[51] += arr[50];
228-
arr[52] += arr[51];
229-
arr[53] += arr[52];
230-
arr[54] += arr[53];
231-
arr[55] += arr[54];
232-
arr[56] += arr[55];
233-
arr[57] += arr[56];
234-
arr[58] += arr[57];
235-
arr[59] += arr[58];
236-
arr[60] += arr[59];
237-
arr[61] += arr[60];
238-
arr[62] += arr[61];
239-
arr[63] += arr[62];
137+
private static void prefixSum(int[] arr, int len, int base) {
138+
int sum = base;
139+
for (int i = 0; i < len; ++i) {
140+
sum += arr[i];
141+
arr[i] = sum;
142+
}
240143
}
241144
242145
private final int[] tmp = new int[BLOCK_SIZE];

0 commit comments

Comments
 (0)