Skip to content

Commit 86aa2dd

Browse files
committed
deduplicate "eclipse" #1743
CharDeduplication was not designed to deduplicate tokens of length 7 or more, which could lead to high memory consumption. With this change, tokens of all sizes can be deduplicated. #1743 A benchmark implemented in CharDeduplicationTest.main(String[]) shows that the new deduplication runs at a similar speed (.21s instead of .16s) but deduplicates many more tokens (99% instead of 36%).
1 parent dcafabb commit 86aa2dd

File tree

2 files changed

+87
-198
lines changed

2 files changed

+87
-198
lines changed

org.eclipse.jdt.core.compiler.batch/src/org/eclipse/jdt/internal/compiler/util/CharDeduplication.java

Lines changed: 53 additions & 186 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,17 @@ public class CharDeduplication {
2323

2424
// ----- immutable static part (thread safe): ----
2525

26+
private final static char[] CHAR_ARRAY0 = new char[0];
2627
static final char[] ASCII_CHARS[] = new char[128][];
2728
static {
2829
for (int i = 0; i < ASCII_CHARS.length; i++) {
2930
ASCII_CHARS[i] = new char[] { (char) i };
3031
}
3132
}
32-
public static final int TABLE_SIZE = 30; // XXX thats not a prime -> bad for hashing, nor a power of 2 -> expensive
33-
// modulo computation
34-
public static final int INTERNAL_TABLE_SIZE = 6; // 30*6 =180 entries
35-
36-
public static final int OPTIMIZED_LENGTH = 6;
37-
38-
private final static char[] CHAR_ARRAY0 = new char[0];
33+
/** Size of the hash table. Due to hashing it does not affect performance, but it does affect memory. */
34+
public static final int TABLE_SIZE = 8192; // a power of 2 to fast compute modulo
35+
/** Number of entries to search linearly. A larger value affects performance but decreases collisions; it does not affect memory. */
36+
public static final int SEARCH_SIZE = 8; // a power of 2, has to be smaller than TABLE_SIZE
3937

4038
/** avoid OOME by additional CharDeduplication memory **/
4139
static final class CacheReference<T> {
@@ -59,56 +57,32 @@ T get() {
5957

6058
private final static ThreadLocal<CacheReference<CharDeduplication>> mutableCache = ThreadLocal.withInitial(()->new CacheReference<>(CharDeduplication::new));
6159

62-
private static final char[] optimizedCurrentTokenSource1(char[] source, int startPosition) {
63-
// optimization at no speed cost of 99.5 % of the singleCharIdentifier
64-
char charOne = source[startPosition];
65-
if (charOne < ASCII_CHARS.length) {
66-
return ASCII_CHARS[charOne];
67-
}
68-
return new char[] { charOne };
69-
}
70-
7160
/** @return an instance that is *not* thread safe. To be used in a single thread only. **/
7261
public static CharDeduplication getThreadLocalInstance() {
7362
return mutableCache.get().get();
7463
}
7564

7665
// ----- mutable non-static part (not thread safe!): ----
7766

78-
/** single threaded only **/
79-
public final char[][][][] charArray_length = new char[OPTIMIZED_LENGTH - 1][TABLE_SIZE][INTERNAL_TABLE_SIZE][];
80-
81-
int newEntry2 = 0;
82-
int newEntry3 = 0;
83-
int newEntry4 = 0;
84-
int newEntry5 = 0;
85-
int newEntry6 = 0;
67+
/** single threaded only, hashtable with restricted linear probing **/
68+
private final char[][] hashTable = new char[TABLE_SIZE][];
69+
private final int circularBufferPointer[] = new int[TABLE_SIZE];
8670

8771
private CharDeduplication() {
88-
init();
89-
}
90-
91-
private void init() {
92-
for (int i = 0; i < OPTIMIZED_LENGTH - 1; i++) {
93-
final char[] initCharArray = new char[i + 2];
94-
for (int j = 0; j < TABLE_SIZE; j++) {
95-
for (int k = 0; k < INTERNAL_TABLE_SIZE; k++) {
96-
this.charArray_length[i][j][k] = initCharArray;
97-
}
98-
}
99-
}
72+
// private
10073
}
10174

10275
/** public for test purpose only **/
10376
@Deprecated
10477
public void reset() {
105-
init();
78+
Arrays.fill(this.hashTable, null);
79+
Arrays.fill(this.circularBufferPointer, 0);
10680
}
10781

10882
/**
10983
* like Arrays.copyOfRange(source, from, to) but returns a cached instance of the former result if
11084
* available
111-
*
85+
*
11286
* @param from
11387
* start index (inclusive)
11488
* @param to
@@ -118,167 +92,60 @@ public void reset() {
11892
**/
11993
public char[] sharedCopyOfRange(char[] source, int from, int to) {
12094
int length = to - from;
121-
switch (length) { // see OptimizedLength
95+
switch (length) {
12296
case 1:
123-
return optimizedCurrentTokenSource1(source, from);
124-
case 2:
125-
return optimizedCurrentTokenSource2(source, from);
126-
case 3:
127-
return optimizedCurrentTokenSource3(source, from);
128-
case 4:
129-
return optimizedCurrentTokenSource4(source, from);
130-
case 5:
131-
return optimizedCurrentTokenSource5(source, from);
132-
case 6:
133-
return optimizedCurrentTokenSource6(source, from);
97+
char charOne = source[from];
98+
if (charOne < ASCII_CHARS.length) {
99+
return ASCII_CHARS[charOne];
100+
}
101+
break;
134102
case 0:
135103
return CHAR_ARRAY0;
136104
}
137-
return Arrays.copyOfRange(source, from, to);
138-
}
139-
140-
private final char[] optimizedCurrentTokenSource2(char[] source, int startPosition) {
141-
142-
char[] src = source;
143-
int start = startPosition;
144-
char c0, c1;
145-
int hash = (((c0 = src[start]) << 6) + (c1 = src[start + 1])) % TABLE_SIZE;
146-
char[][] table = this.charArray_length[0][hash];
147-
int i = this.newEntry2;
148-
while (++i < INTERNAL_TABLE_SIZE) {
149-
char[] charArray = table[i];
150-
if ((c0 == charArray[0]) && (c1 == charArray[1]))
151-
return charArray;
152-
}
153-
// ---------other side---------
154-
i = -1;
155-
int max = this.newEntry2;
156-
while (++i <= max) {
157-
char[] charArray = table[i];
158-
if ((c0 == charArray[0]) && (c1 == charArray[1]))
159-
return charArray;
160-
}
161-
// --------add the entry-------
162-
if (++max >= INTERNAL_TABLE_SIZE)
163-
max = 0;
164-
char[] r;
165-
System.arraycopy(src, start, r = new char[2], 0, 2);
166-
return table[this.newEntry2 = max] = r;
167-
}
168-
169-
private final char[] optimizedCurrentTokenSource3(char[] source, int startPosition) {
170-
char[] src = source;
171-
int start = startPosition;
172-
char c0, c1 = src[start + 1], c2;
173-
int hash = (((c0 = src[start]) << 6) + (c2 = src[start + 2])) % TABLE_SIZE;
174-
char[][] table = this.charArray_length[1][hash];
175-
int i = this.newEntry3;
176-
while (++i < INTERNAL_TABLE_SIZE) {
177-
char[] charArray = table[i];
178-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]))
179-
return charArray;
180-
}
181-
// ---------other side---------
182-
i = -1;
183-
int max = this.newEntry3;
184-
while (++i <= max) {
185-
char[] charArray = table[i];
186-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]))
187-
return charArray;
188-
}
189-
// --------add the entry-------
190-
if (++max >= INTERNAL_TABLE_SIZE)
191-
max = 0;
192-
char[] r;
193-
System.arraycopy(src, start, r = new char[3], 0, 3);
194-
return table[this.newEntry3 = max] = r;
195-
}
196-
197-
private final char[] optimizedCurrentTokenSource4(char[] source, int startPosition) {
198-
char[] src = source;
199-
int start = startPosition;
200-
char c0, c1 = src[start + 1], c2, c3 = src[start + 3];
201-
int hash = (((c0 = src[start]) << 6) + (c2 = src[start + 2])) % TABLE_SIZE;
202-
char[][] table = this.charArray_length[2][hash];
203-
int i = this.newEntry4;
204-
while (++i < INTERNAL_TABLE_SIZE) {
205-
char[] charArray = table[i];
206-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3]))
105+
int hash = hashCode(source, from, to);
106+
int circularBufferStart = hash & (TABLE_SIZE - 1);
107+
int positionToReplace = -1;
108+
// linear probing within circular buffer:
109+
for (int i = 0; i < SEARCH_SIZE; i++) {
110+
int position = (circularBufferStart + i) & (TABLE_SIZE - 1);
111+
char[] charArray = this.hashTable[position];
112+
if (charArray == null) {
113+
// this case only happens when the table is filling up,
114+
// but helps to get good deduplication fast
115+
positionToReplace = position;
116+
} else if (equals(source, from, to, charArray)) {
117+
// Successfully deduplicated:
207118
return charArray;
119+
}
208120
}
209-
// ---------other side---------
210-
i = -1;
211-
int max = this.newEntry4;
212-
while (++i <= max) {
213-
char[] charArray = table[i];
214-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3]))
215-
return charArray;
121+
char[] r = Arrays.copyOfRange(source, from, to);
122+
// not found -> overwrite existing entries in a circular buffer:
123+
if (positionToReplace == -1) {
124+
// no empty entry found - normal case:
125+
int j = this.circularBufferPointer[circularBufferStart]++;
126+
positionToReplace = (circularBufferStart + (j & (SEARCH_SIZE-1))) & (TABLE_SIZE - 1);
216127
}
217-
// --------add the entry-------
218-
if (++max >= INTERNAL_TABLE_SIZE)
219-
max = 0;
220-
char[] r;
221-
System.arraycopy(src, start, r = new char[4], 0, 4);
222-
return table[this.newEntry4 = max] = r;
128+
this.hashTable[positionToReplace] = r;
129+
return r;
223130
}
224131

225-
private final char[] optimizedCurrentTokenSource5(char[] source, int startPosition) {
226-
char[] src = source;
227-
int start = startPosition;
228-
char c0, c1 = src[start + 1], c2, c3 = src[start + 3], c4;
229-
int hash = (((c0 = src[start]) << 12) + ((c2 = src[start + 2]) << 6) + (c4 = src[start + 4])) % TABLE_SIZE;
230-
char[][] table = this.charArray_length[3][hash];
231-
int i = this.newEntry5;
232-
while (++i < INTERNAL_TABLE_SIZE) {
233-
char[] charArray = table[i];
234-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
235-
&& (c4 == charArray[4]))
236-
return charArray;
132+
private int hashCode(char[] source, int from, int to) {
133+
int result = source[from];
134+
for (int i = from + 1; i < to; i++) {
135+
result = 31 * result + source[i];
237136
}
238-
// ---------other side---------
239-
i = -1;
240-
int max = this.newEntry5;
241-
while (++i <= max) {
242-
char[] charArray = table[i];
243-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
244-
&& (c4 == charArray[4]))
245-
return charArray;
246-
}
247-
// --------add the entry-------
248-
if (++max >= INTERNAL_TABLE_SIZE)
249-
max = 0;
250-
char[] r;
251-
System.arraycopy(src, start, r = new char[5], 0, 5);
252-
return table[this.newEntry5 = max] = r;
137+
return result;
253138
}
254139

255-
private final char[] optimizedCurrentTokenSource6(char[] source, int startPosition) {
256-
char[] src = source;
257-
int start = startPosition;
258-
char c0, c1 = src[start + 1], c2, c3 = src[start + 3], c4, c5 = src[start + 5];
259-
int hash = (((c0 = src[start]) << 12) + ((c2 = src[start + 2]) << 6) + (c4 = src[start + 4])) % TABLE_SIZE;
260-
char[][] table = this.charArray_length[4][hash];
261-
int i = this.newEntry6;
262-
while (++i < INTERNAL_TABLE_SIZE) {
263-
char[] charArray = table[i];
264-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
265-
&& (c4 == charArray[4]) && (c5 == charArray[5]))
266-
return charArray;
140+
private boolean equals(char[] source, int from, int to, char[] charArray) {
141+
if (charArray.length != to - from) {
142+
return false;
267143
}
268-
// ---------other side---------
269-
i = -1;
270-
int max = this.newEntry6;
271-
while (++i <= max) {
272-
char[] charArray = table[i];
273-
if ((c0 == charArray[0]) && (c1 == charArray[1]) && (c2 == charArray[2]) && (c3 == charArray[3])
274-
&& (c4 == charArray[4]) && (c5 == charArray[5]))
275-
return charArray;
144+
for (int i = from; i < to; i++) {
145+
if (source[i] != charArray[i - from]) {
146+
return false;
147+
}
276148
}
277-
// --------add the entry-------
278-
if (++max >= INTERNAL_TABLE_SIZE)
279-
max = 0;
280-
char[] r;
281-
System.arraycopy(src, start, r = new char[6], 0, 6);
282-
return table[this.newEntry6 = max] = r;
149+
return true;
283150
}
284151
}

0 commit comments

Comments
 (0)