@@ -23,19 +23,17 @@ public class CharDeduplication {
2323
2424 // ----- immutable static part (thread safe): ----
2525
26+ private final static char [] CHAR_ARRAY0 = new char [0 ]; // shared empty array, returned for zero-length ranges
2627 static final char [] ASCII_CHARS [] = new char [128 ][]; // ASCII_CHARS[c] is a shared one-element array holding the char c
2728 static {
2829 for (int i = 0 ; i < ASCII_CHARS .length ; i ++) {
2930 ASCII_CHARS [i ] = new char [] { (char ) i };
3031 }
3132 }
32- public static final int TABLE_SIZE = 30 ; // XXX thats not a prime -> bad for hashing, nor a power of 2 -> expensive
33- // modulo computation
34- public static final int INTERNAL_TABLE_SIZE = 6 ; // 30*6 =180 entries
35-
36- public static final int OPTIMIZED_LENGTH = 6 ;
37-
38- private final static char [] CHAR_ARRAY0 = new char [0 ];
33+ /** Number of slots in the hash table; thanks to hashing it does not affect lookup performance, but it does affect memory. */
34+ public static final int TABLE_SIZE = 8192 ; // a power of 2, so the modulo can be computed with a fast bit mask
35+ /** Number of consecutive slots searched linearly per lookup; a larger value costs performance but decreases collisions - does not affect memory. */
36+ public static final int SEARCH_SIZE = 8 ; // a power of 2, has to be smaller than TABLE_SIZE
3937
4038 /** avoid OOME by additional CharDeduplication memory **/
4139 static final class CacheReference <T > {
@@ -59,56 +57,32 @@ T get() {
5957
6058 private final static ThreadLocal <CacheReference <CharDeduplication >> mutableCache = ThreadLocal .withInitial (()->new CacheReference <>(CharDeduplication ::new ));
6159
62- private static final char [] optimizedCurrentTokenSource1 (char [] source , int startPosition ) {
63- // optimization at no speed cost of 99.5 % of the singleCharIdentifier
64- char charOne = source [startPosition ];
65- if (charOne < ASCII_CHARS .length ) {
66- return ASCII_CHARS [charOne ];
67- }
68- return new char [] { charOne };
69- }
70-
7160 /** @return an instance that is *not* thread safe. To be used in a single thread only. **/
7261 public static CharDeduplication getThreadLocalInstance () {
7362 return mutableCache .get ().get ();
7463 }
7564
7665 // ----- mutable non-static part (not thread safe!): ----
7766
78- /** single threaded only **/
79- public final char [][][][] charArray_length = new char [OPTIMIZED_LENGTH - 1 ][TABLE_SIZE ][INTERNAL_TABLE_SIZE ][];
80-
81- int newEntry2 = 0 ;
82- int newEntry3 = 0 ;
83- int newEntry4 = 0 ;
84- int newEntry5 = 0 ;
85- int newEntry6 = 0 ;
67+ /** single threaded only, hashtable with restricted linear probing **/
68+ private final char [][] hashTable = new char [TABLE_SIZE ][];
69+ private final int circularBufferPointer [] = new int [TABLE_SIZE ];
8670
8771 private CharDeduplication () {
88- init ();
89- }
90-
91- private void init () {
92- for (int i = 0 ; i < OPTIMIZED_LENGTH - 1 ; i ++) {
93- final char [] initCharArray = new char [i + 2 ];
94- for (int j = 0 ; j < TABLE_SIZE ; j ++) {
95- for (int k = 0 ; k < INTERNAL_TABLE_SIZE ; k ++) {
96- this .charArray_length [i ][j ][k ] = initCharArray ;
97- }
98- }
99- }
72+ // private: instances are obtained via getThreadLocalInstance()
10073 }
10174
10275 /** public for test purpose only **/
10376 @ Deprecated
10477 public void reset () {
105- init ();
78+ Arrays .fill (this .hashTable , null );
79+ Arrays .fill (this .circularBufferPointer , 0 );
10680 }
10781
10882 /**
10983 * like Arrays.copyOfRange(source, from, to) but returns a cached instance of the former result if
11084 * available
111- *
85+ *
11286 * @param from
11387 * start index (inclusive)
11488 * @param to
@@ -118,167 +92,60 @@ public void reset() {
11892 **/
11993 public char [] sharedCopyOfRange (char [] source , int from , int to ) {
12094 int length = to - from ;
121- switch (length ) { // see OptimizedLength
95+ switch (length ) {
12296 case 1 :
123- return optimizedCurrentTokenSource1 (source , from );
124- case 2 :
125- return optimizedCurrentTokenSource2 (source , from );
126- case 3 :
127- return optimizedCurrentTokenSource3 (source , from );
128- case 4 :
129- return optimizedCurrentTokenSource4 (source , from );
130- case 5 :
131- return optimizedCurrentTokenSource5 (source , from );
132- case 6 :
133- return optimizedCurrentTokenSource6 (source , from );
97+ char charOne = source [from ];
98+ if (charOne < ASCII_CHARS .length ) {
99+ return ASCII_CHARS [charOne ];
100+ }
101+ break ;
134102 case 0 :
135103 return CHAR_ARRAY0 ;
136104 }
137- return Arrays .copyOfRange (source , from , to );
138- }
139-
140- private final char [] optimizedCurrentTokenSource2 (char [] source , int startPosition ) {
141-
142- char [] src = source ;
143- int start = startPosition ;
144- char c0 , c1 ;
145- int hash = (((c0 = src [start ]) << 6 ) + (c1 = src [start + 1 ])) % TABLE_SIZE ;
146- char [][] table = this .charArray_length [0 ][hash ];
147- int i = this .newEntry2 ;
148- while (++i < INTERNAL_TABLE_SIZE ) {
149- char [] charArray = table [i ];
150- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]))
151- return charArray ;
152- }
153- // ---------other side---------
154- i = -1 ;
155- int max = this .newEntry2 ;
156- while (++i <= max ) {
157- char [] charArray = table [i ];
158- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]))
159- return charArray ;
160- }
161- // --------add the entry-------
162- if (++max >= INTERNAL_TABLE_SIZE )
163- max = 0 ;
164- char [] r ;
165- System .arraycopy (src , start , r = new char [2 ], 0 , 2 );
166- return table [this .newEntry2 = max ] = r ;
167- }
168-
169- private final char [] optimizedCurrentTokenSource3 (char [] source , int startPosition ) {
170- char [] src = source ;
171- int start = startPosition ;
172- char c0 , c1 = src [start + 1 ], c2 ;
173- int hash = (((c0 = src [start ]) << 6 ) + (c2 = src [start + 2 ])) % TABLE_SIZE ;
174- char [][] table = this .charArray_length [1 ][hash ];
175- int i = this .newEntry3 ;
176- while (++i < INTERNAL_TABLE_SIZE ) {
177- char [] charArray = table [i ];
178- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]))
179- return charArray ;
180- }
181- // ---------other side---------
182- i = -1 ;
183- int max = this .newEntry3 ;
184- while (++i <= max ) {
185- char [] charArray = table [i ];
186- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]))
187- return charArray ;
188- }
189- // --------add the entry-------
190- if (++max >= INTERNAL_TABLE_SIZE )
191- max = 0 ;
192- char [] r ;
193- System .arraycopy (src , start , r = new char [3 ], 0 , 3 );
194- return table [this .newEntry3 = max ] = r ;
195- }
196-
197- private final char [] optimizedCurrentTokenSource4 (char [] source , int startPosition ) {
198- char [] src = source ;
199- int start = startPosition ;
200- char c0 , c1 = src [start + 1 ], c2 , c3 = src [start + 3 ];
201- int hash = (((c0 = src [start ]) << 6 ) + (c2 = src [start + 2 ])) % TABLE_SIZE ;
202- char [][] table = this .charArray_length [2 ][hash ];
203- int i = this .newEntry4 ;
204- while (++i < INTERNAL_TABLE_SIZE ) {
205- char [] charArray = table [i ];
206- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]) && (c3 == charArray [3 ]))
105+ int hash = hashCode (source , from , to );
106+ int circularBufferStart = hash & (TABLE_SIZE - 1 );
107+ int positionToReplace = -1 ;
108+ // linear probing within circular buffer:
109+ for (int i = 0 ; i < SEARCH_SIZE ; i ++) {
110+ int position = (circularBufferStart + i ) & (TABLE_SIZE - 1 );
111+ char [] charArray = this .hashTable [position ];
112+ if (charArray == null ) {
113+ // this case only happens when the table is filling up,
114+ // but helps to get good deduplication fast
115+ positionToReplace = position ;
116+ } else if (equals (source , from , to , charArray )) {
117+ // Successfully deduplicated:
207118 return charArray ;
119+ }
208120 }
209- // ---------other side---------
210- i = -1 ;
211- int max = this .newEntry4 ;
212- while (++i <= max ) {
213- char [] charArray = table [i ];
214- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]) && (c3 == charArray [3 ]))
215- return charArray ;
121+ char [] r = Arrays .copyOfRange (source , from , to );
122+ // not found -> overwrite existing entries in a circular buffer:
123+ if (positionToReplace == -1 ) {
124+ // no empty entry found - normal case:
125+ int j = this .circularBufferPointer [circularBufferStart ]++;
126+ positionToReplace = (circularBufferStart + (j & (SEARCH_SIZE -1 ))) & (TABLE_SIZE - 1 );
216127 }
217- // --------add the entry-------
218- if (++max >= INTERNAL_TABLE_SIZE )
219- max = 0 ;
220- char [] r ;
221- System .arraycopy (src , start , r = new char [4 ], 0 , 4 );
222- return table [this .newEntry4 = max ] = r ;
128+ this .hashTable [positionToReplace ] = r ;
129+ return r ;
223130 }
224131
225- private final char [] optimizedCurrentTokenSource5 (char [] source , int startPosition ) {
226- char [] src = source ;
227- int start = startPosition ;
228- char c0 , c1 = src [start + 1 ], c2 , c3 = src [start + 3 ], c4 ;
229- int hash = (((c0 = src [start ]) << 12 ) + ((c2 = src [start + 2 ]) << 6 ) + (c4 = src [start + 4 ])) % TABLE_SIZE ;
230- char [][] table = this .charArray_length [3 ][hash ];
231- int i = this .newEntry5 ;
232- while (++i < INTERNAL_TABLE_SIZE ) {
233- char [] charArray = table [i ];
234- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]) && (c3 == charArray [3 ])
235- && (c4 == charArray [4 ]))
236- return charArray ;
132+ private int hashCode (char [] source , int from , int to ) {
133+ int result = source [from ];
134+ for (int i = from + 1 ; i < to ; i ++) {
135+ result = 31 * result + source [i ];
237136 }
238- // ---------other side---------
239- i = -1 ;
240- int max = this .newEntry5 ;
241- while (++i <= max ) {
242- char [] charArray = table [i ];
243- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]) && (c3 == charArray [3 ])
244- && (c4 == charArray [4 ]))
245- return charArray ;
246- }
247- // --------add the entry-------
248- if (++max >= INTERNAL_TABLE_SIZE )
249- max = 0 ;
250- char [] r ;
251- System .arraycopy (src , start , r = new char [5 ], 0 , 5 );
252- return table [this .newEntry5 = max ] = r ;
137+ return result ;
253138 }
254139
255- private final char [] optimizedCurrentTokenSource6 (char [] source , int startPosition ) {
256- char [] src = source ;
257- int start = startPosition ;
258- char c0 , c1 = src [start + 1 ], c2 , c3 = src [start + 3 ], c4 , c5 = src [start + 5 ];
259- int hash = (((c0 = src [start ]) << 12 ) + ((c2 = src [start + 2 ]) << 6 ) + (c4 = src [start + 4 ])) % TABLE_SIZE ;
260- char [][] table = this .charArray_length [4 ][hash ];
261- int i = this .newEntry6 ;
262- while (++i < INTERNAL_TABLE_SIZE ) {
263- char [] charArray = table [i ];
264- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]) && (c3 == charArray [3 ])
265- && (c4 == charArray [4 ]) && (c5 == charArray [5 ]))
266- return charArray ;
140+ private boolean equals (char [] source , int from , int to , char [] charArray ) {
141+ if (charArray .length != to - from ) {
142+ return false ;
267143 }
268- // ---------other side---------
269- i = -1 ;
270- int max = this .newEntry6 ;
271- while (++i <= max ) {
272- char [] charArray = table [i ];
273- if ((c0 == charArray [0 ]) && (c1 == charArray [1 ]) && (c2 == charArray [2 ]) && (c3 == charArray [3 ])
274- && (c4 == charArray [4 ]) && (c5 == charArray [5 ]))
275- return charArray ;
144+ for (int i = from ; i < to ; i ++) {
145+ if (source [i ] != charArray [i - from ]) {
146+ return false ;
147+ }
276148 }
277- // --------add the entry-------
278- if (++max >= INTERNAL_TABLE_SIZE )
279- max = 0 ;
280- char [] r ;
281- System .arraycopy (src , start , r = new char [6 ], 0 , 6 );
282- return table [this .newEntry6 = max ] = r ;
149+ return true ;
283150 }
284151}
0 commit comments