26
26
* Object::equals and Object::hashCode for the methods add, remove, and contains to be well-defined.
27
27
*/
28
28
public class HashSet <T > {
29
- private static final int INITIAL_CAPACITY = 16 ; // Initial capacity of the hash set.
30
- private static final double LOAD_FACTOR = 0.75 ; // Load factor threshold for resizing.
31
- private static final int ELEMENT_NOT_FOUND = -1 ;
29
+ private static final int INITIAL_CAPACITY = 16 ; // Initial capacity of the hash set. Arbitrary.
30
+ private static final double LOAD_FACTOR = 0.75 ; // Load factor threshold for resizing. Arbitrary.
32
31
private int size ; // Number of elements present in the Set (its cardinality).
32
+
33
+ // An array is used, instead of an ArrayList, to prevent automatic resizing. This introduces some complexity,
34
+ // since Java arrays are covariant, which could lead to heap pollution if not properly handled.
33
35
private T [] buckets ;
34
- private final T tombstone ;
35
36
36
37
/**
37
- * Creates a HashSet with an initial capacity of 16.
38
+ * Creates an empty HashSet with an initial capacity of 16.
38
39
*/
39
40
public HashSet () {
40
41
// Safe cast because the only way to add elements into this HashSet is through the add method, which
@@ -44,41 +45,44 @@ public HashSet() {
44
45
this .buckets = tempBuckets ;
45
46
this .size = 0 ;
46
47
47
- // There is no way to retrieve an instance of Tombstone. Therefore, it is safe to cast Tombstone to T.
48
- @ SuppressWarnings ("unchecked" )
49
- T tempVar = (T ) Tombstone .TOMBSTONE ;
50
- this .tombstone = tempVar ;
51
48
}
52
49
53
50
/**
54
- * Adds the specified element to this set if it is not already present
51
+ * Adds the specified element to this set if it is not already present.
55
52
* If this set already contains the element, the call leaves the set unchanged and returns false.
56
53
* <p>
57
54
* If load factor (0.75) is exceeded, triggers a resize operation and doubles the current capacity.
58
55
* It's important to note that resizing is not performed with every add operation but rather when the load
59
- * factor exceeds the threshold. Therefore, the amortized time complexity of adding elements remains O(1)
56
+ * factor exceeds the threshold. Therefore, the amortized time complexity of adding elements remains O(1).
60
57
*
61
- * @param element the element to be added to this set
62
- * @return true if this set did not already contain the specified
63
- * element
58
+ * @param element the element to be added to this set.
59
+ * @return true if this set did not already contain the specified element.
64
60
*/
65
61
public boolean add (T element ) {
66
- if (this . contains (element )) {
67
- return false ;
62
+ if (contains (element )) {
63
+ return false ; // Element is not added.
68
64
}
69
65
70
66
if (isLoadFactorExceeded ()) {
71
- resize (this . capacity () * 2 ); // Resize to double the capacity.
67
+ resize (capacity () * 2 ); // Resize to double the capacity.
72
68
}
73
69
74
- int bucketIndex = this .linearProbe (element );
75
- if (!this .isEmptyBucket (
76
- bucketIndex )) { // probe function returns the index of an empty bucket or the index containing the element.
77
- return false ; // Duplicate elements are not added to the set.
70
+ int collisions = 0 ;
71
+ while (collisions < capacity ()) {
72
+ int bucketIndex = hashFunction (element , collisions );
73
+
74
+ // Insert into empty bucket.
75
+ if (isEmptyBucket (bucketIndex )) {
76
+ buckets [bucketIndex ] = element ;
77
+ this .size ++;
78
+ return true ;
79
+ }
80
+
81
+ // Bucket is not empty
82
+ collisions ++;
78
83
}
79
- this .buckets [bucketIndex ] = element ;
80
- this .size ++;
81
- return true ;
84
+
85
+ return false ;
82
86
}
83
87
84
88
/**
@@ -99,19 +103,28 @@ public boolean add(T element) {
99
103
*/
100
104
public boolean remove (T element ) {
101
105
// If load factor is at or below 0.25 and the halved capacity is still at least the minimum (16), shrink the hashset by half.
102
- if (this . size () <= this . capacity () * 0.25 && this . capacity () / 2 >= INITIAL_CAPACITY ) {
106
+ if (size () <= capacity () * 0.25 && capacity () / 2 >= INITIAL_CAPACITY ) {
103
107
resize (this .capacity () / 2 );
104
108
}
105
109
106
- int bucketIndex = this .search (element );
107
- // If the index returned by the probe function contains an empty bucket, then the element is not present in
108
- // the set.
109
- if (bucketIndex == ELEMENT_NOT_FOUND ) {
110
- return false ;
110
+ int collisions = 0 ;
111
+ while (collisions < capacity ()) {
112
+ int bucketIndex = hashFunction (element , collisions );
113
+
114
+ // Element is not removed, because it is not in the Set.
115
+ if (isNullBucket (bucketIndex )) {
116
+ return false ;
117
+ }
118
+
119
+ if (buckets [bucketIndex ].equals (element )) {
120
+ buckets [bucketIndex ] = tombstone (); // Mark the current bucket with a Tombstone.
121
+ size --;
122
+ return true ;
123
+ }
124
+
125
+ collisions ++;
111
126
}
112
- this .buckets [bucketIndex ] = this .tombstone ; // marks the current bucket with a tombstone.
113
- this .size --;
114
- return true ;
127
+ return false ;
115
128
}
116
129
117
130
/**
@@ -121,15 +134,29 @@ public boolean remove(T element) {
121
134
* @return true if this set contains the specified element
122
135
*/
123
136
public boolean contains (T element ) {
124
- int bucketIndex = this .search (element );
137
+ int collisions = 0 ;
138
+ while (collisions < capacity ()) {
139
+ int bucketIndex = hashFunction (element , collisions );
140
+
141
+ // Invariant: Probe sequence is unbroken (no null values between buckets in the sequence).
142
+ // This is maintained by add and remove.
143
+ // This means that given a probe sequence e.g. (1, 2, 3, 4, 5, ...) for a given element, add will attempt to
144
+ // add the element into the buckets in the given order. As a result, if an element is in bucket 3, there
145
+ // will be elements in buckets 1 and 2, given that there must have been collisions for the element to be
146
+ // added to bucket 3 instead of bucket 1, or bucket 2.
147
+ // Similarly, to maintain that invariant, remove will not replace the element with null, but with a
148
+ // marker (Tombstone).
149
+ if (isNullBucket (bucketIndex )) {
150
+ return false ;
151
+ }
125
152
126
- if (bucketIndex == ELEMENT_NOT_FOUND ) {
127
- return false ;
153
+ if (buckets [bucketIndex ].equals (element )) {
154
+ return true ;
155
+ }
156
+ // Skips Tombstones/Deleted elements.
157
+ collisions ++;
128
158
}
129
-
130
- // Checks equality of element using Object::equals and Object::hashCode
131
- return element .equals (this .buckets [bucketIndex ])
132
- && element .hashCode () == this .buckets [bucketIndex ].hashCode ();
159
+ return false ;
133
160
}
134
161
135
162
/**
@@ -156,8 +183,8 @@ public int size() {
156
183
* @return the list representation of this HashSet.
157
184
*/
158
185
public List <T > toList () {
159
- return Arrays .stream (this . buckets )
160
- .filter (element -> element != null && !element .equals (this . tombstone ))
186
+ return Arrays .stream (buckets )
187
+ .filter (element -> element != null && !element .equals (tombstone () ))
161
188
.collect (Collectors .toList ());
162
189
}
163
190
@@ -173,6 +200,10 @@ public int capacity() {
173
200
174
201
/**
175
202
* Hashes the specified element to determine the bucket index for placement within the array.
203
+ * Note that the hashFunction for an open-addressed HashSet is defined differently from that of chaining,
204
+ * in that it receives an extra <code>collisions</code> parameter. The number of collisions is taken into account
205
+ * according to the probe strategy (Linear Probing in this implementation).
206
+ * <p>
176
207
* The hash function calculates the index by performing the following steps:
177
208
* <p>
178
209
* 1. Obtains the hash code of the element using its `hashCode` method.
@@ -185,71 +216,33 @@ public int capacity() {
185
216
* within the valid range of the array bounds.
186
217
* This ensures that the index falls within the range of available buckets.
187
218
*
188
- * @param element the element to be hashed
219
+ * @param element the element to be hashed
220
+ * @param collisions the number of collisions so far.
189
221
* @return the bucket index where the element should be placed
190
222
*/
191
- private int hashFunction (T element ) {
192
- int hashCode = element .hashCode ();
193
- return (hashCode & 0x7FFFFFFF ) % buckets .length ;
194
- }
223
+ private int hashFunction (T element , int collisions ) {
224
+ int hashCode = element .hashCode () & 0x7FFFFFFF ;
195
225
196
- /**
197
- * Given an element, returns the index of an empty (defined as null OR tombstone) bucket to insert the element at.
198
- * If the element is already present in the HashSet, return its index.
199
- *
200
- * @param element the given element to probe an empty bucket for.
201
- * @return the index of an empty bucket.
202
- */
203
- private int linearProbe (T element ) {
204
- int startingProbeIndex = hashFunction (element );
205
-
206
- int currentBucketIndex = startingProbeIndex ;
207
- for (int i = 0 ; i < this .capacity () - 1 ; i ++) {
208
- T existingElement = this .buckets [currentBucketIndex ];
209
- // check for empty / available bucket.
210
- if (this .isEmptyBucket (currentBucketIndex )) {
211
- return currentBucketIndex ;
212
- }
226
+ // This step is where the OA and chaining implementation differs.
227
+ int probeAdjustedHash = linearProbe (hashCode , collisions );
213
228
214
- // check if element is equals to the element in the bucket.
215
- // Checks equality of element using Object::equals and Object::hashCode
216
- if (element .equals (existingElement )
217
- && element .hashCode () == existingElement .hashCode ()) {
218
- return currentBucketIndex ;
219
- }
220
- currentBucketIndex = (currentBucketIndex + 1 ) % this .capacity ();
221
- }
222
- return ELEMENT_NOT_FOUND ; // placeholder return value for now. Will never reach this line.
229
+ return probeAdjustedHash % capacity (); // Division-method. Not the most ideal.
223
230
}
224
231
225
232
/**
226
- * Given an element, return the index of the bucket containing the element.
227
- * Performance degrades badly as load factor approaches 1.
233
+ * Returns the new hash based on the number of collisions so far. Uses linear probing which increments the hash
234
+ * linearly, hash + (c * collisions), where c is 1 in this case.
235
+ * <p>
236
+ * NOTE: Quadratic probing would look something like
237
+ * hash + (c * collisions ^ 2).
228
238
*
229
- * @param element the element to look for.
230
- * @return the index of the bucket containing the element.
239
+ * @param hash the original hash value, without accounting for the number of collisions. This would be the
240
+ * same hash value as the chaining implementation.
241
+ * @param collisions the number of collisions so far.
242
+ * @return the new hash value, after adjusting for the number of collisions.
231
243
*/
232
- private int search (T element ) {
233
- int startingProbeIndex = hashFunction (element );
234
-
235
- int currentBucketIndex = startingProbeIndex ;
236
- for (int i = 0 ; i < this .capacity () - 1 ; i ++) {
237
- // if bucket contains NULL, means element is not present because deleted elements are marked with tombstone.
238
- // That is to say given an arbitrary probe sequence of index 1, 2, 3, ..., there can never be a case where
239
- // there is a NULL bucket in the middle of the probe sequence; only tombstone markers.
240
- if (this .isNullBucket (currentBucketIndex )) {
241
- return ELEMENT_NOT_FOUND ;
242
- }
243
-
244
- // Checks equality of elements using Object::equals and Object::hashCode.
245
- if (this .buckets [currentBucketIndex ].equals (element )
246
- && this .buckets [currentBucketIndex ].hashCode () == element .hashCode ()) {
247
- return currentBucketIndex ;
248
- }
249
- currentBucketIndex = (currentBucketIndex + 1 ) % this .capacity ();
250
- }
251
-
252
- return ELEMENT_NOT_FOUND ; // element is not in the HashSet.
244
+ private int linearProbe (int hash , int collisions ) {
245
+ return hash + collisions ;
253
246
}
254
247
255
248
/**
@@ -269,7 +262,7 @@ private boolean isEmptyBucket(int bucketIndex) {
269
262
* @return true if the bucket contains null at the given bucketIndex.
270
263
*/
271
264
private boolean isNullBucket (int bucketIndex ) {
272
- return this . buckets [bucketIndex ] == null ;
265
+ return buckets [bucketIndex ] == null ;
273
266
}
274
267
275
268
/**
@@ -279,7 +272,7 @@ private boolean isNullBucket(int bucketIndex) {
279
272
* @return true if the bucket contains a Tombstone at the given bucketIndex.
280
273
*/
281
274
private boolean isTombstoneBucket (int bucketIndex ) {
282
- return this . tombstone .equals (this . buckets [bucketIndex ]);
275
+ return tombstone () .equals (buckets [bucketIndex ]);
283
276
}
284
277
285
278
/**
@@ -306,7 +299,7 @@ private void resize(int newCapacity) {
306
299
// re-hashes every element and re-insert into the newly created buckets.
307
300
Arrays .stream (temp )
308
301
.filter (Objects ::nonNull )
309
- .filter (element -> !element .equals (this . tombstone ))
302
+ .filter (element -> !element .equals (tombstone () ))
310
303
.forEach (this ::add );
311
304
}
312
305
@@ -320,6 +313,14 @@ private boolean isLoadFactorExceeded() {
320
313
return this .size () >= this .capacity () * LOAD_FACTOR ;
321
314
}
322
315
316
+ private static <T > T tombstone () {
317
+ // It is safe to cast Tombstone to T, because methods retrieving elements (HashSet::get) from the HashSet
318
+ // should, and will check whether the item is a Tombstone object, returning null in-place of the Tombstone.
319
+ @ SuppressWarnings ("unchecked" )
320
+ T tombstone = (T ) Tombstone .TOMBSTONE ;
321
+ return tombstone ;
322
+ }
323
+
323
324
/**
324
325
* The `Tombstone` class is a marker used to represent removed elements in the `HashSet`.
325
326
* When an element is removed from the set, its corresponding bucket is marked with a tombstone
@@ -335,6 +336,7 @@ private Tombstone() {}
335
336
336
337
/**
337
338
* Checks if the given object is an instance of Tombstone.
339
+ * Any two Tombstone instances are always considered equal.
338
340
*
339
341
* @param obj the object to compare
340
342
* @return true if the object is an instance of Tombstone, false otherwise
0 commit comments