Skip to content

Commit b02a30c

Browse files
committed
refactor: Update OpenAddressing HashSet implementation
1. Update private Tombstone class to align with CS2030S Maybe's implementation as much as possible. This is to reduce cognitive overhead when reading the implementation, assuming students are taking CS2030S along with CS2040S. 2. Update implementation of hashFunction to align with CS2040S. Previously, the hashFunction simply calculated a hash, and the probe sequence was then generated by the linearProbe function. Similarly, a search function that was mostly the same as the linearProbe function was implemented to deal with null/tombstone buckets differently. This was not a good implementation, because of: - Naming - Tightly coupled methods The refactored hashFunction and linearProbe now take in an extra parameter: collisions, and are easily extensible by changing a few LoCs. NOTE: The probeSequence generated by the hashFunction and linearProbe is not a "good" hash function as defined by CS2040S. While the hashFunction enumerates ALL possible buckets, the sequence is NOT a permutation of {1...m} where m is the number of buckets.
1 parent 55f3046 commit b02a30c

File tree

1 file changed

+103
-101
lines changed
  • src/main/java/dataStructures/hashSet/openAddressing

1 file changed

+103
-101
lines changed

src/main/java/dataStructures/hashSet/openAddressing/HashSet.java

Lines changed: 103 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,16 @@
2626
* Object::equals and Object::hashCode for the methods add, remove, and contains to be well-defined.
2727
*/
2828
public class HashSet<T> {
29-
private static final int INITIAL_CAPACITY = 16; // Initial capacity of the hash set.
30-
private static final double LOAD_FACTOR = 0.75; // Load factor threshold for resizing.
31-
private static final int ELEMENT_NOT_FOUND = -1;
29+
private static final int INITIAL_CAPACITY = 16; // Initial capacity of the hash set. Arbitrary.
30+
private static final double LOAD_FACTOR = 0.75; // Load factor threshold for resizing. Arbitrary.
3231
private int size; // Number of elements present in the Set (its cardinality).
32+
33+
// An array is used, instead of an ArrayList, to prevent automatic resizing. This introduces some complexity,
34+
// since Java arrays are covariant, which could lead to heap pollution if not properly handled.
3335
private T[] buckets;
34-
private final T tombstone;
3536

3637
/**
37-
* Creates a HashSet with an initial capacity of 16.
38+
* Creates an empty HashSet with an initial capacity of 16.
3839
*/
3940
public HashSet() {
4041
// Safe cast because the only way to add elements into this HashSet is through the add method, which
@@ -44,41 +45,44 @@ public HashSet() {
4445
this.buckets = tempBuckets;
4546
this.size = 0;
4647

47-
// There is no way to retrieve an instance of Tombstone. Therefore, it is safe to cast Tombstone to T.
48-
@SuppressWarnings("unchecked")
49-
T tempVar = (T) Tombstone.TOMBSTONE;
50-
this.tombstone = tempVar;
5148
}
5249

5350
/**
54-
* Adds the specified element to this set if it is not already present
51+
* Adds the specified element to this set if it is not already present.
5552
* If this set already contains the element, the call leaves the set unchanged and returns false.
5653
* <p>
5754
* If the load factor (0.75) is exceeded, triggers a resize operation that doubles the current capacity.
5855
* It's important to note that resizing is not performed with every add operation but rather when the load
59-
* factor exceeds the threshold. Therefore, the amortized time complexity of adding elements remains O(1)
56+
* factor exceeds the threshold. Therefore, the amortized time complexity of adding elements remains O(1).
6057
*
61-
* @param element the element to be added to this set
62-
* @return true if this set did not already contain the specified
63-
* element
58+
* @param element the element to be added to this set.
59+
* @return true if this set did not already contain the specified element.
6460
*/
6561
public boolean add(T element) {
66-
if (this.contains(element)) {
67-
return false;
62+
if (contains(element)) {
63+
return false; // Element is not added.
6864
}
6965

7066
if (isLoadFactorExceeded()) {
71-
resize(this.capacity() * 2); // Resize to double the capacity.
67+
resize(capacity() * 2); // Resize to double the capacity.
7268
}
7369

74-
int bucketIndex = this.linearProbe(element);
75-
if (!this.isEmptyBucket(
76-
bucketIndex)) { // probe function returns the index of an empty bucket or the index containing the element.
77-
return false; // Duplicate elements are not added to the set.
70+
int collisions = 0;
71+
while (collisions < capacity()) {
72+
int bucketIndex = hashFunction(element, collisions);
73+
74+
// Insert into empty bucket.
75+
if (isEmptyBucket(bucketIndex)) {
76+
buckets[bucketIndex] = element;
77+
this.size++;
78+
return true;
79+
}
80+
81+
// Bucket is not empty
82+
collisions++;
7883
}
79-
this.buckets[bucketIndex] = element;
80-
this.size++;
81-
return true;
84+
85+
return false;
8286
}
8387

8488
/**
@@ -99,19 +103,28 @@ public boolean add(T element) {
99103
*/
100104
public boolean remove(T element) {
101105
// If the load factor is at or below 0.25 and halving keeps the capacity at or above the minimum (16), shrink the HashSet by half.
102-
if (this.size() <= this.capacity() * 0.25 && this.capacity() / 2 >= INITIAL_CAPACITY) {
106+
if (size() <= capacity() * 0.25 && capacity() / 2 >= INITIAL_CAPACITY) {
103107
resize(this.capacity() / 2);
104108
}
105109

106-
int bucketIndex = this.search(element);
107-
// If the index returned by the probe function contains an empty bucket, then the element is not present in
108-
// the set.
109-
if (bucketIndex == ELEMENT_NOT_FOUND) {
110-
return false;
110+
int collisions = 0;
111+
while (collisions < capacity()) {
112+
int bucketIndex = hashFunction(element, collisions);
113+
114+
// Element is not removed, because it is not in the Set.
115+
if (isNullBucket(bucketIndex)) {
116+
return false;
117+
}
118+
119+
if (buckets[bucketIndex].equals(element)) {
120+
buckets[bucketIndex] = tombstone(); // Mark the current bucket with a Tombstone.
121+
size--;
122+
return true;
123+
}
124+
125+
collisions++;
111126
}
112-
this.buckets[bucketIndex] = this.tombstone; // marks the current bucket with a tombstone.
113-
this.size--;
114-
return true;
127+
return false;
115128
}
116129

117130
/**
@@ -121,15 +134,29 @@ public boolean remove(T element) {
121134
* @return true if this set contains the specified element
122135
*/
123136
public boolean contains(T element) {
124-
int bucketIndex = this.search(element);
137+
int collisions = 0;
138+
while (collisions < capacity()) {
139+
int bucketIndex = hashFunction(element, collisions);
140+
141+
// Invariant: Probe sequence is unbroken (no null values between buckets in the sequence).
142+
// This is maintained by add and remove.
143+
// This means that given a probe sequence e.g. (1, 2, 3, 4, 5, ...) for a given element, add will attempt to
144+
// add the element into the buckets in the given order. As a result, if an element is in bucket 3, there
145+
// will be elements in buckets 1 and 2, given that there must have been collisions for the element to be
146+
// added to bucket 3 instead of bucket 1, or bucket 2.
147+
// Similarly, to maintain that invariant, remove will not replace the element with null, but with a
148+
// marker (Tombstone).
149+
if (isNullBucket(bucketIndex)) {
150+
return false;
151+
}
125152

126-
if (bucketIndex == ELEMENT_NOT_FOUND) {
127-
return false;
153+
if (buckets[bucketIndex].equals(element)) {
154+
return true;
155+
}
156+
// Skips Tombstones/Deleted elements.
157+
collisions++;
128158
}
129-
130-
// Checks equality of element using Object::equals and Object::hashCode
131-
return element.equals(this.buckets[bucketIndex])
132-
&& element.hashCode() == this.buckets[bucketIndex].hashCode();
159+
return false;
133160
}
134161

135162
/**
@@ -156,8 +183,8 @@ public int size() {
156183
* @return the list representation of this HashSet.
157184
*/
158185
public List<T> toList() {
159-
return Arrays.stream(this.buckets)
160-
.filter(element -> element != null && !element.equals(this.tombstone))
186+
return Arrays.stream(buckets)
187+
.filter(element -> element != null && !element.equals(tombstone()))
161188
.collect(Collectors.toList());
162189
}
163190

@@ -173,6 +200,10 @@ public int capacity() {
173200

174201
/**
175202
* Hashes the specified element to determine the bucket index for placement within the array.
203+
* Note that the hashFunction for an open-addressed HashSet is defined differently from that of chaining,
204+
* in that it receives an extra <code>collisions</code> parameter. This collision count will be taken into account
205+
* based on the probe strategy (Linear Probing in this implementation).
206+
* <p>
176207
* The hash function calculates the index by performing the following steps:
177208
* <p>
178209
* 1. Obtains the hash code of the element using its `hashCode` method.
@@ -185,71 +216,33 @@ public int capacity() {
185216
* within the valid range of the array bounds.
186217
* This ensures that the index falls within the range of available buckets.
187218
*
188-
* @param element the element to be hashed
219+
* @param element the element to be hashed
220+
* @param collisions the number of collisions so far.
189221
* @return the bucket index where the element should be placed
190222
*/
191-
private int hashFunction(T element) {
192-
int hashCode = element.hashCode();
193-
return (hashCode & 0x7FFFFFFF) % buckets.length;
194-
}
223+
private int hashFunction(T element, int collisions) {
224+
int hashCode = element.hashCode() & 0x7FFFFFFF;
195225

196-
/**
197-
* Given an element, returns the index of an empty (defined as null OR tombstone) bucket to insert the element at.
198-
* If the element is already present in the HashSet, return its index.
199-
*
200-
* @param element the given element to probe an empty bucket for.
201-
* @return the index of an empty bucket.
202-
*/
203-
private int linearProbe(T element) {
204-
int startingProbeIndex = hashFunction(element);
205-
206-
int currentBucketIndex = startingProbeIndex;
207-
for (int i = 0; i < this.capacity() - 1; i++) {
208-
T existingElement = this.buckets[currentBucketIndex];
209-
// check for empty / available bucket.
210-
if (this.isEmptyBucket(currentBucketIndex)) {
211-
return currentBucketIndex;
212-
}
226+
// This step is where the OA and chaining implementation differs.
227+
int probeAdjustedHash = linearProbe(hashCode, collisions);
213228

214-
// check if element is equals to the element in the bucket.
215-
// Checks equality of element using Object::equals and Object::hashCode
216-
if (element.equals(existingElement)
217-
&& element.hashCode() == existingElement.hashCode()) {
218-
return currentBucketIndex;
219-
}
220-
currentBucketIndex = (currentBucketIndex + 1) % this.capacity();
221-
}
222-
return ELEMENT_NOT_FOUND; // placeholder return value for now. Will never reach this line.
229+
return probeAdjustedHash % capacity(); // Division-method. Not the most ideal.
223230
}
224231

225232
/**
226-
* Given an element, return the index of the bucket containing the element.
227-
* Performance degrades badly as load factor approaches 1.
233+
* Returns the new hash based on the number of collisions so far. Uses linear probing which increments the hash
234+
* linearly, hash + (c * collisions), where c is 1 in this case.
235+
* <p>
236+
* NOTE: Quadratic probing would look something like
237+
* hash + (c * collisions^2), i.e. the offset grows with the square of the number of collisions.
228238
*
229-
* @param element the element to look for.
230-
* @return the index of the bucket containing the element.
239+
* @param hash the original hash value, without accounting for the number of collisions. This would be the
240+
* same hash value as the chaining implementation.
241+
* @param collisions the number of collisions so far.
242+
* @return the new hash value, after adjusting for the number of collisions.
231243
*/
232-
private int search(T element) {
233-
int startingProbeIndex = hashFunction(element);
234-
235-
int currentBucketIndex = startingProbeIndex;
236-
for (int i = 0; i < this.capacity() - 1; i++) {
237-
// if bucket contains NULL, means element is not present because deleted elements are marked with tombstone.
238-
// That is to say given an arbitrary probe sequence of index 1, 2, 3, ..., there can never be a case where
239-
// there is a NULL bucket in the middle of the probe sequence; only tombstone markers.
240-
if (this.isNullBucket(currentBucketIndex)) {
241-
return ELEMENT_NOT_FOUND;
242-
}
243-
244-
// Checks equality of elements using Object::equals and Object::hashCode.
245-
if (this.buckets[currentBucketIndex].equals(element)
246-
&& this.buckets[currentBucketIndex].hashCode() == element.hashCode()) {
247-
return currentBucketIndex;
248-
}
249-
currentBucketIndex = (currentBucketIndex + 1) % this.capacity();
250-
}
251-
252-
return ELEMENT_NOT_FOUND; // element is not in the HashSet.
244+
private int linearProbe(int hash, int collisions) {
245+
return hash + collisions;
253246
}
254247

255248
/**
@@ -269,7 +262,7 @@ private boolean isEmptyBucket(int bucketIndex) {
269262
* @return true if the bucket contains null at the given bucketIndex.
270263
*/
271264
private boolean isNullBucket(int bucketIndex) {
272-
return this.buckets[bucketIndex] == null;
265+
return buckets[bucketIndex] == null;
273266
}
274267

275268
/**
@@ -279,7 +272,7 @@ private boolean isNullBucket(int bucketIndex) {
279272
* @return true if the bucket contains a Tombstone at the given bucketIndex.
280273
*/
281274
private boolean isTombstoneBucket(int bucketIndex) {
282-
return this.tombstone.equals(this.buckets[bucketIndex]);
275+
return tombstone().equals(buckets[bucketIndex]);
283276
}
284277

285278
/**
@@ -306,7 +299,7 @@ private void resize(int newCapacity) {
306299
// re-hashes every element and re-insert into the newly created buckets.
307300
Arrays.stream(temp)
308301
.filter(Objects::nonNull)
309-
.filter(element -> !element.equals(this.tombstone))
302+
.filter(element -> !element.equals(tombstone()))
310303
.forEach(this::add);
311304
}
312305

@@ -320,6 +313,14 @@ private boolean isLoadFactorExceeded() {
320313
return this.size() >= this.capacity() * LOAD_FACTOR;
321314
}
322315

316+
private static <T> T tombstone() {
317+
// It is safe to cast Tombstone to T, because methods retrieving elements (HashSet::get) from the HashSet
318+
// should, and will check whether the item is a Tombstone object, returning null in-place of the Tombstone.
319+
@SuppressWarnings("unchecked")
320+
T tombstone = (T) Tombstone.TOMBSTONE;
321+
return tombstone;
322+
}
323+
323324
/**
324325
* The `Tombstone` class is a marker used to represent removed elements in the `HashSet`.
325326
* When an element is removed from the set, its corresponding bucket is marked with a tombstone
@@ -335,6 +336,7 @@ private Tombstone() {}
335336

336337
/**
337338
* Checks if the given object is an instance of Tombstone.
339+
* Two Tombstone instances will always be the same.
338340
*
339341
* @param obj the object to compare
340342
* @return true if the object is an instance of Tombstone, false otherwise

0 commit comments

Comments
 (0)