Skip to content

Commit 7834f3a

Browse files
authored
Merge pull request #21 from euchangxian/branch-hashSet-openAddressing
WIP Add HashSet which uses Open-Addressing to resolve collisions
2 parents 496c1c0 + fb64914 commit 7834f3a

File tree

5 files changed

+534
-3
lines changed

5 files changed

+534
-3
lines changed

src/dataStructures/hashSet/HashSet.java renamed to src/dataStructures/hashSet/chaining/HashSet.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package src.dataStructures.hashSet;
1+
package src.dataStructures.hashSet.chaining;
22

33
import src.dataStructures.linkedList.LinkedList;
44

Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
package src.dataStructures.hashSet.openAddressing;
2+
3+
import java.util.Arrays;
4+
import java.util.List;
5+
import java.util.Objects;
6+
import java.util.stream.Collectors;
7+
8+
/**
 * Implementation of a HashSet that uses Open Addressing and linear probing to resolve collisions.
 *
 * <p>The cost of an operation has two components: computing the hash value of the element (typically constant
 * time), and probing consecutive buckets, starting at the element's home bucket, until the element or a
 * never-used (null) bucket is found.
 *
 * <p>Removed elements are replaced by a tombstone marker so that probe sequences passing through the removed
 * slot are not cut short. Tombstones are counted together with live elements when deciding whether the table
 * is too full; when they alone push the table over the load factor, the table is rehashed in place (same
 * capacity) to discard them. This guarantees that at least one null bucket always exists, so every probe
 * sequence terminates.
 *
 * <p>Public methods (along with their time-complexity):
 * <ul>
 *   <li>boolean add(T element) adds the given element into the HashSet. Expected O(1) assuming SUHA.</li>
 *   <li>boolean contains(T element) checks if the given element is present. Expected O(1) assuming SUHA.</li>
 *   <li>boolean remove(T element) removes the given element. Expected O(1) assuming SUHA.</li>
 *   <li>List&lt;T&gt; toList() returns a List representation of this HashSet. O(m), m being the capacity.</li>
 *   <li>int size() gets the number of elements (cardinality) in this HashSet. O(1).</li>
 *   <li>boolean isEmpty() checks if the HashSet is empty. O(1).</li>
 *   <li>int capacity() returns the number of buckets of this HashSet. O(1).</li>
 * </ul>
 *
 * <p>Null elements are not supported because null marks a never-used bucket; add, remove and contains throw
 * a NullPointerException when given null.
 *
 * @param <T> the type of objects that are contained within this HashSet. T must override
 *            Object::equals and Object::hashCode for the methods add, remove, and contains to be well-defined.
 */
public class HashSet<T> {
    /** Number of buckets of a newly created set; the table never shrinks below this. */
    private static final int INITIAL_CAPACITY = 16;
    /** Fraction of the capacity at (or above) which the table is grown or purged of tombstones. */
    private static final double LOAD_FACTOR = 0.75;
    /** Fraction of the capacity at (or below) which the table is halved on removal. */
    private static final double SHRINK_FACTOR = 0.25;
    /** Sentinel index returned by {@link #search(Object)} when the element is absent. */
    private static final int ELEMENT_NOT_FOUND = -1;
    /**
     * Marker stored in a bucket whose element was removed. It is always compared by identity, so
     * user-supplied equals implementations are never invoked with it.
     */
    private static final Object TOMBSTONE = new Object();

    private int size; // Number of elements present in the Set (its cardinality).
    private int tombstones; // Number of buckets currently holding the TOMBSTONE marker.
    // Declared as Object[] because buckets hold either null, TOMBSTONE, or an element of type T.
    private Object[] buckets;

    /**
     * Creates a HashSet with an initial capacity of 16.
     */
    public HashSet() {
        this.buckets = new Object[INITIAL_CAPACITY];
        this.size = 0;
        this.tombstones = 0;
    }

    /**
     * Adds the specified element to this set if it is not already present.
     * If this set already contains the element, the call leaves the set unchanged and returns false.
     *
     * <p>If the number of elements has reached the load factor (0.75), the capacity is doubled before
     * inserting. If only the combination of elements and tombstones has reached it, the table is rehashed
     * at the same capacity to discard the tombstones. Neither happens on every call, so the amortized time
     * complexity of adding elements remains O(1).
     *
     * @param element the element to be added to this set
     * @return true if this set did not already contain the specified element
     * @throws NullPointerException if element is null
     */
    public boolean add(T element) {
        Objects.requireNonNull(element, "element must not be null");
        if (this.search(element) != ELEMENT_NOT_FOUND) {
            return false; // Duplicate elements are not added to the set.
        }

        if (this.isLoadFactorExceeded()) {
            this.resize(this.capacity() * 2); // Resize to double the capacity.
        } else if (this.size + this.tombstones >= this.capacity() * LOAD_FACTOR) {
            this.resize(this.capacity()); // Same capacity: only purges the accumulated tombstones.
        }

        this.insert(element);
        return true;
    }

    /**
     * Removes the specified element from this set if it is present. Returns true if this set
     * contained the element (or equivalently, if this set changed as a result of the call).
     *
     * <p>Removed elements are replaced with a tombstone instead of null, so that a later search for an
     * element stored further along the same probe sequence does not terminate early.
     *
     * <p>Before searching, if the number of elements is at most a quarter of the capacity and halving would
     * not go below the minimum capacity (16), the capacity is halved. This check happens before the removal
     * and regardless of whether the element is present. It does not happen on every call, so the amortized
     * time complexity of removing elements remains O(1).
     *
     * @param element the element to be removed from this set, if present
     * @return true if this set contained the specified element
     * @throws NullPointerException if element is null
     */
    public boolean remove(T element) {
        Objects.requireNonNull(element, "element must not be null");
        if (this.size <= this.capacity() * SHRINK_FACTOR && this.capacity() / 2 >= INITIAL_CAPACITY) {
            this.resize(this.capacity() / 2);
        }

        int bucketIndex = this.search(element);
        if (bucketIndex == ELEMENT_NOT_FOUND) {
            return false;
        }
        this.buckets[bucketIndex] = TOMBSTONE; // Keeps probe sequences through this bucket intact.
        this.tombstones++;
        this.size--;
        return true;
    }

    /**
     * Returns true if this set contains the specified element.
     *
     * @param element the element whose presence in this set is to be tested
     * @return true if this set contains the specified element
     * @throws NullPointerException if element is null
     */
    public boolean contains(T element) {
        Objects.requireNonNull(element, "element must not be null");
        return this.search(element) != ELEMENT_NOT_FOUND;
    }

    /**
     * Returns true if this HashSet is empty (cardinality is zero), false otherwise.
     *
     * @return true if this HashSet is empty, false otherwise.
     */
    public boolean isEmpty() {
        return this.size == 0;
    }

    /**
     * Returns the number of elements present in this HashSet (its cardinality).
     *
     * @return the number of elements present in this HashSet.
     */
    public int size() {
        return this.size;
    }

    /**
     * Returns the list representation of this HashSet. Elements appear in bucket order, which is not
     * related to insertion order.
     *
     * @return a list containing every element of this HashSet.
     */
    public List<T> toList() {
        // Safe cast: every bucket that is neither null nor TOMBSTONE was stored through add(T).
        @SuppressWarnings("unchecked")
        List<T> elements = Arrays.stream(this.buckets)
                .filter(HashSet::isLiveElement)
                .map(stored -> (T) stored)
                .collect(Collectors.toList());
        return elements;
    }

    /**
     * Returns the number of buckets of this HashSet.
     *
     * @return the number of buckets of this HashSet.
     */
    public int capacity() {
        return this.buckets.length;
    }

    /**
     * Maps the given element to its home bucket index.
     *
     * <p>The sign bit of the element's hash code is cleared with a bitwise AND against 0x7FFFFFFF so the
     * value is non-negative, and the result is reduced modulo the number of buckets.
     *
     * @param element the element to be hashed; must not be null
     * @return the index of the bucket at which probing for the element starts
     */
    private int hashFunction(Object element) {
        return (element.hashCode() & 0x7FFFFFFF) % this.buckets.length;
    }

    /**
     * Stores the given element in the first free (null or tombstone) bucket of its probe sequence and
     * updates the element and tombstone counters. The caller must ensure the element is not already
     * present and that the table has a free bucket.
     *
     * @param element the element to store; must not be null
     * @throws IllegalStateException if every bucket holds a live element
     */
    private void insert(Object element) {
        int bucketIndex = this.hashFunction(element);
        for (int probes = 0; probes < this.capacity(); probes++) {
            Object stored = this.buckets[bucketIndex];
            if (stored == null || stored == TOMBSTONE) {
                if (stored == TOMBSTONE) {
                    this.tombstones--; // The tombstone is being overwritten by a live element.
                }
                this.buckets[bucketIndex] = element;
                this.size++;
                return;
            }
            bucketIndex = (bucketIndex + 1) % this.capacity();
        }
        // Callers keep the table below the load factor, so a free bucket always exists.
        throw new IllegalStateException("No free bucket available; load factor invariant violated");
    }

    /**
     * Given an element, returns the index of the bucket containing it.
     *
     * <p>Probing stops at the first null bucket: removed elements leave a tombstone behind, so a null
     * bucket can never sit between an element's home bucket and the bucket it is stored in.
     *
     * @param element the element to look for; must not be null
     * @return the index of the bucket containing the element, or ELEMENT_NOT_FOUND (-1) if absent
     */
    private int search(T element) {
        int hashCode = element.hashCode();
        int bucketIndex = this.hashFunction(element);
        for (int probes = 0; probes < this.capacity(); probes++) {
            Object stored = this.buckets[bucketIndex];
            if (stored == null) {
                return ELEMENT_NOT_FOUND;
            }
            // Equality is defined by both Object::equals and Object::hashCode; tombstones never match.
            if (stored != TOMBSTONE && stored.equals(element) && stored.hashCode() == hashCode) {
                return bucketIndex;
            }
            bucketIndex = (bucketIndex + 1) % this.capacity();
        }
        return ELEMENT_NOT_FOUND; // Visited every bucket without finding the element.
    }

    /**
     * Returns true if the given bucket content is an actual element, i.e. neither null (never used)
     * nor the tombstone marker (removed).
     *
     * @param stored the content of a bucket
     * @return true if the bucket holds a live element
     */
    private static boolean isLiveElement(Object stored) {
        return stored != null && stored != TOMBSTONE;
    }

    /**
     * Replaces the bucket array with a new one of the given capacity and re-inserts every live element.
     * Tombstones are dropped in the process. Takes time linear in the old capacity.
     *
     * @param newCapacity the number of buckets of the new table; must be able to hold all current elements
     */
    private void resize(int newCapacity) {
        Object[] oldBuckets = this.buckets;
        this.buckets = new Object[newCapacity];
        this.size = 0;
        this.tombstones = 0;

        // The old table holds distinct elements, so they can be inserted without duplicate checks.
        for (Object stored : oldBuckets) {
            if (isLiveElement(stored)) {
                this.insert(stored);
            }
        }
    }

    /**
     * Returns true if the number of elements has reached the load factor threshold. The load factor is the
     * ratio of the number of elements present in this set (cardinality) to the number of buckets
     * (capacity), n/m.
     *
     * @return true if the load factor threshold has been reached, false otherwise.
     */
    private boolean isLoadFactorExceeded() {
        return this.size >= this.capacity() * LOAD_FACTOR;
    }
}

0 commit comments

Comments
 (0)