Skip to content

Commit c7e9af2

Browse files
committed
Adding dynamic table sizing and refactoring tokenize logic
1 parent 6bb9383 commit c7e9af2

File tree

3 files changed

+217
-126
lines changed

3 files changed

+217
-126
lines changed

includes/hash_table.c

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
#include "hash_table.h"
2+
#include <stdio.h>
3+
#include <stdlib.h>
4+
#include <string.h>
5+
6+
/*
 * Reset every slot of a token table to the empty state.
 *
 * hash_table: descriptor whose `table` member points to `capacity` token_t
 *             slots. A NULL descriptor or a NULL slot array is ignored, so
 *             the function is safe to call after a failed allocation.
 *
 * Every field of each slot is written (including hash_key) so that no part
 * of a freshly malloc'd slot array is left indeterminate.
 */
void init_table(hash_table *hash_table) {
  if (hash_table == NULL || hash_table->table == NULL) {
    return;
  }

  token_t *slots = hash_table->table;
  for (size_t i = 0; i < hash_table->capacity; ++i) {
    slots[i].key = NULL; /* NULL key marks an empty slot */
    slots[i].value = 0;
    slots[i].hash_key = 0;
  }
}
13+
14+
/*
 * djb2 string hash (hash * 33 + byte, seeded with 5381) reduced to a slot
 * index.
 *
 * str:      NUL-terminated string to hash; NULL hashes to 0.
 * capacity: number of slots; when 0 there is no valid index and 0 is returned.
 *
 * Returns a value in [0, capacity) for capacity > 0.
 *
 * Bytes are read as unsigned char so that the result does not depend on
 * whether plain `char` is signed on the target platform.
 */
size_t hash(char *str, size_t capacity) {
  if (str == NULL || capacity == 0) {
    return 0;
  }

  unsigned long digest = 5381;
  for (const unsigned char *p = (const unsigned char *)str; *p != '\0'; ++p) {
    digest = digest * 33 + *p;
  }
  return digest % capacity;
}
26+
27+
void *tokenize(char *input, hash_table *hash_table) {
28+
if (input == NULL)
29+
return NULL;
30+
char *str = strdup(input);
31+
if (!str)
32+
return NULL;
33+
34+
const char *delimiters = " \n\r\t";
35+
char *token;
36+
char *rest = str;
37+
size_t collisions_count = 0;
38+
size_t token_count = 0;
39+
size_t unique_tokens = 0;
40+
41+
printf("\nTokens:\n");
42+
43+
while ((token = strtok_r(rest, delimiters, &rest)) != NULL) {
44+
token_count++;
45+
if (hash_table->size >= hash_table->capacity) {
46+
size_t new_capacity = hash_table->capacity * 2;
47+
if (new_capacity == 0) {
48+
new_capacity = 8;
49+
}
50+
token_t *new_table = malloc(new_capacity * sizeof(token_t));
51+
if (!new_table) {
52+
free(str);
53+
void *analyzer_ptr = malloc(sizeof(token_analysis_t));
54+
token_analysis_t res = {.global_token_count = token_count,
55+
.collisions_count = collisions_count,
56+
.unique_tokens = unique_tokens};
57+
memcpy(analyzer_ptr, &res, sizeof(res));
58+
return analyzer_ptr;
59+
}
60+
for (size_t i = 0; i < new_capacity; i++) {
61+
new_table[i].key = NULL;
62+
new_table[i].value = 0;
63+
}
64+
65+
for (size_t i = 0; i < hash_table->capacity; i++) {
66+
token_t *old_entry = &((token_t*)hash_table->table)[i];
67+
if (old_entry->key != NULL) {
68+
size_t new_key = hash(old_entry->key, new_capacity);
69+
size_t original_new_key = new_key;
70+
while (new_table[new_key].key != NULL) {
71+
new_key = (new_key + 1) % new_capacity;
72+
if (new_key == original_new_key) {
73+
break;
74+
}
75+
}
76+
new_table[new_key].key = old_entry->key;
77+
new_table[new_key].value = old_entry->value;
78+
new_table[new_key].hash_key = new_key;
79+
}
80+
}
81+
free(hash_table->table);
82+
hash_table->table = new_table;
83+
hash_table->capacity = new_capacity;
84+
}
85+
token_t *table = (token_t *)hash_table->table;
86+
size_t key = hash(token, hash_table->capacity);
87+
size_t original_key = key;
88+
while (table[key].key != NULL && strcmp(table[key].key, token) != 0) {
89+
key = (key + 1) % hash_table->capacity;
90+
if (key == original_key) {
91+
break;
92+
}
93+
}
94+
if (table[key].key == NULL) {
95+
table[key].key = strdup(token);
96+
table[key].value = 1;
97+
table[key].hash_key = key;
98+
hash_table->size += 1;
99+
unique_tokens++;
100+
}
101+
if (key != original_key) {
102+
collisions_count++;
103+
} else {
104+
table[key].value += 1;
105+
}
106+
}
107+
free(str);
108+
void *analyzer_ptr = malloc(sizeof(token_analysis_t));
109+
token_analysis_t res = {.global_token_count = token_count,
110+
.collisions_count = collisions_count,
111+
.unique_tokens = unique_tokens};
112+
memcpy(analyzer_ptr, &res, sizeof(res));
113+
return analyzer_ptr;
114+
}
115+
116+
/*
 * Release the key strings held by a slot array.
 *
 * table:    array of `capacity` slots; NULL is accepted and ignored.
 * capacity: number of slots in `table`.
 *
 * Only the keys are freed and reset to NULL; the slot array itself remains
 * owned by the caller.
 */
void free_table(token_t *table, size_t capacity) {
  if (table == NULL) {
    return;
  }
  for (size_t i = 0; i < capacity; i++) {
    free(table[i].key); /* free(NULL) is a no-op, so empty slots are fine */
    table[i].key = NULL;
  }
}
124+
125+
/*
 * Baseline token counter: split `input` on whitespace (" \n\r\t") and count
 * each token by scanning the slot array from index 0 until the token or the
 * first empty slot is found.
 *
 * input:      NUL-terminated text; it is duplicated internally and not
 *             modified. NULL is ignored.
 * hash_table: table descriptor; new tokens are appended to the first empty
 *             slot and `size` is incremented. A NULL descriptor or NULL slot
 *             array is ignored.
 *
 * The table is never grown: a new token that finds no empty slot is dropped.
 * Processing stops at the first token that cannot be copied.
 */
void naive(char *input, hash_table *hash_table) {
  if (input == NULL || hash_table == NULL || hash_table->table == NULL) {
    return;
  }
  char *str = strdup(input);
  if (!str) {
    return;
  }

  const char *delimiters = " \n\r\t";
  char *token;
  char *rest = str;
  token_t *naive_table = hash_table->table;

  while ((token = strtok_r(rest, delimiters, &rest)) != NULL) {
    for (size_t i = 0; i < hash_table->capacity; ++i) {
      if (naive_table[i].key == NULL) {
        char *copy = strdup(token);
        if (copy == NULL) {
          /* Out of memory: retrying on later slots cannot succeed. */
          free(str);
          return;
        }
        naive_table[i].key = copy;
        naive_table[i].value = 1;
        naive_table[i].hash_key = i;
        hash_table->size++;
        break;
      }
      if (strcmp(naive_table[i].key, token) == 0) {
        naive_table[i].value += 1;
        break;
      }
    }
  }
  free(str);
}

includes/hash_table.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#pragma once
2+
#include <stdlib.h>
3+
4+
#define TABLE_SIZE 100000
5+
6+
/* One slot of a token table: a token string and its occurrence count. */
typedef struct {
7+
/* Heap-allocated copy of the token; NULL marks an empty slot. */
char *key;
8+
/* Number of occurrences counted for this token. */
size_t value;
9+
/* Index of the slot in which tokenize() stored the entry. */
size_t hash_key;
10+
} token_t;
11+
12+
/* Statistics gathered by one tokenize() run. */
typedef struct {
13+
/* Token occurrences that ended up in a slot other than their hash slot. */
size_t collisions_count;
14+
/* Total number of tokens read from the input. */
size_t global_token_count;
15+
/* Number of distinct tokens inserted during the run. */
size_t unique_tokens;
16+
} token_analysis_t;
17+
18+
/* Descriptor of a token table whose slot array may be replaced as it grows. */
typedef struct {
19+
/* Slot array; the implementation accesses it as token_t *. */
void *table;
20+
/* Number of occupied slots (incremented on each insertion). */
size_t size;
21+
/* Number of slots in the array pointed to by `table`. */
size_t capacity;
22+
} hash_table;
23+
24+
/* Reset all `capacity` slots of the table to the empty state (NULL key,
 * zero count). */
void init_table(hash_table *hash_table);
25+
/* djb2 hash of `str` reduced modulo `capacity`; returns 0 when `capacity`
 * is 0. */
size_t hash(char *str, size_t capacity);
26+
/* Split `input` on whitespace and count the tokens in `table`, growing it
 * when full. Returns a malloc'd token_analysis_t (as void *) that the caller
 * frees; may return NULL (e.g. for NULL input or when memory cannot be
 * allocated). */
void *tokenize(char *input, hash_table *table);
27+
/* Free the key string of every slot; the slot array itself is not freed. */
void free_table(token_t *table, size_t capacity);
28+
/* Count whitespace-separated tokens of `input` by scanning the slot array
 * linearly from index 0; the table is not grown. */
void naive(char *input, hash_table* hash_table);

main.c

Lines changed: 34 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,11 @@
1+
#include "includes/hash_table.c"
12
#include <stddef.h>
23
#include <stdint.h>
34
#include <stdio.h>
45
#include <stdlib.h>
56
#include <string.h>
67
#include <time.h>
78

8-
typedef struct {
9-
char *key;
10-
size_t value;
11-
size_t hash_key;
12-
} token_t;
13-
14-
typedef struct {
15-
size_t collisions_count;
16-
size_t global_token_count;
17-
size_t unique_tokens;
18-
} token_analysis_t;
19-
20-
#define TABLE_SIZE 100000
21-
22-
token_t table[TABLE_SIZE];
23-
token_t naive_table[TABLE_SIZE];
24-
25-
void init_table(token_t *table) {
26-
for (size_t i = 0; i < TABLE_SIZE; ++i) {
27-
table[i].key = NULL;
28-
table[i].value = 0;
29-
}
30-
}
31-
32-
size_t hash(char *str) {
33-
unsigned long hash = 5381;
34-
int c;
35-
36-
while ((c = *str++)) {
37-
hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
38-
}
39-
40-
return hash % TABLE_SIZE;
41-
}
42-
439
char *read_file(char *path, char *mode) {
4410
FILE *fptr;
4511
fptr = fopen(path, mode);
@@ -66,81 +32,14 @@ char *read_file(char *path, char *mode) {
6632
return content;
6733
}
6834

69-
void naive(char *input) {
70-
if (input == NULL)
71-
return;
72-
char *str = strdup(input);
73-
const char *delimiters = " \n\r\t";
74-
char *token;
75-
char *rest = str;
76-
if (!str)
77-
return;
78-
while ((token = strtok_r(rest, delimiters, &rest)) != NULL) {
79-
for (size_t i = 0; i < TABLE_SIZE; ++i) {
80-
if (naive_table[i].key == NULL ||
81-
strcmp(naive_table[i].key, token) == 0) {
82-
naive_table[i].key = strdup(token);
83-
naive_table[i].value += 1;
84-
break;
85-
}
86-
}
87-
}
88-
free(str);
89-
}
90-
91-
void *tokenize(char *input) {
92-
if (input == NULL)
93-
return NULL;
94-
char *str = strdup(input);
95-
if (!str)
96-
return NULL;
97-
98-
const char *delimiters = " \n\r\t";
99-
char *token;
100-
char *rest = str;
101-
size_t collisions_count = 0;
102-
size_t token_count = 0;
103-
size_t unique_tokens = 0;
104-
105-
printf("\nTokens:\n");
106-
107-
while ((token = strtok_r(rest, delimiters, &rest)) != NULL) {
108-
token_count++;
109-
size_t key = hash(token);
110-
111-
if (table[key].key == NULL) {
112-
table[key].key = strdup(token);
113-
table[key].value = 1;
114-
table[key].hash_key = key;
115-
unique_tokens++;
116-
} else if (table[key].key != NULL && strcmp(table[key].key, token) == 0) {
117-
table[key].value += 1;
118-
} else {
119-
key = (key + 1) % TABLE_SIZE;
120-
table[key].key = strdup(token);
121-
table[key].value = 1;
122-
table[key].hash_key = key;
123-
collisions_count++;
124-
}
125-
}
126-
127-
free(str);
128-
void *analyzer_ptr = malloc(sizeof(token_analysis_t));
129-
token_analysis_t res = {.global_token_count = token_count,
130-
.collisions_count = collisions_count,
131-
.unique_tokens = unique_tokens};
132-
memcpy(analyzer_ptr, &res, sizeof(res));
133-
return analyzer_ptr;
134-
}
135-
13635
int comp(const void *elem1, const void *elem2) {
13736
int f = ((token_t *)elem1)->value;
13837
int s = ((token_t *)elem2)->value;
13938
return s - f;
14039
}
14140

142-
void print_table(size_t top, token_t *table) {
143-
qsort(table, TABLE_SIZE, sizeof(token_t), comp);
41+
void print_table(size_t top, token_t *table, size_t table_size) {
42+
qsort(table, table_size, sizeof(token_t), comp);
14443
printf("Top %zu entries:\n", top);
14544
for (size_t i = 0; i < top; ++i) {
14645
if (table[i].key != NULL) {
@@ -150,32 +49,24 @@ void print_table(size_t top, token_t *table) {
15049
}
15150
}
15251

153-
void free_table() {
154-
for (size_t i = 0; i < TABLE_SIZE; i++) {
155-
if (table[i].key != NULL) {
156-
free(table[i].key);
157-
table[i].key = NULL;
158-
}
159-
}
160-
}
161-
162-
void test_hash(char *content) {
52+
void test_hash(char *content, hash_table *hash_table) {
16353
clock_t start = clock();
164-
token_analysis_t *collisions = (token_analysis_t *)tokenize(content);
54+
token_analysis_t *collisions =
55+
(token_analysis_t *)tokenize(content, hash_table);
16556

166-
print_table(10, table);
57+
print_table(10, (token_t *)hash_table->table, hash_table->capacity);
16758
clock_t end = clock();
16859
printf("\nCollisions: %zu\nTotal tokens parsed: %zu, Unique tokens: %zu",
16960
collisions->collisions_count, collisions->global_token_count,
17061
collisions->unique_tokens);
171-
free(collisions);
17262
printf("\nTime elapsed: %f", (float)(end - start) / CLOCKS_PER_SEC);
63+
free(collisions);
17364
}
17465

175-
void test_naive(char *content) {
66+
void test_naive(char *content, hash_table *naive_table) {
17667
clock_t start = clock();
177-
naive(content);
178-
print_table(10, naive_table);
68+
naive(content, naive_table);
69+
print_table(10, (token_t *)naive_table->table, naive_table->capacity);
17970
clock_t end = clock();
18071
printf("\nTime elapsed: %f", (float)(end - start) / CLOCKS_PER_SEC);
18172
}
@@ -186,11 +77,28 @@ int main(int argc, char **argv) {
18677
return 1;
18778
if (argc < 2)
18879
return 1;
189-
init_table(table);
190-
init_table(naive_table);
191-
192-
test_naive(content);
193-
test_hash(content);
80+
void *table = malloc(TABLE_SIZE * sizeof(token_t));
81+
void *naive_table = malloc(TABLE_SIZE * sizeof(token_t));
82+
hash_table hash_table_impl = {
83+
.table = table,
84+
.capacity = TABLE_SIZE,
85+
.size = 0,
86+
};
87+
88+
// hash_table naive_hash_table = {
89+
// .table = naive_table,
90+
// .capacity = TABLE_SIZE,
91+
// .size = 0,
92+
// };
93+
94+
init_table(&hash_table_impl);
95+
// init_table(&naive_hash_table);
96+
97+
// test_naive(content, &naive_hash_table);
98+
test_hash(content, &hash_table_impl);
19499
free(content);
195-
free_table();
100+
free_table((token_t *)hash_table_impl.table, hash_table_impl.capacity);
101+
// free_table((token_t *)naive_hash_table.table, naive_hash_table.capacity);
102+
free(hash_table_impl.table);
103+
// free(naive_hash_table.table);
196104
}

0 commit comments

Comments
 (0)