-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathferrero.c
More file actions
118 lines (112 loc) · 5.06 KB
/
ferrero.c
File metadata and controls
118 lines (112 loc) · 5.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <sys/mman.h>
#define NOUN_PATH "./Wordlist-Nouns-All.txt"
#define NOUN_LENGTH 90960
#define VERB_PATH "./verbs_inf.txt"
#define VERB_LENGTH 6057
#define DICTIONARY_LENGTH (NOUN_LENGTH+VERB_LENGTH)
#define BUFFER_SIZE (64*1024) // Align to page size while still being big enough.
#define HASHMAP_LEN (DICTIONARY_LENGTH*100) // Chosen through ~~science~~ trial and error.
// djb2 hash
// http://www.cse.yorku.ca/~oz/hash.html
size_t hash(char* p, size_t sz) {
unsigned long hash = 5381;
for (int i = 0; i < sz; i++) hash += (hash << 5) + *p++;
return hash;
}
void generate(char* path, char* buf, char* incomplete_word, char* exists, char* type) {
size_t bytes_read = 0;
int fd = open(path, O_RDONLY | O_NONBLOCK); // Non-blocking read
// Tell the kernel we're immediately going to read sequentially with no reuse.
posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL | POSIX_FADV_NOREUSE | POSIX_FADV_WILLNEED);
while((bytes_read = read(fd, buf, BUFFER_SIZE))) {
if (bytes_read == 0) break;
bool first = true;
char* p = buf;
while (true) {
// Find the end of the line.
char* bound = memchr(p, '\n', (buf + bytes_read) - p);
int len = bound - p;
if (bound == 0x0) break;
size_t true_index;
first = false;
strncat(incomplete_word, p, len+1);
size_t word_len = strlen(incomplete_word);
true_index = hash(incomplete_word, word_len) % HASHMAP_LEN;
memset(incomplete_word, 0, 45);
}
else true_index = (uint64_t)hash(p, len) % (uint64_t)HASHMAP_LEN;
// Don't bother writing 0s to represent nouns since the buffer is calloc'd
if (type != 0x0) type[true_index/8] |= 1 << (8-(true_index%8));
// Write a 1 to show it exists
exists[true_index/8] |= 1 << (8-(true_index%8));
p += len+1;
}
// Handle incomplete word at end of buffer by saving it to an array.
if (p != buf + bytes_read) memcpy(incomplete_word, p, buf+bytes_read-p);
}
}
int main() {
// Allocate hashmaps. Everything we care about (word existence, type, validity) is
// representable by a bit, so pack them into ints for a much lower memory cost.
// Then, since they're hashmaps, tell the kernel we're going to access them randomly.
char* exists = calloc(HASHMAP_LEN/8, 1);
madvise(exists, HASHMAP_LEN/8, MADV_RANDOM | MADV_WILLNEED);
char* type = calloc(HASHMAP_LEN/8, 1);
madvise(type, HASHMAP_LEN/8, MADV_RANDOM | MADV_WILLNEED);
char* valid = calloc(HASHMAP_LEN/8, 1);
madvise(valid, HASHMAP_LEN/8, MADV_RANDOM | MADV_WILLNEED);
// Allocate buffers necessary for buffered read of file.
char* buf = malloc(BUFFER_SIZE+1);
char incomplete_word[45] = {0};
generate(VERB_PATH, buf, incomplete_word, exists, type);
generate(NOUN_PATH, buf, incomplete_word, exists, 0x0);
// Iterate over nouns wordlist one more time to check validity. Mostly same I/O logic.
// FIXME Incomplete words not handled here.
int fd = open(NOUN_PATH, O_RDONLY | O_NONBLOCK);
posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL | POSIX_FADV_WILLNEED | POSIX_FADV_NOREUSE);
size_t bytes_read = 0;
while((bytes_read = read(fd, buf, BUFFER_SIZE))) {
bool nasty_error = false;
if (!bytes_read) break;
char *p = buf;
while (true) {
char* bound = memchr(p, '\n', (buf + bytes_read) - p);
int len = bound - p;
if (bound == 0x0) break;
else if (len <= 0) {
nasty_error = true;
break;
}
// Check all possible splits
for (size_t i = 1; i < len; i++) {
size_t pre_index = hash(p, i) % HASHMAP_LEN;
size_t post_index = hash(p+i, len-i) % HASHMAP_LEN;
// Check that both subwords exist and are different types
if ((exists[pre_index/8] & (1 << (8-(pre_index%8)))) == (1 << (8-(pre_index%8))) &&
(exists[post_index/8] & (1 << (8-(post_index%8)))) == (1 << (8-(post_index%8))) &&
(type[pre_index/8] & (1 << (8-(pre_index%8)))) != (type[post_index/8] & (1 << (8-(post_index%8))))) {
// Print combination out and update validity hashmap.
size_t true_index = hash(p, len) % HASHMAP_LEN;
for (int j = 0; j < i; j++) {
printf("%c", *(p+j));
}
printf(" ");
for (int j = 0; j < len-i; j++) {
printf("%c", *(p+i+j));
}
printf("\n");
valid[true_index/8] |= 1 << (8-(true_index%8));
}
}
p += len+1;
}
if (p != buf + bytes_read && !nasty_error) memcpy(incomplete_word, p, buf+bytes_read-p);
nasty_error = false;
}
}