Skip to content

Commit d317169

Browse files
author
Russell
committed
Applied consistent spacing to for, while, and if statements.
1 parent 597f069 commit d317169

File tree

4 files changed

+160
-160
lines changed

4 files changed

+160
-160
lines changed

src/cooccur.c

Lines changed: 62 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ char *vocab_file, *file_head;
6262

6363
/* Efficient string comparison */
6464
int scmp( char *s1, char *s2 ) {
65-
while(*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
65+
while (*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
6666
return(*s1 - *s2);
6767
}
6868

@@ -73,7 +73,7 @@ unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
7373
char c;
7474
unsigned int h;
7575
h = seed;
76-
for(; (c =* word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2));
76+
for (; (c =* word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2));
7777
return((unsigned int)((h&0x7fffffff) % tsize));
7878
}
7979

@@ -82,16 +82,16 @@ HASHREC ** inithashtable() {
8282
int i;
8383
HASHREC **ht;
8484
ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );
85-
for(i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;
85+
for (i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;
8686
return(ht);
8787
}
8888

8989
/* Search hash table for given string, return record if found, else NULL */
9090
HASHREC *hashsearch(HASHREC **ht, char *w) {
9191
HASHREC *htmp, *hprv;
9292
unsigned int hval = HASHFN(w, TSIZE, SEED);
93-
for(hprv = NULL, htmp=ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
94-
if( htmp != NULL && hprv!=NULL ) { // move to front on access
93+
for (hprv = NULL, htmp=ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
94+
if ( htmp != NULL && hprv!=NULL ) { // move to front on access
9595
hprv->next = htmp->next;
9696
htmp->next = ht[hval];
9797
ht[hval] = htmp;
@@ -103,14 +103,14 @@ HASHREC *hashsearch(HASHREC **ht, char *w) {
103103
void hashinsert(HASHREC **ht, char *w, long long id) {
104104
HASHREC *htmp, *hprv;
105105
unsigned int hval = HASHFN(w, TSIZE, SEED);
106-
for(hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
107-
if(htmp == NULL) {
106+
for (hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
107+
if (htmp == NULL) {
108108
htmp = (HASHREC *) malloc(sizeof(HASHREC));
109109
htmp->word = (char *) malloc(strlen(w) + 1);
110110
strcpy(htmp->word, w);
111111
htmp->id = id;
112112
htmp->next = NULL;
113-
if(hprv == NULL) ht[hval] = htmp;
113+
if (hprv == NULL) ht[hval] = htmp;
114114
else hprv->next = htmp;
115115
}
116116
else fprintf(stderr, "Error, duplicate entry located: %s.\n",htmp->word);
@@ -120,19 +120,19 @@ void hashinsert(HASHREC **ht, char *w, long long id) {
120120
/* Read word from input stream */
121121
int get_word(char *word, FILE *fin) {
122122
int i = 0, ch;
123-
while(!feof(fin)) {
123+
while (!feof(fin)) {
124124
ch = fgetc(fin);
125-
if(ch == 13) continue;
126-
if((ch == ' ') || (ch == '\t') || (ch == '\n')) {
127-
if(i > 0) {
125+
if (ch == 13) continue;
126+
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
127+
if (i > 0) {
128128
if (ch == '\n') ungetc(ch, fin);
129129
break;
130130
}
131131
if (ch == '\n') return 1;
132132
else continue;
133133
}
134134
word[i++] = ch;
135-
if(i >= MAX_STRING_LENGTH - 1) i--; // truncate words that exceed max length
135+
if (i >= MAX_STRING_LENGTH - 1) i--; // truncate words that exceed max length
136136
}
137137
word[i] = 0;
138138
return 0;
@@ -143,8 +143,8 @@ int write_chunk(CREC *cr, long long length, FILE *fout) {
143143
long long a = 0;
144144
CREC old = cr[a];
145145

146-
for(a = 1; a < length; a++) {
147-
if(cr[a].word1 == old.word1 && cr[a].word2 == old.word2) {
146+
for (a = 1; a < length; a++) {
147+
if (cr[a].word1 == old.word1 && cr[a].word2 == old.word2) {
148148
old.val += cr[a].val;
149149
continue;
150150
}
@@ -158,15 +158,15 @@ int write_chunk(CREC *cr, long long length, FILE *fout) {
158158
/* Check if two cooccurrence records are for the same two words, used for qsort */
159159
int compare_crec(const void *a, const void *b) {
160160
int c;
161-
if( (c = ((CREC *) a)->word1 - ((CREC *) b)->word1) != 0) return c;
161+
if ( (c = ((CREC *) a)->word1 - ((CREC *) b)->word1) != 0) return c;
162162
else return (((CREC *) a)->word2 - ((CREC *) b)->word2);
163163

164164
}
165165

166166
/* Check if two cooccurrence records are for the same two words */
167167
int compare_crecid(CRECID a, CRECID b) {
168168
int c;
169-
if( (c = a.word1 - b.word1) != 0) return c;
169+
if ( (c = a.word1 - b.word1) != 0) return c;
170170
else return a.word2 - b.word2;
171171
}
172172

@@ -181,8 +181,8 @@ void swap_entry(CRECID *pq, int i, int j) {
181181
void insert(CRECID *pq, CRECID new, int size) {
182182
int j = size - 1, p;
183183
pq[j] = new;
184-
while( (p=(j-1)/2) >= 0 ) {
185-
if(compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); j = p;}
184+
while ( (p=(j-1)/2) >= 0 ) {
185+
if (compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); j = p;}
186186
else break;
187187
}
188188
}
@@ -191,18 +191,18 @@ void insert(CRECID *pq, CRECID new, int size) {
191191
void delete(CRECID *pq, int size) {
192192
int j, p = 0;
193193
pq[p] = pq[size - 1];
194-
while( (j = 2*p+1) < size - 1 ) {
195-
if(j == size - 2) {
196-
if(compare_crecid(pq[p],pq[j]) > 0) swap_entry(pq,p,j);
194+
while ( (j = 2*p+1) < size - 1 ) {
195+
if (j == size - 2) {
196+
if (compare_crecid(pq[p],pq[j]) > 0) swap_entry(pq,p,j);
197197
return;
198198
}
199199
else {
200-
if(compare_crecid(pq[j], pq[j+1]) < 0) {
201-
if(compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); p = j;}
200+
if (compare_crecid(pq[j], pq[j+1]) < 0) {
201+
if (compare_crecid(pq[p],pq[j]) > 0) {swap_entry(pq,p,j); p = j;}
202202
else return;
203203
}
204204
else {
205-
if(compare_crecid(pq[p],pq[j+1]) > 0) {swap_entry(pq,p,j+1); p = j + 1;}
205+
if (compare_crecid(pq[p],pq[j+1]) > 0) {swap_entry(pq,p,j+1); p = j + 1;}
206206
else return;
207207
}
208208
}
@@ -211,7 +211,7 @@ void delete(CRECID *pq, int size) {
211211

212212
/* Write top node of priority queue to file, accumulating duplicate entries */
213213
int merge_write(CRECID new, CRECID *old, FILE *fout) {
214-
if(new.word1 == old->word1 && new.word2 == old->word2) {
214+
if (new.word1 == old->word1 && new.word2 == old->word2) {
215215
old->val += new.val;
216216
return 0; // Indicates duplicate entry
217217
}
@@ -230,13 +230,13 @@ int merge_files(int num) {
230230
fid = malloc(sizeof(FILE) * num);
231231
pq = malloc(sizeof(CRECID) * num);
232232
fout = stdout;
233-
if(verbose > 1) fprintf(stderr, "Merging cooccurrence files: processed 0 lines.");
233+
if (verbose > 1) fprintf(stderr, "Merging cooccurrence files: processed 0 lines.");
234234

235235
/* Open all files and add first entry of each to priority queue */
236-
for(i = 0; i < num; i++) {
236+
for (i = 0; i < num; i++) {
237237
sprintf(filename,"%s_%04d.bin",file_head,i);
238238
fid[i] = fopen(filename,"rb");
239-
if(fid[i] == NULL) {fprintf(stderr, "Unable to open file %s.\n",filename); return 1;}
239+
if (fid[i] == NULL) {fprintf(stderr, "Unable to open file %s.\n",filename); return 1;}
240240
fread(&new, sizeof(CREC), 1, fid[i]);
241241
new.id = i;
242242
insert(pq,new,i+1);
@@ -248,28 +248,28 @@ int merge_files(int num) {
248248
i = pq[0].id;
249249
delete(pq, size);
250250
fread(&new, sizeof(CREC), 1, fid[i]);
251-
if(feof(fid[i])) size--;
251+
if (feof(fid[i])) size--;
252252
else {
253253
new.id = i;
254254
insert(pq, new, size);
255255
}
256256

257257
/* Repeatedly pop top node and fill priority queue until files have reached EOF */
258-
while(size > 0) {
258+
while (size > 0) {
259259
counter += merge_write(pq[0], &old, fout); // Only count the lines written to file, not duplicates
260-
if((counter%100000) == 0) if(verbose > 1) fprintf(stderr,"\033[39G%lld lines.",counter);
260+
if ((counter%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[39G%lld lines.",counter);
261261
i = pq[0].id;
262262
delete(pq, size);
263263
fread(&new, sizeof(CREC), 1, fid[i]);
264-
if(feof(fid[i])) size--;
264+
if (feof(fid[i])) size--;
265265
else {
266266
new.id = i;
267267
insert(pq, new, size);
268268
}
269269
}
270270
fwrite(&old, sizeof(CREC), 1, fout);
271271
fprintf(stderr,"\033[0GMerging cooccurrence files: processed %lld lines.\n",++counter);
272-
for(i=0;i<num;i++) {
272+
for (i=0;i<num;i++) {
273273
sprintf(filename,"%s_%04d.bin",file_head,i);
274274
remove(filename);
275275
}
@@ -289,22 +289,22 @@ int get_cooccurrence() {
289289
history = malloc(sizeof(long long) * window_size);
290290

291291
fprintf(stderr, "COUNTING COOCCURRENCES\n");
292-
if(verbose > 0) {
292+
if (verbose > 0) {
293293
fprintf(stderr, "window size: %d\n", window_size);
294-
if(symmetric == 0) fprintf(stderr, "context: asymmetric\n");
294+
if (symmetric == 0) fprintf(stderr, "context: asymmetric\n");
295295
else fprintf(stderr, "context: symmetric\n");
296296
}
297-
if(verbose > 1) fprintf(stderr, "max product: %lld\n", max_product);
298-
if(verbose > 1) fprintf(stderr, "overflow length: %lld\n", overflow_length);
297+
if (verbose > 1) fprintf(stderr, "max product: %lld\n", max_product);
298+
if (verbose > 1) fprintf(stderr, "overflow length: %lld\n", overflow_length);
299299
sprintf(format,"%%%ds %%lld", MAX_STRING_LENGTH); // Format to read from vocab file, which has (irrelevant) frequency data
300-
if(verbose > 1) fprintf(stderr, "Reading vocab from file \"%s\"...", vocab_file);
300+
if (verbose > 1) fprintf(stderr, "Reading vocab from file \"%s\"...", vocab_file);
301301
fid = fopen(vocab_file,"r");
302-
if(fid == NULL) {fprintf(stderr,"Unable to open vocab file %s.\n",vocab_file); return 1;}
303-
while(fscanf(fid, format, str, &id) != EOF) hashinsert(vocab_hash, str, ++j); // Here id is not used: inserting vocab words into hash table with their frequency rank, j
302+
if (fid == NULL) {fprintf(stderr,"Unable to open vocab file %s.\n",vocab_file); return 1;}
303+
while (fscanf(fid, format, str, &id) != EOF) hashinsert(vocab_hash, str, ++j); // Here id is not used: inserting vocab words into hash table with their frequency rank, j
304304
fclose(fid);
305305
vocab_size = j;
306306
j = 0;
307-
if(verbose > 1) fprintf(stderr, "loaded %lld words.\nBuilding lookup table...", vocab_size);
307+
if (verbose > 1) fprintf(stderr, "loaded %lld words.\nBuilding lookup table...", vocab_size);
308308

309309
/* Build auxiliary lookup table used to index into bigram_table */
310310
lookup = (long long *)calloc( vocab_size + 1, sizeof(long long) );
@@ -313,11 +313,11 @@ int get_cooccurrence() {
313313
return 1;
314314
}
315315
lookup[0] = 1;
316-
for(a = 1; a <= vocab_size; a++) {
317-
if((lookup[a] = max_product / a) < vocab_size) lookup[a] += lookup[a-1];
316+
for (a = 1; a <= vocab_size; a++) {
317+
if ((lookup[a] = max_product / a) < vocab_size) lookup[a] += lookup[a-1];
318318
else lookup[a] = lookup[a-1] + vocab_size;
319319
}
320-
if(verbose > 1) fprintf(stderr, "table contains %lld elements.\n",lookup[a-1]);
320+
if (verbose > 1) fprintf(stderr, "table contains %lld elements.\n",lookup[a-1]);
321321

322322
/* Allocate memory for full array which will store all cooccurrence counts for words whose product of frequency ranks is less than max_product */
323323
bigram_table = (real *)calloc( lookup[a-1] , sizeof(real) );
@@ -330,11 +330,11 @@ int get_cooccurrence() {
330330
sprintf(format,"%%%ds",MAX_STRING_LENGTH);
331331
sprintf(filename,"%s_%04d.bin",file_head, fidcounter);
332332
foverflow = fopen(filename,"w");
333-
if(verbose > 1) fprintf(stderr,"Processing token: 0");
333+
if (verbose > 1) fprintf(stderr,"Processing token: 0");
334334

335335
/* For each token in input stream, calculate a weighted cooccurrence sum within window_size */
336336
while (1) {
337-
if(ind >= overflow_length - window_size) { // If overflow buffer is (almost) full, sort it and write it to temporary file
337+
if (ind >= overflow_length - window_size) { // If overflow buffer is (almost) full, sort it and write it to temporary file
338338
qsort(cr, ind, sizeof(CREC), compare_crec);
339339
write_chunk(cr,ind,foverflow);
340340
fclose(foverflow);
@@ -344,25 +344,25 @@ int get_cooccurrence() {
344344
ind = 0;
345345
}
346346
flag = get_word(str, fid);
347-
if(feof(fid)) break;
348-
if(flag == 1) {j = 0; continue;} // Newline, reset line index (j)
347+
if (feof(fid)) break;
348+
if (flag == 1) {j = 0; continue;} // Newline, reset line index (j)
349349
counter++;
350-
if((counter%100000) == 0) if(verbose > 1) fprintf(stderr,"\033[19G%lld",counter);
350+
if ((counter%100000) == 0) if (verbose > 1) fprintf(stderr,"\033[19G%lld",counter);
351351
htmp = hashsearch(vocab_hash, str);
352352
if (htmp == NULL) continue; // Skip out-of-vocabulary words
353353
w2 = htmp->id; // Target word (frequency rank)
354-
for(k = j - 1; k >= ( (j > window_size) ? j - window_size : 0 ); k--) { // Iterate over all words to the left of target word, but not past beginning of line
354+
for (k = j - 1; k >= ( (j > window_size) ? j - window_size : 0 ); k--) { // Iterate over all words to the left of target word, but not past beginning of line
355355
w1 = history[k % window_size]; // Context word (frequency rank)
356356
if ( w1 < max_product/w2 ) { // Product is small enough to store in a full array
357357
bigram_table[lookup[w1-1] + w2 - 2] += 1.0/((real)(j-k)); // Weight by inverse of distance between words
358-
if(symmetric > 0) bigram_table[lookup[w2-1] + w1 - 2] += 1.0/((real)(j-k)); // If symmetric context is used, exchange roles of w2 and w1 (ie look at right context too)
358+
if (symmetric > 0) bigram_table[lookup[w2-1] + w1 - 2] += 1.0/((real)(j-k)); // If symmetric context is used, exchange roles of w2 and w1 (ie look at right context too)
359359
}
360360
else { // Product is too big, data is likely to be sparse. Store these entries in a temporary buffer to be sorted, merged (accumulated), and written to file when it gets full.
361361
cr[ind].word1 = w1;
362362
cr[ind].word2 = w2;
363363
cr[ind].val = 1.0/((real)(j-k));
364364
ind++; // Keep track of how full temporary buffer is
365-
if(symmetric > 0) { // Symmetric context
365+
if (symmetric > 0) { // Symmetric context
366366
cr[ind].word1 = w2;
367367
cr[ind].word2 = w1;
368368
cr[ind].val = 1.0/((real)(j-k));
@@ -375,27 +375,27 @@ int get_cooccurrence() {
375375
}
376376

377377
/* Write out temp buffer for the final time (it may not be full) */
378-
if(verbose > 1) fprintf(stderr,"\033[0GProcessed %lld tokens.\n",counter);
378+
if (verbose > 1) fprintf(stderr,"\033[0GProcessed %lld tokens.\n",counter);
379379
qsort(cr, ind, sizeof(CREC), compare_crec);
380380
write_chunk(cr,ind,foverflow);
381381
sprintf(filename,"%s_0000.bin",file_head);
382382

383383
/* Write out full bigram_table, skipping zeros */
384-
if(verbose > 1) fprintf(stderr, "Writing cooccurrences to disk");
384+
if (verbose > 1) fprintf(stderr, "Writing cooccurrences to disk");
385385
fid = fopen(filename,"w");
386386
j = 1e6;
387-
for(x = 1; x <= vocab_size; x++) {
388-
if( (long long) (0.75*log(vocab_size / x)) < j) {j = (long long) (0.75*log(vocab_size / x)); if(verbose > 1) fprintf(stderr,".");} // log's to make it look (sort of) pretty
389-
for(y = 1; y <= (lookup[x] - lookup[x-1]); y++) {
390-
if((r = bigram_table[lookup[x-1] - 2 + y]) != 0) {
387+
for (x = 1; x <= vocab_size; x++) {
388+
if ( (long long) (0.75*log(vocab_size / x)) < j) {j = (long long) (0.75*log(vocab_size / x)); if (verbose > 1) fprintf(stderr,".");} // log's to make it look (sort of) pretty
389+
for (y = 1; y <= (lookup[x] - lookup[x-1]); y++) {
390+
if ((r = bigram_table[lookup[x-1] - 2 + y]) != 0) {
391391
fwrite(&x, sizeof(int), 1, fid);
392392
fwrite(&y, sizeof(int), 1, fid);
393393
fwrite(&r, sizeof(real), 1, fid);
394394
}
395395
}
396396
}
397397

398-
if(verbose > 1) fprintf(stderr,"%d files in total.\n",fidcounter + 1);
398+
if (verbose > 1) fprintf(stderr,"%d files in total.\n",fidcounter + 1);
399399
fclose(fid);
400400
fclose(foverflow);
401401
free(cr);
@@ -408,7 +408,7 @@ int get_cooccurrence() {
408408
int find_arg(char *str, int argc, char **argv) {
409409
int i;
410410
for (i = 1; i < argc; i++) {
411-
if(!scmp(str, argv[i])) {
411+
if (!scmp(str, argv[i])) {
412412
if (i == argc - 1) {
413413
printf("No argument given for %s\n", str);
414414
exit(1);
@@ -463,7 +463,7 @@ int main(int argc, char **argv) {
463463
/* The memory_limit determines a limit on the number of elements in bigram_table and the overflow buffer */
464464
/* Estimate the maximum value that max_product can take so that this limit is still satisfied */
465465
rlimit = 0.85 * (real)memory_limit * 1073741824/(sizeof(CREC));
466-
while(fabs(rlimit - n * (log(n) + 0.1544313298)) > 1e-3) n = rlimit / (log(n) + 0.1544313298);
466+
while (fabs(rlimit - n * (log(n) + 0.1544313298)) > 1e-3) n = rlimit / (log(n) + 0.1544313298);
467467
max_product = (long long) n;
468468
overflow_length = (long long) rlimit/6; // 0.85 + 1/6 ~= 1
469469

0 commit comments

Comments
 (0)