@@ -62,7 +62,7 @@ char *vocab_file, *file_head;
6262
/*
 * Efficient string comparison.
 * Walks both strings in lockstep until s1 ends or the characters differ,
 * then returns the difference of the two characters at that position:
 * zero when the strings are equal, non-zero otherwise.
 */
int scmp(char *s1, char *s2) {
    for (; *s1 != '\0'; s1++, s2++) {
        if (*s1 != *s2) break;   /* first mismatch (also hit when s2 ends first) */
    }
    return *s1 - *s2;
}
6868
@@ -73,7 +73,7 @@ unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
7373 char c ;
7474 unsigned int h ;
7575 h = seed ;
76- for (; (c = * word ) != '\0' ; word ++ ) h ^= ((h << 5 ) + c + (h >> 2 ));
76+ for (; (c = * word ) != '\0' ; word ++ ) h ^= ((h << 5 ) + c + (h >> 2 ));
7777 return ((unsigned int )((h & 0x7fffffff ) % tsize ));
7878}
7979
@@ -82,16 +82,16 @@ HASHREC ** inithashtable() {
8282 int i ;
8383 HASHREC * * ht ;
8484 ht = (HASHREC * * ) malloc ( sizeof (HASHREC * ) * TSIZE );
85- for (i = 0 ; i < TSIZE ; i ++ ) ht [i ] = (HASHREC * ) NULL ;
85+ for (i = 0 ; i < TSIZE ; i ++ ) ht [i ] = (HASHREC * ) NULL ;
8686 return (ht );
8787}
8888
8989/* Search hash table for given string, return record if found, else NULL */
9090HASHREC * hashsearch (HASHREC * * ht , char * w ) {
9191 HASHREC * htmp , * hprv ;
9292 unsigned int hval = HASHFN (w , TSIZE , SEED );
93- for (hprv = NULL , htmp = ht [hval ]; htmp != NULL && scmp (htmp -> word , w ) != 0 ; hprv = htmp , htmp = htmp -> next );
94- if ( htmp != NULL && hprv != NULL ) { // move to front on access
93+ for (hprv = NULL , htmp = ht [hval ]; htmp != NULL && scmp (htmp -> word , w ) != 0 ; hprv = htmp , htmp = htmp -> next );
94+ if ( htmp != NULL && hprv != NULL ) { // move to front on access
9595 hprv -> next = htmp -> next ;
9696 htmp -> next = ht [hval ];
9797 ht [hval ] = htmp ;
@@ -103,14 +103,14 @@ HASHREC *hashsearch(HASHREC **ht, char *w) {
103103void hashinsert (HASHREC * * ht , char * w , long long id ) {
104104 HASHREC * htmp , * hprv ;
105105 unsigned int hval = HASHFN (w , TSIZE , SEED );
106- for (hprv = NULL , htmp = ht [hval ]; htmp != NULL && scmp (htmp -> word , w ) != 0 ; hprv = htmp , htmp = htmp -> next );
107- if (htmp == NULL ) {
106+ for (hprv = NULL , htmp = ht [hval ]; htmp != NULL && scmp (htmp -> word , w ) != 0 ; hprv = htmp , htmp = htmp -> next );
107+ if (htmp == NULL ) {
108108 htmp = (HASHREC * ) malloc (sizeof (HASHREC ));
109109 htmp -> word = (char * ) malloc (strlen (w ) + 1 );
110110 strcpy (htmp -> word , w );
111111 htmp -> id = id ;
112112 htmp -> next = NULL ;
113- if (hprv == NULL ) ht [hval ] = htmp ;
113+ if (hprv == NULL ) ht [hval ] = htmp ;
114114 else hprv -> next = htmp ;
115115 }
116116 else fprintf (stderr , "Error, duplicate entry located: %s.\n" ,htmp -> word );
@@ -120,19 +120,19 @@ void hashinsert(HASHREC **ht, char *w, long long id) {
120120/* Read word from input stream */
121121int get_word (char * word , FILE * fin ) {
122122 int i = 0 , ch ;
123- while (!feof (fin )) {
123+ while (!feof (fin )) {
124124 ch = fgetc (fin );
125- if (ch == 13 ) continue ;
126- if ((ch == ' ' ) || (ch == '\t' ) || (ch == '\n' )) {
127- if (i > 0 ) {
125+ if (ch == 13 ) continue ;
126+ if ((ch == ' ' ) || (ch == '\t' ) || (ch == '\n' )) {
127+ if (i > 0 ) {
128128 if (ch == '\n' ) ungetc (ch , fin );
129129 break ;
130130 }
131131 if (ch == '\n' ) return 1 ;
132132 else continue ;
133133 }
134134 word [i ++ ] = ch ;
135- if (i >= MAX_STRING_LENGTH - 1 ) i -- ; // truncate words that exceed max length
135+ if (i >= MAX_STRING_LENGTH - 1 ) i -- ; // truncate words that exceed max length
136136 }
137137 word [i ] = 0 ;
138138 return 0 ;
@@ -143,8 +143,8 @@ int write_chunk(CREC *cr, long long length, FILE *fout) {
143143 long long a = 0 ;
144144 CREC old = cr [a ];
145145
146- for (a = 1 ; a < length ; a ++ ) {
147- if (cr [a ].word1 == old .word1 && cr [a ].word2 == old .word2 ) {
146+ for (a = 1 ; a < length ; a ++ ) {
147+ if (cr [a ].word1 == old .word1 && cr [a ].word2 == old .word2 ) {
148148 old .val += cr [a ].val ;
149149 continue ;
150150 }
@@ -158,15 +158,15 @@ int write_chunk(CREC *cr, long long length, FILE *fout) {
158158/* Check if two cooccurrence records are for the same two words, used for qsort */
159159int compare_crec (const void * a , const void * b ) {
160160 int c ;
161- if ( (c = ((CREC * ) a )-> word1 - ((CREC * ) b )-> word1 ) != 0 ) return c ;
161+ if ( (c = ((CREC * ) a )-> word1 - ((CREC * ) b )-> word1 ) != 0 ) return c ;
162162 else return (((CREC * ) a )-> word2 - ((CREC * ) b )-> word2 );
163163
164164}
165165
166166/* Check if two cooccurrence records are for the same two words */
167167int compare_crecid (CRECID a , CRECID b ) {
168168 int c ;
169- if ( (c = a .word1 - b .word1 ) != 0 ) return c ;
169+ if ( (c = a .word1 - b .word1 ) != 0 ) return c ;
170170 else return a .word2 - b .word2 ;
171171}
172172
@@ -181,8 +181,8 @@ void swap_entry(CRECID *pq, int i, int j) {
181181void insert (CRECID * pq , CRECID new , int size ) {
182182 int j = size - 1 , p ;
183183 pq [j ] = new ;
184- while ( (p = (j - 1 )/2 ) >= 0 ) {
185- if (compare_crecid (pq [p ],pq [j ]) > 0 ) {swap_entry (pq ,p ,j ); j = p ;}
184+ while ( (p = (j - 1 )/2 ) >= 0 ) {
185+ if (compare_crecid (pq [p ],pq [j ]) > 0 ) {swap_entry (pq ,p ,j ); j = p ;}
186186 else break ;
187187 }
188188}
@@ -191,18 +191,18 @@ void insert(CRECID *pq, CRECID new, int size) {
191191void delete (CRECID * pq , int size ) {
192192 int j , p = 0 ;
193193 pq [p ] = pq [size - 1 ];
194- while ( (j = 2 * p + 1 ) < size - 1 ) {
195- if (j == size - 2 ) {
196- if (compare_crecid (pq [p ],pq [j ]) > 0 ) swap_entry (pq ,p ,j );
194+ while ( (j = 2 * p + 1 ) < size - 1 ) {
195+ if (j == size - 2 ) {
196+ if (compare_crecid (pq [p ],pq [j ]) > 0 ) swap_entry (pq ,p ,j );
197197 return ;
198198 }
199199 else {
200- if (compare_crecid (pq [j ], pq [j + 1 ]) < 0 ) {
201- if (compare_crecid (pq [p ],pq [j ]) > 0 ) {swap_entry (pq ,p ,j ); p = j ;}
200+ if (compare_crecid (pq [j ], pq [j + 1 ]) < 0 ) {
201+ if (compare_crecid (pq [p ],pq [j ]) > 0 ) {swap_entry (pq ,p ,j ); p = j ;}
202202 else return ;
203203 }
204204 else {
205- if (compare_crecid (pq [p ],pq [j + 1 ]) > 0 ) {swap_entry (pq ,p ,j + 1 ); p = j + 1 ;}
205+ if (compare_crecid (pq [p ],pq [j + 1 ]) > 0 ) {swap_entry (pq ,p ,j + 1 ); p = j + 1 ;}
206206 else return ;
207207 }
208208 }
@@ -211,7 +211,7 @@ void delete(CRECID *pq, int size) {
211211
212212/* Write top node of priority queue to file, accumulating duplicate entries */
213213int merge_write (CRECID new , CRECID * old , FILE * fout ) {
214- if (new .word1 == old -> word1 && new .word2 == old -> word2 ) {
214+ if (new .word1 == old -> word1 && new .word2 == old -> word2 ) {
215215 old -> val += new .val ;
216216 return 0 ; // Indicates duplicate entry
217217 }
@@ -230,13 +230,13 @@ int merge_files(int num) {
230230 fid = malloc (sizeof (FILE ) * num );
231231 pq = malloc (sizeof (CRECID ) * num );
232232 fout = stdout ;
233- if (verbose > 1 ) fprintf (stderr , "Merging cooccurrence files: processed 0 lines." );
233+ if (verbose > 1 ) fprintf (stderr , "Merging cooccurrence files: processed 0 lines." );
234234
235235 /* Open all files and add first entry of each to priority queue */
236- for (i = 0 ; i < num ; i ++ ) {
236+ for (i = 0 ; i < num ; i ++ ) {
237237 sprintf (filename ,"%s_%04d.bin" ,file_head ,i );
238238 fid [i ] = fopen (filename ,"rb" );
239- if (fid [i ] == NULL ) {fprintf (stderr , "Unable to open file %s.\n" ,filename ); return 1 ;}
239+ if (fid [i ] == NULL ) {fprintf (stderr , "Unable to open file %s.\n" ,filename ); return 1 ;}
240240 fread (& new , sizeof (CREC ), 1 , fid [i ]);
241241 new .id = i ;
242242 insert (pq ,new ,i + 1 );
@@ -248,28 +248,28 @@ int merge_files(int num) {
248248 i = pq [0 ].id ;
249249 delete (pq , size );
250250 fread (& new , sizeof (CREC ), 1 , fid [i ]);
251- if (feof (fid [i ])) size -- ;
251+ if (feof (fid [i ])) size -- ;
252252 else {
253253 new .id = i ;
254254 insert (pq , new , size );
255255 }
256256
257257 /* Repeatedly pop top node and fill priority queue until files have reached EOF */
258- while (size > 0 ) {
258+ while (size > 0 ) {
259259 counter += merge_write (pq [0 ], & old , fout ); // Only count the lines written to file, not duplicates
260- if ((counter %100000 ) == 0 ) if (verbose > 1 ) fprintf (stderr ,"\033[39G%lld lines." ,counter );
260+ if ((counter %100000 ) == 0 ) if (verbose > 1 ) fprintf (stderr ,"\033[39G%lld lines." ,counter );
261261 i = pq [0 ].id ;
262262 delete (pq , size );
263263 fread (& new , sizeof (CREC ), 1 , fid [i ]);
264- if (feof (fid [i ])) size -- ;
264+ if (feof (fid [i ])) size -- ;
265265 else {
266266 new .id = i ;
267267 insert (pq , new , size );
268268 }
269269 }
270270 fwrite (& old , sizeof (CREC ), 1 , fout );
271271 fprintf (stderr ,"\033[0GMerging cooccurrence files: processed %lld lines.\n" ,++ counter );
272- for (i = 0 ;i < num ;i ++ ) {
272+ for (i = 0 ;i < num ;i ++ ) {
273273 sprintf (filename ,"%s_%04d.bin" ,file_head ,i );
274274 remove (filename );
275275 }
@@ -289,22 +289,22 @@ int get_cooccurrence() {
289289 history = malloc (sizeof (long long ) * window_size );
290290
291291 fprintf (stderr , "COUNTING COOCCURRENCES\n" );
292- if (verbose > 0 ) {
292+ if (verbose > 0 ) {
293293 fprintf (stderr , "window size: %d\n" , window_size );
294- if (symmetric == 0 ) fprintf (stderr , "context: asymmetric\n" );
294+ if (symmetric == 0 ) fprintf (stderr , "context: asymmetric\n" );
295295 else fprintf (stderr , "context: symmetric\n" );
296296 }
297- if (verbose > 1 ) fprintf (stderr , "max product: %lld\n" , max_product );
298- if (verbose > 1 ) fprintf (stderr , "overflow length: %lld\n" , overflow_length );
297+ if (verbose > 1 ) fprintf (stderr , "max product: %lld\n" , max_product );
298+ if (verbose > 1 ) fprintf (stderr , "overflow length: %lld\n" , overflow_length );
299299 sprintf (format ,"%%%ds %%lld" , MAX_STRING_LENGTH ); // Format to read from vocab file, which has (irrelevant) frequency data
300- if (verbose > 1 ) fprintf (stderr , "Reading vocab from file \"%s\"..." , vocab_file );
300+ if (verbose > 1 ) fprintf (stderr , "Reading vocab from file \"%s\"..." , vocab_file );
301301 fid = fopen (vocab_file ,"r" );
302- if (fid == NULL ) {fprintf (stderr ,"Unable to open vocab file %s.\n" ,vocab_file ); return 1 ;}
303- while (fscanf (fid , format , str , & id ) != EOF ) hashinsert (vocab_hash , str , ++ j ); // Here id is not used: inserting vocab words into hash table with their frequency rank, j
302+ if (fid == NULL ) {fprintf (stderr ,"Unable to open vocab file %s.\n" ,vocab_file ); return 1 ;}
303+ while (fscanf (fid , format , str , & id ) != EOF ) hashinsert (vocab_hash , str , ++ j ); // Here id is not used: inserting vocab words into hash table with their frequency rank, j
304304 fclose (fid );
305305 vocab_size = j ;
306306 j = 0 ;
307- if (verbose > 1 ) fprintf (stderr , "loaded %lld words.\nBuilding lookup table..." , vocab_size );
307+ if (verbose > 1 ) fprintf (stderr , "loaded %lld words.\nBuilding lookup table..." , vocab_size );
308308
309309 /* Build auxiliary lookup table used to index into bigram_table */
310310 lookup = (long long * )calloc ( vocab_size + 1 , sizeof (long long ) );
@@ -313,11 +313,11 @@ int get_cooccurrence() {
313313 return 1 ;
314314 }
315315 lookup [0 ] = 1 ;
316- for (a = 1 ; a <= vocab_size ; a ++ ) {
317- if ((lookup [a ] = max_product / a ) < vocab_size ) lookup [a ] += lookup [a - 1 ];
316+ for (a = 1 ; a <= vocab_size ; a ++ ) {
317+ if ((lookup [a ] = max_product / a ) < vocab_size ) lookup [a ] += lookup [a - 1 ];
318318 else lookup [a ] = lookup [a - 1 ] + vocab_size ;
319319 }
320- if (verbose > 1 ) fprintf (stderr , "table contains %lld elements.\n" ,lookup [a - 1 ]);
320+ if (verbose > 1 ) fprintf (stderr , "table contains %lld elements.\n" ,lookup [a - 1 ]);
321321
322322 /* Allocate memory for full array which will store all cooccurrence counts for words whose product of frequency ranks is less than max_product */
323323 bigram_table = (real * )calloc ( lookup [a - 1 ] , sizeof (real ) );
@@ -330,11 +330,11 @@ int get_cooccurrence() {
330330 sprintf (format ,"%%%ds" ,MAX_STRING_LENGTH );
331331 sprintf (filename ,"%s_%04d.bin" ,file_head , fidcounter );
332332 foverflow = fopen (filename ,"w" );
333- if (verbose > 1 ) fprintf (stderr ,"Processing token: 0" );
333+ if (verbose > 1 ) fprintf (stderr ,"Processing token: 0" );
334334
335335 /* For each token in input stream, calculate a weighted cooccurrence sum within window_size */
336336 while (1 ) {
337- if (ind >= overflow_length - window_size ) { // If overflow buffer is (almost) full, sort it and write it to temporary file
337+ if (ind >= overflow_length - window_size ) { // If overflow buffer is (almost) full, sort it and write it to temporary file
338338 qsort (cr , ind , sizeof (CREC ), compare_crec );
339339 write_chunk (cr ,ind ,foverflow );
340340 fclose (foverflow );
@@ -344,25 +344,25 @@ int get_cooccurrence() {
344344 ind = 0 ;
345345 }
346346 flag = get_word (str , fid );
347- if (feof (fid )) break ;
348- if (flag == 1 ) {j = 0 ; continue ;} // Newline, reset line index (j)
347+ if (feof (fid )) break ;
348+ if (flag == 1 ) {j = 0 ; continue ;} // Newline, reset line index (j)
349349 counter ++ ;
350- if ((counter %100000 ) == 0 ) if (verbose > 1 ) fprintf (stderr ,"\033[19G%lld" ,counter );
350+ if ((counter %100000 ) == 0 ) if (verbose > 1 ) fprintf (stderr ,"\033[19G%lld" ,counter );
351351 htmp = hashsearch (vocab_hash , str );
352352 if (htmp == NULL ) continue ; // Skip out-of-vocabulary words
353353 w2 = htmp -> id ; // Target word (frequency rank)
354- for (k = j - 1 ; k >= ( (j > window_size ) ? j - window_size : 0 ); k -- ) { // Iterate over all words to the left of target word, but not past beginning of line
354+ for (k = j - 1 ; k >= ( (j > window_size ) ? j - window_size : 0 ); k -- ) { // Iterate over all words to the left of target word, but not past beginning of line
355355 w1 = history [k % window_size ]; // Context word (frequency rank)
356356 if ( w1 < max_product /w2 ) { // Product is small enough to store in a full array
357357 bigram_table [lookup [w1 - 1 ] + w2 - 2 ] += 1.0 /((real )(j - k )); // Weight by inverse of distance between words
358- if (symmetric > 0 ) bigram_table [lookup [w2 - 1 ] + w1 - 2 ] += 1.0 /((real )(j - k )); // If symmetric context is used, exchange roles of w2 and w1 (ie look at right context too)
358+ if (symmetric > 0 ) bigram_table [lookup [w2 - 1 ] + w1 - 2 ] += 1.0 /((real )(j - k )); // If symmetric context is used, exchange roles of w2 and w1 (ie look at right context too)
359359 }
360360 else { // Product is too big, data is likely to be sparse. Store these entries in a temporary buffer to be sorted, merged (accumulated), and written to file when it gets full.
361361 cr [ind ].word1 = w1 ;
362362 cr [ind ].word2 = w2 ;
363363 cr [ind ].val = 1.0 /((real )(j - k ));
364364 ind ++ ; // Keep track of how full temporary buffer is
365- if (symmetric > 0 ) { // Symmetric context
365+ if (symmetric > 0 ) { // Symmetric context
366366 cr [ind ].word1 = w2 ;
367367 cr [ind ].word2 = w1 ;
368368 cr [ind ].val = 1.0 /((real )(j - k ));
@@ -375,27 +375,27 @@ int get_cooccurrence() {
375375 }
376376
377377 /* Write out temp buffer for the final time (it may not be full) */
378- if (verbose > 1 ) fprintf (stderr ,"\033[0GProcessed %lld tokens.\n" ,counter );
378+ if (verbose > 1 ) fprintf (stderr ,"\033[0GProcessed %lld tokens.\n" ,counter );
379379 qsort (cr , ind , sizeof (CREC ), compare_crec );
380380 write_chunk (cr ,ind ,foverflow );
381381 sprintf (filename ,"%s_0000.bin" ,file_head );
382382
383383 /* Write out full bigram_table, skipping zeros */
384- if (verbose > 1 ) fprintf (stderr , "Writing cooccurrences to disk" );
384+ if (verbose > 1 ) fprintf (stderr , "Writing cooccurrences to disk" );
385385 fid = fopen (filename ,"w" );
386386 j = 1e6 ;
387- for (x = 1 ; x <= vocab_size ; x ++ ) {
388- if ( (long long ) (0.75 * log (vocab_size / x )) < j ) {j = (long long ) (0.75 * log (vocab_size / x )); if (verbose > 1 ) fprintf (stderr ,"." );} // log's to make it look (sort of) pretty
389- for (y = 1 ; y <= (lookup [x ] - lookup [x - 1 ]); y ++ ) {
390- if ((r = bigram_table [lookup [x - 1 ] - 2 + y ]) != 0 ) {
387+ for (x = 1 ; x <= vocab_size ; x ++ ) {
388+ if ( (long long ) (0.75 * log (vocab_size / x )) < j ) {j = (long long ) (0.75 * log (vocab_size / x )); if (verbose > 1 ) fprintf (stderr ,"." );} // log's to make it look (sort of) pretty
389+ for (y = 1 ; y <= (lookup [x ] - lookup [x - 1 ]); y ++ ) {
390+ if ((r = bigram_table [lookup [x - 1 ] - 2 + y ]) != 0 ) {
391391 fwrite (& x , sizeof (int ), 1 , fid );
392392 fwrite (& y , sizeof (int ), 1 , fid );
393393 fwrite (& r , sizeof (real ), 1 , fid );
394394 }
395395 }
396396 }
397397
398- if (verbose > 1 ) fprintf (stderr ,"%d files in total.\n" ,fidcounter + 1 );
398+ if (verbose > 1 ) fprintf (stderr ,"%d files in total.\n" ,fidcounter + 1 );
399399 fclose (fid );
400400 fclose (foverflow );
401401 free (cr );
@@ -408,7 +408,7 @@ int get_cooccurrence() {
408408int find_arg (char * str , int argc , char * * argv ) {
409409 int i ;
410410 for (i = 1 ; i < argc ; i ++ ) {
411- if (!scmp (str , argv [i ])) {
411+ if (!scmp (str , argv [i ])) {
412412 if (i == argc - 1 ) {
413413 printf ("No argument given for %s\n" , str );
414414 exit (1 );
@@ -463,7 +463,7 @@ int main(int argc, char **argv) {
463463 /* The memory_limit determines a limit on the number of elements in bigram_table and the overflow buffer */
464464 /* Estimate the maximum value that max_product can take so that this limit is still satisfied */
465465 rlimit = 0.85 * (real )memory_limit * 1073741824 /(sizeof (CREC ));
466- while (fabs (rlimit - n * (log (n ) + 0.1544313298 )) > 1e-3 ) n = rlimit / (log (n ) + 0.1544313298 );
466+ while (fabs (rlimit - n * (log (n ) + 0.1544313298 )) > 1e-3 ) n = rlimit / (log (n ) + 0.1544313298 );
467467 max_product = (long long ) n ;
468468 overflow_length = (long long ) rlimit /6 ; // 0.85 + 1/6 ~= 1
469469
0 commit comments