29
29
#define XDL_GUESS_NLINES1 256
30
30
#define XDL_GUESS_NLINES2 20
31
31
32
+ #define DISCARD 0
33
+ #define KEEP 1
34
+ #define INVESTIGATE 2
32
35
33
36
typedef struct s_xdlclass {
34
37
struct s_xdlclass * next ;
@@ -190,15 +193,15 @@ void xdl_free_env(xdfenv_t *xe) {
190
193
}
191
194
192
195
193
- static int xdl_clean_mmatch (char const * dis , long i , long s , long e ) {
196
+ static bool xdl_clean_mmatch (uint8_t const * action , long i , long s , long e ) {
194
197
long r , rdis0 , rpdis0 , rdis1 , rpdis1 ;
195
198
196
199
/*
197
- * Limits the window the is examined during the similar-lines
198
- * scan. The loops below stops when dis [i - r] == 1 (line that
199
- * has no match), but there are corner cases where the loop
200
- * proceed all the way to the extremities by causing huge
201
- * performance penalties in case of big files.
200
+ * Limits the window that is examined during the similar-lines
201
+ * scan. The loops below stop when action [i - r] == KEEP
202
+ * (line that has no match), but there are corner cases where
203
+ * the loops proceed all the way to the extremities, causing
204
+ * huge performance penalties in case of big files.
202
205
*/
203
206
if (i - s > XDL_SIMSCAN_WINDOW )
204
207
s = i - XDL_SIMSCAN_WINDOW ;
@@ -207,40 +210,47 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
207
210
208
211
/*
209
212
* Scans the lines before 'i' to find a run of lines that either
210
- * have no match (dis[j] == 0) or have multiple matches (dis[j] > 1).
211
- * Note that we always call this function with dis[i] > 1, so the
212
- * current line (i) is already a multimatch line.
213
+ * have no match (action[j] == DISCARD) or have multiple matches
214
+ * (action[j] == INVESTIGATE). Note that we always call this
215
+ * function with action[i] == INVESTIGATE, so the current line
216
+ * (i) is already a multimatch line.
213
217
*/
214
218
for (r = 1 , rdis0 = 0 , rpdis0 = 1 ; (i - r ) >= s ; r ++ ) {
215
- if (! dis [i - r ])
219
+ if (action [i - r ] == DISCARD )
216
220
rdis0 ++ ;
217
- else if (dis [i - r ] == 2 )
221
+ else if (action [i - r ] == INVESTIGATE )
218
222
rpdis0 ++ ;
219
- else
223
+ else if ( action [ i - r ] == KEEP )
220
224
break ;
225
+ else
226
+ BUG ("Illegal value for action[i - r]" );
221
227
}
222
228
/*
223
- * If the run before the line 'i' found only multimatch lines, we
224
- * return 0 and hence we don't make the current line (i) discarded.
225
- * We want to discard multimatch lines only when they appear in the
226
- * middle of runs with nomatch lines (dis[j] == 0).
229
+ * If the run before the line 'i' found only multimatch lines,
230
+ * we return false and hence we don't make the current line (i)
231
+ * discarded. We want to discard multimatch lines only when
232
+ * they appear in the middle of runs with nomatch lines
233
+ * (action[j] == DISCARD).
227
234
*/
228
235
if (rdis0 == 0 )
229
236
return 0 ;
230
237
for (r = 1 , rdis1 = 0 , rpdis1 = 1 ; (i + r ) <= e ; r ++ ) {
231
- if (! dis [i + r ])
238
+ if (action [i + r ] == DISCARD )
232
239
rdis1 ++ ;
233
- else if (dis [i + r ] == 2 )
240
+ else if (action [i + r ] == INVESTIGATE )
234
241
rpdis1 ++ ;
235
- else
242
+ else if ( action [ i + r ] == KEEP )
236
243
break ;
244
+ else
245
+ BUG ("Illegal value for action[i + r]" );
237
246
}
238
247
/*
239
- * If the run after the line 'i' found only multimatch lines, we
240
- * return 0 and hence we don't make the current line (i) discarded.
248
+ * If the run after the line 'i' found only multimatch lines,
249
+ * we return false and hence we don't make the current line (i)
250
+ * discarded.
241
251
*/
242
252
if (rdis1 == 0 )
243
- return 0 ;
253
+ return false ;
244
254
rdis1 += rdis0 ;
245
255
rpdis1 += rpdis0 ;
246
256
@@ -251,59 +261,81 @@ static int xdl_clean_mmatch(char const *dis, long i, long s, long e) {
251
261
/*
252
262
* Try to reduce the problem complexity, discard records that have no
253
263
* matches on the other file. Also, lines that have multiple matches
254
- * might be potentially discarded if they happear in a run of discardable.
264
+ * might be potentially discarded if they appear in a run of discardable.
255
265
*/
256
266
static int xdl_cleanup_records (xdlclassifier_t * cf , xdfile_t * xdf1 , xdfile_t * xdf2 ) {
257
267
long i , nm , nreff , mlim ;
258
268
xrecord_t * recs ;
259
269
xdlclass_t * rcrec ;
260
- char * dis , * dis1 , * dis2 ;
261
- int need_min = !!(cf -> flags & XDF_NEED_MINIMAL );
270
+ uint8_t * action1 = NULL , * action2 = NULL ;
271
+ bool need_min = !!(cf -> flags & XDF_NEED_MINIMAL );
272
+ int ret = 0 ;
262
273
263
- if (!XDL_CALLOC_ARRAY (dis , xdf1 -> nrec + xdf2 -> nrec + 2 ))
264
- return -1 ;
265
- dis1 = dis ;
266
- dis2 = dis1 + xdf1 -> nrec + 1 ;
274
+ /*
275
+ * Create temporary arrays that will help us decide if
276
+ * changed[i] should remain 0 or become 1.
277
+ */
278
+ if (!XDL_CALLOC_ARRAY (action1 , xdf1 -> nrec + 1 )) {
279
+ ret = -1 ;
280
+ goto cleanup ;
281
+ }
282
+ if (!XDL_CALLOC_ARRAY (action2 , xdf2 -> nrec + 1 )) {
283
+ ret = -1 ;
284
+ goto cleanup ;
285
+ }
267
286
287
+ /*
288
+ * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
289
+ */
268
290
if ((mlim = xdl_bogosqrt (xdf1 -> nrec )) > XDL_MAX_EQLIMIT )
269
291
mlim = XDL_MAX_EQLIMIT ;
270
292
for (i = xdf1 -> dstart , recs = & xdf1 -> recs [xdf1 -> dstart ]; i <= xdf1 -> dend ; i ++ , recs ++ ) {
271
293
rcrec = cf -> rcrecs [recs -> ha ];
272
294
nm = rcrec ? rcrec -> len2 : 0 ;
273
- dis1 [i ] = (nm == 0 ) ? 0 : (nm >= mlim && !need_min ) ? 2 : 1 ;
295
+ action1 [i ] = (nm == 0 ) ? DISCARD : (nm >= mlim && !need_min ) ? INVESTIGATE : KEEP ;
274
296
}
275
297
276
298
if ((mlim = xdl_bogosqrt (xdf2 -> nrec )) > XDL_MAX_EQLIMIT )
277
299
mlim = XDL_MAX_EQLIMIT ;
278
300
for (i = xdf2 -> dstart , recs = & xdf2 -> recs [xdf2 -> dstart ]; i <= xdf2 -> dend ; i ++ , recs ++ ) {
279
301
rcrec = cf -> rcrecs [recs -> ha ];
280
302
nm = rcrec ? rcrec -> len1 : 0 ;
281
- dis2 [i ] = (nm == 0 ) ? 0 : (nm >= mlim && !need_min ) ? 2 : 1 ;
303
+ action2 [i ] = (nm == 0 ) ? DISCARD : (nm >= mlim && !need_min ) ? INVESTIGATE : KEEP ;
282
304
}
283
305
306
+ /*
307
+ * Use temporary arrays to decide if changed[i] should remain
308
+ * 0 or become 1.
309
+ */
284
310
for (nreff = 0 , i = xdf1 -> dstart , recs = & xdf1 -> recs [xdf1 -> dstart ];
285
311
i <= xdf1 -> dend ; i ++ , recs ++ ) {
286
- if (dis1 [i ] == 1 ||
287
- (dis1 [i ] == 2 && !xdl_clean_mmatch (dis1 , i , xdf1 -> dstart , xdf1 -> dend ))) {
312
+ if (action1 [i ] == KEEP ||
313
+ (action1 [i ] == INVESTIGATE && !xdl_clean_mmatch (action1 , i , xdf1 -> dstart , xdf1 -> dend ))) {
288
314
xdf1 -> rindex [nreff ++ ] = i ;
315
+ /* changed[i] remains 0, i.e. keep */
289
316
} else
290
317
xdf1 -> changed [i ] = 1 ;
318
+ /* i.e. discard */
291
319
}
292
320
xdf1 -> nreff = nreff ;
293
321
294
322
for (nreff = 0 , i = xdf2 -> dstart , recs = & xdf2 -> recs [xdf2 -> dstart ];
295
323
i <= xdf2 -> dend ; i ++ , recs ++ ) {
296
- if (dis2 [i ] == 1 ||
297
- (dis2 [i ] == 2 && !xdl_clean_mmatch (dis2 , i , xdf2 -> dstart , xdf2 -> dend ))) {
324
+ if (action2 [i ] == KEEP ||
325
+ (action2 [i ] == INVESTIGATE && !xdl_clean_mmatch (action2 , i , xdf2 -> dstart , xdf2 -> dend ))) {
298
326
xdf2 -> rindex [nreff ++ ] = i ;
327
+ /* changed[i] remains 0, i.e. keep */
299
328
} else
300
329
xdf2 -> changed [i ] = 1 ;
330
+ /* i.e. discard */
301
331
}
302
332
xdf2 -> nreff = nreff ;
303
333
304
- xdl_free (dis );
334
+ cleanup :
335
+ xdl_free (action1 );
336
+ xdl_free (action2 );
305
337
306
- return 0 ;
338
+ return ret ;
307
339
}
308
340
309
341
0 commit comments