Git fork
1/*
2 * LibXDiff by Davide Libenzi ( File Differential Library )
3 * Copyright (C) 2003 Davide Libenzi
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, see
17 * <http://www.gnu.org/licenses/>.
18 *
19 * Davide Libenzi <davidel@xmailserver.org>
20 *
21 */
22
23#include "xinclude.h"
24
25
26#define XDL_KPDIS_RUN 4
27#define XDL_MAX_EQLIMIT 1024
28#define XDL_SIMSCAN_WINDOW 100
29#define XDL_GUESS_NLINES1 256
30#define XDL_GUESS_NLINES2 20
31
32#define DISCARD 0
33#define KEEP 1
34#define INVESTIGATE 2
35
36typedef struct s_xdlclass {
37 struct s_xdlclass *next;
38 xrecord_t rec;
39 long idx;
40 long len1, len2;
41} xdlclass_t;
42
43typedef struct s_xdlclassifier {
44 unsigned int hbits;
45 long hsize;
46 xdlclass_t **rchash;
47 chastore_t ncha;
48 xdlclass_t **rcrecs;
49 long alloc;
50 long count;
51 long flags;
52} xdlclassifier_t;
53
54
55
56
57static int xdl_init_classifier(xdlclassifier_t *cf, long size, long flags) {
58 cf->flags = flags;
59
60 cf->hbits = xdl_hashbits((unsigned int) size);
61 cf->hsize = 1 << cf->hbits;
62
63 if (xdl_cha_init(&cf->ncha, sizeof(xdlclass_t), size / 4 + 1) < 0) {
64
65 return -1;
66 }
67 if (!XDL_CALLOC_ARRAY(cf->rchash, cf->hsize)) {
68
69 xdl_cha_free(&cf->ncha);
70 return -1;
71 }
72
73 cf->alloc = size;
74 if (!XDL_ALLOC_ARRAY(cf->rcrecs, cf->alloc)) {
75
76 xdl_free(cf->rchash);
77 xdl_cha_free(&cf->ncha);
78 return -1;
79 }
80
81 cf->count = 0;
82
83 return 0;
84}
85
86
87static void xdl_free_classifier(xdlclassifier_t *cf) {
88
89 xdl_free(cf->rcrecs);
90 xdl_free(cf->rchash);
91 xdl_cha_free(&cf->ncha);
92}
93
94
95static int xdl_classify_record(unsigned int pass, xdlclassifier_t *cf, xrecord_t *rec) {
96 long hi;
97 xdlclass_t *rcrec;
98
99 hi = (long) XDL_HASHLONG(rec->ha, cf->hbits);
100 for (rcrec = cf->rchash[hi]; rcrec; rcrec = rcrec->next)
101 if (rcrec->rec.ha == rec->ha &&
102 xdl_recmatch(rcrec->rec.ptr, rcrec->rec.size,
103 rec->ptr, rec->size, cf->flags))
104 break;
105
106 if (!rcrec) {
107 if (!(rcrec = xdl_cha_alloc(&cf->ncha))) {
108
109 return -1;
110 }
111 rcrec->idx = cf->count++;
112 if (XDL_ALLOC_GROW(cf->rcrecs, cf->count, cf->alloc))
113 return -1;
114 cf->rcrecs[rcrec->idx] = rcrec;
115 rcrec->rec = *rec;
116 rcrec->len1 = rcrec->len2 = 0;
117 rcrec->next = cf->rchash[hi];
118 cf->rchash[hi] = rcrec;
119 }
120
121 (pass == 1) ? rcrec->len1++ : rcrec->len2++;
122
123 rec->ha = (unsigned long) rcrec->idx;
124
125 return 0;
126}
127
128
129static void xdl_free_ctx(xdfile_t *xdf)
130{
131 xdl_free(xdf->rindex);
132 xdl_free(xdf->changed - 1);
133 xdl_free(xdf->recs);
134}
135
136
137static int xdl_prepare_ctx(unsigned int pass, mmfile_t *mf, long narec, xpparam_t const *xpp,
138 xdlclassifier_t *cf, xdfile_t *xdf) {
139 long bsize;
140 unsigned long hav;
141 char const *blk, *cur, *top, *prev;
142 xrecord_t *crec;
143
144 xdf->rindex = NULL;
145 xdf->changed = NULL;
146 xdf->recs = NULL;
147
148 if (!XDL_ALLOC_ARRAY(xdf->recs, narec))
149 goto abort;
150
151 xdf->nrec = 0;
152 if ((cur = blk = xdl_mmfile_first(mf, &bsize))) {
153 for (top = blk + bsize; cur < top; ) {
154 prev = cur;
155 hav = xdl_hash_record(&cur, top, xpp->flags);
156 if (XDL_ALLOC_GROW(xdf->recs, xdf->nrec + 1, narec))
157 goto abort;
158 crec = &xdf->recs[xdf->nrec++];
159 crec->ptr = prev;
160 crec->size = (long) (cur - prev);
161 crec->ha = hav;
162 if (xdl_classify_record(pass, cf, crec) < 0)
163 goto abort;
164 }
165 }
166
167 if (!XDL_CALLOC_ARRAY(xdf->changed, xdf->nrec + 2))
168 goto abort;
169
170 if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
171 (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF)) {
172 if (!XDL_ALLOC_ARRAY(xdf->rindex, xdf->nrec + 1))
173 goto abort;
174 }
175
176 xdf->changed += 1;
177 xdf->nreff = 0;
178 xdf->dstart = 0;
179 xdf->dend = xdf->nrec - 1;
180
181 return 0;
182
183abort:
184 xdl_free_ctx(xdf);
185 return -1;
186}
187
188
189void xdl_free_env(xdfenv_t *xe) {
190
191 xdl_free_ctx(&xe->xdf2);
192 xdl_free_ctx(&xe->xdf1);
193}
194
195
196static bool xdl_clean_mmatch(uint8_t const *action, long i, long s, long e) {
197 long r, rdis0, rpdis0, rdis1, rpdis1;
198
199 /*
200 * Limits the window that is examined during the similar-lines
201 * scan. The loops below stops when action[i - r] == KEEP
202 * (line that has no match), but there are corner cases where
203 * the loop proceed all the way to the extremities by causing
204 * huge performance penalties in case of big files.
205 */
206 if (i - s > XDL_SIMSCAN_WINDOW)
207 s = i - XDL_SIMSCAN_WINDOW;
208 if (e - i > XDL_SIMSCAN_WINDOW)
209 e = i + XDL_SIMSCAN_WINDOW;
210
211 /*
212 * Scans the lines before 'i' to find a run of lines that either
213 * have no match (action[j] == DISCARD) or have multiple matches
214 * (action[j] == INVESTIGATE). Note that we always call this
215 * function with action[i] == INVESTIGATE, so the current line
216 * (i) is already a multimatch line.
217 */
218 for (r = 1, rdis0 = 0, rpdis0 = 1; (i - r) >= s; r++) {
219 if (action[i - r] == DISCARD)
220 rdis0++;
221 else if (action[i - r] == INVESTIGATE)
222 rpdis0++;
223 else if (action[i - r] == KEEP)
224 break;
225 else
226 BUG("Illegal value for action[i - r]");
227 }
228 /*
229 * If the run before the line 'i' found only multimatch lines,
230 * we return false and hence we don't make the current line (i)
231 * discarded. We want to discard multimatch lines only when
232 * they appear in the middle of runs with nomatch lines
233 * (action[j] == DISCARD).
234 */
235 if (rdis0 == 0)
236 return 0;
237 for (r = 1, rdis1 = 0, rpdis1 = 1; (i + r) <= e; r++) {
238 if (action[i + r] == DISCARD)
239 rdis1++;
240 else if (action[i + r] == INVESTIGATE)
241 rpdis1++;
242 else if (action[i + r] == KEEP)
243 break;
244 else
245 BUG("Illegal value for action[i + r]");
246 }
247 /*
248 * If the run after the line 'i' found only multimatch lines,
249 * we return false and hence we don't make the current line (i)
250 * discarded.
251 */
252 if (rdis1 == 0)
253 return false;
254 rdis1 += rdis0;
255 rpdis1 += rpdis0;
256
257 return rpdis1 * XDL_KPDIS_RUN < (rpdis1 + rdis1);
258}
259
260
261/*
262 * Try to reduce the problem complexity, discard records that have no
263 * matches on the other file. Also, lines that have multiple matches
264 * might be potentially discarded if they appear in a run of discardable.
265 */
266static int xdl_cleanup_records(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
267 long i, nm, nreff, mlim;
268 xrecord_t *recs;
269 xdlclass_t *rcrec;
270 uint8_t *action1 = NULL, *action2 = NULL;
271 bool need_min = !!(cf->flags & XDF_NEED_MINIMAL);
272 int ret = 0;
273
274 /*
275 * Create temporary arrays that will help us decide if
276 * changed[i] should remain false, or become true.
277 */
278 if (!XDL_CALLOC_ARRAY(action1, xdf1->nrec + 1)) {
279 ret = -1;
280 goto cleanup;
281 }
282 if (!XDL_CALLOC_ARRAY(action2, xdf2->nrec + 1)) {
283 ret = -1;
284 goto cleanup;
285 }
286
287 /*
288 * Initialize temporary arrays with DISCARD, KEEP, or INVESTIGATE.
289 */
290 if ((mlim = xdl_bogosqrt(xdf1->nrec)) > XDL_MAX_EQLIMIT)
291 mlim = XDL_MAX_EQLIMIT;
292 for (i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart]; i <= xdf1->dend; i++, recs++) {
293 rcrec = cf->rcrecs[recs->ha];
294 nm = rcrec ? rcrec->len2 : 0;
295 action1[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
296 }
297
298 if ((mlim = xdl_bogosqrt(xdf2->nrec)) > XDL_MAX_EQLIMIT)
299 mlim = XDL_MAX_EQLIMIT;
300 for (i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart]; i <= xdf2->dend; i++, recs++) {
301 rcrec = cf->rcrecs[recs->ha];
302 nm = rcrec ? rcrec->len1 : 0;
303 action2[i] = (nm == 0) ? DISCARD: (nm >= mlim && !need_min) ? INVESTIGATE: KEEP;
304 }
305
306 /*
307 * Use temporary arrays to decide if changed[i] should remain
308 * false, or become true.
309 */
310 for (nreff = 0, i = xdf1->dstart, recs = &xdf1->recs[xdf1->dstart];
311 i <= xdf1->dend; i++, recs++) {
312 if (action1[i] == KEEP ||
313 (action1[i] == INVESTIGATE && !xdl_clean_mmatch(action1, i, xdf1->dstart, xdf1->dend))) {
314 xdf1->rindex[nreff++] = i;
315 /* changed[i] remains false, i.e. keep */
316 } else
317 xdf1->changed[i] = true;
318 /* i.e. discard */
319 }
320 xdf1->nreff = nreff;
321
322 for (nreff = 0, i = xdf2->dstart, recs = &xdf2->recs[xdf2->dstart];
323 i <= xdf2->dend; i++, recs++) {
324 if (action2[i] == KEEP ||
325 (action2[i] == INVESTIGATE && !xdl_clean_mmatch(action2, i, xdf2->dstart, xdf2->dend))) {
326 xdf2->rindex[nreff++] = i;
327 /* changed[i] remains false, i.e. keep */
328 } else
329 xdf2->changed[i] = true;
330 /* i.e. discard */
331 }
332 xdf2->nreff = nreff;
333
334cleanup:
335 xdl_free(action1);
336 xdl_free(action2);
337
338 return ret;
339}
340
341
342/*
343 * Early trim initial and terminal matching records.
344 */
345static int xdl_trim_ends(xdfile_t *xdf1, xdfile_t *xdf2) {
346 long i, lim;
347 xrecord_t *recs1, *recs2;
348
349 recs1 = xdf1->recs;
350 recs2 = xdf2->recs;
351 for (i = 0, lim = XDL_MIN(xdf1->nrec, xdf2->nrec); i < lim;
352 i++, recs1++, recs2++)
353 if (recs1->ha != recs2->ha)
354 break;
355
356 xdf1->dstart = xdf2->dstart = i;
357
358 recs1 = xdf1->recs + xdf1->nrec - 1;
359 recs2 = xdf2->recs + xdf2->nrec - 1;
360 for (lim -= i, i = 0; i < lim; i++, recs1--, recs2--)
361 if (recs1->ha != recs2->ha)
362 break;
363
364 xdf1->dend = xdf1->nrec - i - 1;
365 xdf2->dend = xdf2->nrec - i - 1;
366
367 return 0;
368}
369
370
371static int xdl_optimize_ctxs(xdlclassifier_t *cf, xdfile_t *xdf1, xdfile_t *xdf2) {
372
373 if (xdl_trim_ends(xdf1, xdf2) < 0 ||
374 xdl_cleanup_records(cf, xdf1, xdf2) < 0) {
375
376 return -1;
377 }
378
379 return 0;
380}
381
382int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
383 xdfenv_t *xe) {
384 long enl1, enl2, sample;
385 xdlclassifier_t cf;
386
387 memset(&cf, 0, sizeof(cf));
388
389 /*
390 * For histogram diff, we can afford a smaller sample size and
391 * thus a poorer estimate of the number of lines, as the hash
392 * table (rhash) won't be filled up/grown. The number of lines
393 * (nrecs) will be updated correctly anyway by
394 * xdl_prepare_ctx().
395 */
396 sample = (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF
397 ? XDL_GUESS_NLINES2 : XDL_GUESS_NLINES1);
398
399 enl1 = xdl_guess_lines(mf1, sample) + 1;
400 enl2 = xdl_guess_lines(mf2, sample) + 1;
401
402 if (xdl_init_classifier(&cf, enl1 + enl2 + 1, xpp->flags) < 0)
403 return -1;
404
405 if (xdl_prepare_ctx(1, mf1, enl1, xpp, &cf, &xe->xdf1) < 0) {
406
407 xdl_free_classifier(&cf);
408 return -1;
409 }
410 if (xdl_prepare_ctx(2, mf2, enl2, xpp, &cf, &xe->xdf2) < 0) {
411
412 xdl_free_ctx(&xe->xdf1);
413 xdl_free_classifier(&cf);
414 return -1;
415 }
416
417 if ((XDF_DIFF_ALG(xpp->flags) != XDF_PATIENCE_DIFF) &&
418 (XDF_DIFF_ALG(xpp->flags) != XDF_HISTOGRAM_DIFF) &&
419 xdl_optimize_ctxs(&cf, &xe->xdf1, &xe->xdf2) < 0) {
420
421 xdl_free_ctx(&xe->xdf2);
422 xdl_free_ctx(&xe->xdf1);
423 xdl_free_classifier(&cf);
424 return -1;
425 }
426
427 xdl_free_classifier(&cf);
428
429 return 0;
430}