xdiff/xdiffi.c at reftables-rust · freshlybakedca.ke/git

freshlybakedca.ke / git
fork atom
Git fork
fork atom
git / xdiff / xdiffi.c
at reftables-rust 1081 lines 28 kB view raw
wrap content
Ezekiel Newren xdiff: change type of xdfile_t.changed from char to bool 5mo ago
8b9c5d2e
   1/*
   2 *  LibXDiff by Davide Libenzi ( File Differential Library )
   3 *  Copyright (C) 2003	Davide Libenzi
   4 *
   5 *  This library is free software; you can redistribute it and/or
   6 *  modify it under the terms of the GNU Lesser General Public
   7 *  License as published by the Free Software Foundation; either
   8 *  version 2.1 of the License, or (at your option) any later version.
   9 *
  10 *  This library is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 *  Lesser General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU Lesser General Public
  16 *  License along with this library; if not, see
  17 *  <http://www.gnu.org/licenses/>.
  18 *
  19 *  Davide Libenzi <davidel@xmailserver.org>
  20 *
  21 */
  22
  23#include "xinclude.h"
  24
  25static unsigned long get_hash(xdfile_t *xdf, long index)
  26{
  27	return xdf->recs[xdf->rindex[index]].ha;
  28}
  29
  30#define XDL_MAX_COST_MIN 256
  31#define XDL_HEUR_MIN_COST 256
  32#define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
  33#define XDL_SNAKE_CNT 20
  34#define XDL_K_HEUR 4
  35
  36typedef struct s_xdpsplit {
  37	long i1, i2;
  38	int min_lo, min_hi;
  39} xdpsplit_t;
  40
/*
 * See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
 * Basically considers a "box" (off1, off2, lim1, lim2) and scans both the
 * forward diagonal starting from (off1, off2) and the backward diagonal
 * starting from (lim1, lim2). If the K values on the same diagonal cross,
 * it returns the furthest point of reach. We might encounter expensive edge
 * cases using this algorithm, so a little bit of heuristic is needed to cut
 * the search and to return a suboptimal point.
 *
 * kvdf and kvdb are the forward and backward K vectors, indexed by diagonal
 * number (i1 - i2). If need_min is non-zero the heuristics and the cost
 * cutoff are skipped and the search runs until the two paths meet.
 *
 * The split point is stored in spl->i1 / spl->i2, and spl->min_lo /
 * spl->min_hi are set to tell the caller which need_min value to use for
 * each sub-box. The return value is the edit cost (ec) reached when the
 * split was chosen.
 */
static long xdl_split(xdfile_t *xdf1, long off1, long lim1,
		      xdfile_t *xdf2, long off2, long lim2,
		      long *kvdf, long *kvdb, int need_min, xdpsplit_t *spl,
		      xdalgoenv_t *xenv) {
	/* Range of diagonals that intersect the box. */
	long dmin = off1 - lim2, dmax = lim1 - off2;
	/* Diagonals on which the forward and backward searches start. */
	long fmid = off1 - off2, bmid = lim1 - lim2;
	/* Non-zero if the two starting diagonals differ by an odd amount. */
	long odd = (fmid - bmid) & 1;
	/* Current diagonal domain of the forward and of the backward search. */
	long fmin = fmid, fmax = fmid;
	long bmin = bmid, bmax = bmid;
	long ec, d, i1, i2, prev1, best, dd, v, k;

	/*
	 * Set initial diagonal values for both forward and backward path.
	 */
	kvdf[fmid] = off1;
	kvdb[bmid] = lim1;

	for (ec = 1;; ec++) {
		int got_snake = 0;

		/*
		 * We need to extend the diagonal "domain" by one. If the next
		 * value exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must stay even (the
		 * loop below steps d by two).
		 *
		 * Also we initialize the external K value to -1 so that we can
		 * avoid extra conditions in the check inside the core loop.
		 */
		if (fmin > dmin)
			kvdf[--fmin - 1] = -1;
		else
			++fmin;
		if (fmax < dmax)
			kvdf[++fmax + 1] = -1;
		else
			--fmax;

		for (d = fmax; d >= fmin; d -= 2) {
			/* Continue from whichever neighbor diagonal got further. */
			if (kvdf[d - 1] >= kvdf[d + 1])
				i1 = kvdf[d - 1] + 1;
			else
				i1 = kvdf[d + 1];
			prev1 = i1;
			i2 = i1 - d;
			/* Follow the snake: advance while the records match. */
			for (; i1 < lim1 && i2 < lim2 && get_hash(xdf1, i1) == get_hash(xdf2, i2); i1++, i2++);
			if (i1 - prev1 > xenv->snake_cnt)
				got_snake = 1;
			kvdf[d] = i1;
			/* The forward path met the backward path on this diagonal. */
			if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * We need to extend the diagonal "domain" by one. If the next
		 * value exits the box boundaries we need to change it in the
		 * opposite direction because (max - min) must stay even (the
		 * loop below steps d by two).
		 *
		 * Also we initialize the external K value to XDL_LINE_MAX so
		 * that we can avoid extra conditions in the check inside the
		 * core loop.
		 */
		if (bmin > dmin)
			kvdb[--bmin - 1] = XDL_LINE_MAX;
		else
			++bmin;
		if (bmax < dmax)
			kvdb[++bmax + 1] = XDL_LINE_MAX;
		else
			--bmax;

		for (d = bmax; d >= bmin; d -= 2) {
			/* Continue from whichever neighbor diagonal got further back. */
			if (kvdb[d - 1] < kvdb[d + 1])
				i1 = kvdb[d - 1];
			else
				i1 = kvdb[d + 1] - 1;
			prev1 = i1;
			i2 = i1 - d;
			/* Follow the snake backward while the records match. */
			for (; i1 > off1 && i2 > off2 && get_hash(xdf1, i1 - 1) == get_hash(xdf2, i2 - 1); i1--, i2--);
			if (prev1 - i1 > xenv->snake_cnt)
				got_snake = 1;
			kvdb[d] = i1;
			/* The backward path met the forward path on this diagonal. */
			if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
				spl->i1 = i1;
				spl->i2 = i2;
				spl->min_lo = spl->min_hi = 1;
				return ec;
			}
		}

		/* A minimal result was requested: never take the shortcuts below. */
		if (need_min)
			continue;

		/*
		 * If the edit cost is above the heuristic trigger and if
		 * we got a good snake, we sample current diagonals to see
		 * if some of them have reached an "interesting" path. Our
		 * measure is a function of the distance from the diagonal
		 * corner (i1 + i2) penalized with the distance from the
		 * mid diagonal itself. If this value is above the current
		 * edit cost times a magic factor (XDL_K_HEUR) we consider
		 * it interesting.
		 */
		if (got_snake && ec > xenv->heur_min) {
			for (best = 0, d = fmax; d >= fmin; d -= 2) {
				dd = d > fmid ? d - fmid: fmid - d;
				i1 = kvdf[d];
				i2 = i1 - d;
				v = (i1 - off1) + (i2 - off2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
				    off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
					/*
					 * Accept the point only if the snake_cnt
					 * records right before it all match.
					 */
					for (k = 1; get_hash(xdf1, i1 - k) == get_hash(xdf2, i2 - k); k++)
						if (k == xenv->snake_cnt) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 1;
				spl->min_hi = 0;
				return ec;
			}

			for (best = 0, d = bmax; d >= bmin; d -= 2) {
				dd = d > bmid ? d - bmid: bmid - d;
				i1 = kvdb[d];
				i2 = i1 - d;
				v = (lim1 - i1) + (lim2 - i2) - dd;

				if (v > XDL_K_HEUR * ec && v > best &&
				    off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
				    off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
					/*
					 * Accept the point only if the snake_cnt
					 * records starting at it all match.
					 */
					for (k = 0; get_hash(xdf1, i1 + k) == get_hash(xdf2, i2 + k); k++)
						if (k == xenv->snake_cnt - 1) {
							best = v;
							spl->i1 = i1;
							spl->i2 = i2;
							break;
						}
				}
			}
			if (best > 0) {
				spl->min_lo = 0;
				spl->min_hi = 1;
				return ec;
			}
		}

		/*
		 * Enough is enough. We spent too much time here and now we
		 * collect the furthest reaching path using the (i1 + i2)
		 * measure.
		 */
		if (ec >= xenv->mxcost) {
			long fbest, fbest1, bbest, bbest1;

			/* Furthest forward point, clamped to the box. */
			fbest = fbest1 = -1;
			for (d = fmax; d >= fmin; d -= 2) {
				i1 = XDL_MIN(kvdf[d], lim1);
				i2 = i1 - d;
				if (lim2 < i2) {
					i1 = lim2 + d;
					i2 = lim2;
				}
				if (fbest < i1 + i2) {
					fbest = i1 + i2;
					fbest1 = i1;
				}
			}

			/* Furthest backward point, clamped to the box. */
			bbest = bbest1 = XDL_LINE_MAX;
			for (d = bmax; d >= bmin; d -= 2) {
				i1 = XDL_MAX(off1, kvdb[d]);
				i2 = i1 - d;
				if (i2 < off2) {
					i1 = off2 + d;
					i2 = off2;
				}
				if (i1 + i2 < bbest) {
					bbest = i1 + i2;
					bbest1 = i1;
				}
			}

			/* Split at whichever of the two points covered more ground. */
			if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
				spl->i1 = fbest1;
				spl->i2 = fbest - fbest1;
				spl->min_lo = 1;
				spl->min_hi = 0;
			} else {
				spl->i1 = bbest1;
				spl->i2 = bbest - bbest1;
				spl->min_lo = 0;
				spl->min_hi = 1;
			}
			return ec;
		}
	}
}
 258
 259
 260/*
 261 * Rule: "Divide et Impera" (divide & conquer). Recursively split the box in
 262 * sub-boxes by calling the box splitting function. Note that the real job
 263 * (marking changed lines) is done in the two boundary reaching checks.
 264 */
 265int xdl_recs_cmp(xdfile_t *xdf1, long off1, long lim1,
 266		 xdfile_t *xdf2, long off2, long lim2,
 267		 long *kvdf, long *kvdb, int need_min, xdalgoenv_t *xenv) {
 268
 269	/*
 270	 * Shrink the box by walking through each diagonal snake (SW and NE).
 271	 */
 272	for (; off1 < lim1 && off2 < lim2 && get_hash(xdf1, off1) == get_hash(xdf2, off2); off1++, off2++);
 273	for (; off1 < lim1 && off2 < lim2 && get_hash(xdf1, lim1 - 1) == get_hash(xdf2, lim2 - 1); lim1--, lim2--);
 274
 275	/*
 276	 * If one dimension is empty, then all records on the other one must
 277	 * be obviously changed.
 278	 */
 279	if (off1 == lim1) {
 280		for (; off2 < lim2; off2++)
 281			xdf2->changed[xdf2->rindex[off2]] = true;
 282	} else if (off2 == lim2) {
 283		for (; off1 < lim1; off1++)
 284			xdf1->changed[xdf1->rindex[off1]] = true;
 285	} else {
 286		xdpsplit_t spl;
 287		spl.i1 = spl.i2 = 0;
 288
 289		/*
 290		 * Divide ...
 291		 */
 292		if (xdl_split(xdf1, off1, lim1, xdf2, off2, lim2, kvdf, kvdb,
 293			      need_min, &spl, xenv) < 0) {
 294
 295			return -1;
 296		}
 297
 298		/*
 299		 * ... et Impera.
 300		 */
 301		if (xdl_recs_cmp(xdf1, off1, spl.i1, xdf2, off2, spl.i2,
 302				 kvdf, kvdb, spl.min_lo, xenv) < 0 ||
 303		    xdl_recs_cmp(xdf1, spl.i1, lim1, xdf2, spl.i2, lim2,
 304				 kvdf, kvdb, spl.min_hi, xenv) < 0) {
 305
 306			return -1;
 307		}
 308	}
 309
 310	return 0;
 311}
 312
 313
 314int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 315		xdfenv_t *xe) {
 316	long ndiags;
 317	long *kvd, *kvdf, *kvdb;
 318	xdalgoenv_t xenv;
 319	int res;
 320
 321	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0)
 322		return -1;
 323
 324	if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF) {
 325		res = xdl_do_patience_diff(xpp, xe);
 326		goto out;
 327	}
 328
 329	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF) {
 330		res = xdl_do_histogram_diff(xpp, xe);
 331		goto out;
 332	}
 333
 334	/*
 335	 * Allocate and setup K vectors to be used by the differential
 336	 * algorithm.
 337	 *
 338	 * One is to store the forward path and one to store the backward path.
 339	 */
 340	ndiags = xe->xdf1.nreff + xe->xdf2.nreff + 3;
 341	if (!XDL_ALLOC_ARRAY(kvd, 2 * ndiags + 2)) {
 342
 343		xdl_free_env(xe);
 344		return -1;
 345	}
 346	kvdf = kvd;
 347	kvdb = kvdf + ndiags;
 348	kvdf += xe->xdf2.nreff + 1;
 349	kvdb += xe->xdf2.nreff + 1;
 350
 351	xenv.mxcost = xdl_bogosqrt(ndiags);
 352	if (xenv.mxcost < XDL_MAX_COST_MIN)
 353		xenv.mxcost = XDL_MAX_COST_MIN;
 354	xenv.snake_cnt = XDL_SNAKE_CNT;
 355	xenv.heur_min = XDL_HEUR_MIN_COST;
 356
 357	res = xdl_recs_cmp(&xe->xdf1, 0, xe->xdf1.nreff, &xe->xdf2, 0, xe->xdf2.nreff,
 358			   kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0,
 359			   &xenv);
 360	xdl_free(kvd);
 361 out:
 362	if (res < 0)
 363		xdl_free_env(xe);
 364
 365	return res;
 366}
 367
 368
 369static xdchange_t *xdl_add_change(xdchange_t *xscr, long i1, long i2, long chg1, long chg2) {
 370	xdchange_t *xch;
 371
 372	if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
 373		return NULL;
 374
 375	xch->next = xscr;
 376	xch->i1 = i1;
 377	xch->i2 = i2;
 378	xch->chg1 = chg1;
 379	xch->chg2 = chg2;
 380	xch->ignore = 0;
 381
 382	return xch;
 383}
 384
 385
 386static int recs_match(xrecord_t *rec1, xrecord_t *rec2)
 387{
 388	return (rec1->ha == rec2->ha);
 389}
 390
 391/*
 392 * If a line is indented more than this, get_indent() just returns this value.
 393 * This avoids having to do absurd amounts of work for data that are not
 394 * human-readable text, and also ensures that the output of get_indent fits
 395 * within an int.
 396 */
 397#define MAX_INDENT 200
 398
 399/*
 400 * Return the amount of indentation of the specified line, treating TAB as 8
 401 * columns. Return -1 if line is empty or contains only whitespace. Clamp the
 402 * output value at MAX_INDENT.
 403 */
 404static int get_indent(xrecord_t *rec)
 405{
 406	long i;
 407	int ret = 0;
 408
 409	for (i = 0; i < rec->size; i++) {
 410		char c = rec->ptr[i];
 411
 412		if (!XDL_ISSPACE(c))
 413			return ret;
 414		else if (c == ' ')
 415			ret += 1;
 416		else if (c == '\t')
 417			ret += 8 - ret % 8;
 418		/* ignore other whitespace characters */
 419
 420		if (ret >= MAX_INDENT)
 421			return MAX_INDENT;
 422	}
 423
 424	/* The line contains only whitespace. */
 425	return -1;
 426}
 427
/*
 * If this number of consecutive blank rows is found, measure_split() stops
 * counting and just reports this value. This avoids requiring O(N^2) work
 * for pathological cases, and also ensures that the penalty computed by
 * score_add_split() fits in an int.
 */
#define MAX_BLANKS 20
 434
/*
 * Characteristics measured about a hypothetical split position. Filled in by
 * measure_split() and consumed by score_add_split().
 */
struct split_measurement {
	/*
	 * Is the split at the end of the file (aside from any blank lines)?
	 */
	int end_of_file;

	/*
	 * How much is the line immediately following the split indented (or -1
	 * if the line is blank):
	 */
	int indent;

	/*
	 * How many consecutive lines above the split are blank (counted up to
	 * MAX_BLANKS)?
	 */
	int pre_blank;

	/*
	 * How much is the nearest non-blank line above the split indented (or
	 * -1 if there is no such line)?
	 */
	int pre_indent;

	/*
	 * How many lines after the line following the split are blank (counted
	 * up to MAX_BLANKS)?
	 */
	int post_blank;

	/*
	 * How much is the nearest non-blank line after the line following the
	 * split indented (or -1 if there is no such line)?
	 */
	int post_indent;
};
 470
/*
 * Score of a candidate position for a group of changed lines. The values
 * are accumulated by score_add_split() and two scores are ranked with
 * score_cmp().
 */
struct split_score {
	/* The effective indent of this split (smaller is preferred). */
	int effective_indent;

	/* Penalty for this split (smaller is preferred). */
	int penalty;
};
 478
 479/*
 480 * Fill m with information about a hypothetical split of xdf above line split.
 481 */
 482static void measure_split(const xdfile_t *xdf, long split,
 483			  struct split_measurement *m)
 484{
 485	long i;
 486
 487	if (split >= xdf->nrec) {
 488		m->end_of_file = 1;
 489		m->indent = -1;
 490	} else {
 491		m->end_of_file = 0;
 492		m->indent = get_indent(&xdf->recs[split]);
 493	}
 494
 495	m->pre_blank = 0;
 496	m->pre_indent = -1;
 497	for (i = split - 1; i >= 0; i--) {
 498		m->pre_indent = get_indent(&xdf->recs[i]);
 499		if (m->pre_indent != -1)
 500			break;
 501		m->pre_blank += 1;
 502		if (m->pre_blank == MAX_BLANKS) {
 503			m->pre_indent = 0;
 504			break;
 505		}
 506	}
 507
 508	m->post_blank = 0;
 509	m->post_indent = -1;
 510	for (i = split + 1; i < xdf->nrec; i++) {
 511		m->post_indent = get_indent(&xdf->recs[i]);
 512		if (m->post_indent != -1)
 513			break;
 514		m->post_blank += 1;
 515		if (m->post_blank == MAX_BLANKS) {
 516			m->post_indent = 0;
 517			break;
 518		}
 519	}
 520}
 521
/*
 * The empirically-determined weight factors used by score_add_split() below.
 * Larger values mean that the position is a less favorable place to split.
 *
 * Note that scores are only ever compared against each other, so multiplying
 * all of these weight/penalty values by the same factor wouldn't change the
 * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
 * In practice, these numbers are chosen to be large enough that they can be
 * adjusted relative to each other with sufficient precision despite using
 * integer math.
 */

/* Penalty if there are no non-blank lines before the split */
#define START_OF_FILE_PENALTY 1

/* Penalty if there are no non-blank lines after the split */
#define END_OF_FILE_PENALTY 21

/* Multiplier for the number of blank lines around the split */
#define TOTAL_BLANK_WEIGHT (-30)

/* Multiplier for the number of blank lines after the split */
#define POST_BLANK_WEIGHT 6

/*
 * Penalties applied if the line is indented more than its predecessor
 */
#define RELATIVE_INDENT_PENALTY (-4)
#define RELATIVE_INDENT_WITH_BLANK_PENALTY 10

/*
 * Penalties applied if the line is indented less than both its predecessor and
 * its successor
 */
#define RELATIVE_OUTDENT_PENALTY 24
#define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17

/*
 * Penalties applied if the line is indented less than its predecessor but not
 * less than its successor
 */
#define RELATIVE_DEDENT_PENALTY 23
#define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17

/*
 * We only consider whether the sum of the effective indents for splits is
 * less than (-1), equal to (0), or greater than (+1) that of the other. The
 * resulting value is multiplied by the following weight and combined with the
 * penalty to determine the better of two scores.
 */
#define INDENT_WEIGHT 60

/*
 * How far do we slide a hunk at most?
 */
#define INDENT_HEURISTIC_MAX_SLIDING 100
 578
 579/*
 580 * Compute a badness score for the hypothetical split whose measurements are
 581 * stored in m. The weight factors were determined empirically using the tools
 582 * and corpus described in
 583 *
 584 *     https://github.com/mhagger/diff-slider-tools
 585 *
 586 * Also see that project if you want to improve the weights based on, for
 587 * example, a larger or more diverse corpus.
 588 */
 589static void score_add_split(const struct split_measurement *m, struct split_score *s)
 590{
 591	/*
 592	 * A place to accumulate penalty factors (positive makes this index more
 593	 * favored):
 594	 */
 595	int post_blank, total_blank, indent, any_blanks;
 596
 597	if (m->pre_indent == -1 && m->pre_blank == 0)
 598		s->penalty += START_OF_FILE_PENALTY;
 599
 600	if (m->end_of_file)
 601		s->penalty += END_OF_FILE_PENALTY;
 602
 603	/*
 604	 * Set post_blank to the number of blank lines following the split,
 605	 * including the line immediately after the split:
 606	 */
 607	post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
 608	total_blank = m->pre_blank + post_blank;
 609
 610	/* Penalties based on nearby blank lines: */
 611	s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
 612	s->penalty += POST_BLANK_WEIGHT * post_blank;
 613
 614	if (m->indent != -1)
 615		indent = m->indent;
 616	else
 617		indent = m->post_indent;
 618
 619	any_blanks = (total_blank != 0);
 620
 621	/* Note that the effective indent is -1 at the end of the file: */
 622	s->effective_indent += indent;
 623
 624	if (indent == -1) {
 625		/* No additional adjustments needed. */
 626	} else if (m->pre_indent == -1) {
 627		/* No additional adjustments needed. */
 628	} else if (indent > m->pre_indent) {
 629		/*
 630		 * The line is indented more than its predecessor.
 631		 */
 632		s->penalty += any_blanks ?
 633			RELATIVE_INDENT_WITH_BLANK_PENALTY :
 634			RELATIVE_INDENT_PENALTY;
 635	} else if (indent == m->pre_indent) {
 636		/*
 637		 * The line has the same indentation level as its predecessor.
 638		 * No additional adjustments needed.
 639		 */
 640	} else {
 641		/*
 642		 * The line is indented less than its predecessor. It could be
 643		 * the block terminator of the previous block, but it could
 644		 * also be the start of a new block (e.g., an "else" block, or
 645		 * maybe the previous block didn't have a block terminator).
 646		 * Try to distinguish those cases based on what comes next:
 647		 */
 648		if (m->post_indent != -1 && m->post_indent > indent) {
 649			/*
 650			 * The following line is indented more. So it is likely
 651			 * that this line is the start of a block.
 652			 */
 653			s->penalty += any_blanks ?
 654				RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
 655				RELATIVE_OUTDENT_PENALTY;
 656		} else {
 657			/*
 658			 * That was probably the end of a block.
 659			 */
 660			s->penalty += any_blanks ?
 661				RELATIVE_DEDENT_WITH_BLANK_PENALTY :
 662				RELATIVE_DEDENT_PENALTY;
 663		}
 664	}
 665}
 666
 667static int score_cmp(struct split_score *s1, struct split_score *s2)
 668{
 669	/* -1 if s1.effective_indent < s2->effective_indent, etc. */
 670	int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
 671			   (s1->effective_indent < s2->effective_indent));
 672
 673	return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
 674}
 675
/*
 * Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
 * of lines that was inserted or deleted from the corresponding version of the
 * file). We consider there to be such a group at the beginning of the file, at
 * the end of the file, and between any two unchanged lines, though most such
 * groups will usually be empty.
 *
 * If the first line in a group is equal to the line following the group, then
 * the group can be slid down. Similarly, if the last line in a group is equal
 * to the line preceding the group, then the group can be slid up. See
 * group_slide_down() and group_slide_up().
 *
 * Note that loops that are testing for changed lines in xdf->changed do not
 * need index bounding since the array is prepared with a zero at position -1
 * and N.
 */
struct xdlgroup {
	/*
	 * The index of the first changed line in the group, or the index of
	 * the unchanged line above which the (empty) group is located.
	 */
	long start;

	/*
	 * The index of the first unchanged line after the group. For an empty
	 * group, end is equal to start.
	 */
	long end;
};
 704
 705/*
 706 * Initialize g to point at the first group in xdf.
 707 */
 708static void group_init(xdfile_t *xdf, struct xdlgroup *g)
 709{
 710	g->start = g->end = 0;
 711	while (xdf->changed[g->end])
 712		g->end++;
 713}
 714
 715/*
 716 * Move g to describe the next (possibly empty) group in xdf and return 0. If g
 717 * is already at the end of the file, do nothing and return -1.
 718 */
 719static inline int group_next(xdfile_t *xdf, struct xdlgroup *g)
 720{
 721	if (g->end == xdf->nrec)
 722		return -1;
 723
 724	g->start = g->end + 1;
 725	for (g->end = g->start; xdf->changed[g->end]; g->end++)
 726		;
 727
 728	return 0;
 729}
 730
 731/*
 732 * Move g to describe the previous (possibly empty) group in xdf and return 0.
 733 * If g is already at the beginning of the file, do nothing and return -1.
 734 */
 735static inline int group_previous(xdfile_t *xdf, struct xdlgroup *g)
 736{
 737	if (g->start == 0)
 738		return -1;
 739
 740	g->end = g->start - 1;
 741	for (g->start = g->end; xdf->changed[g->start - 1]; g->start--)
 742		;
 743
 744	return 0;
 745}
 746
 747/*
 748 * If g can be slid toward the end of the file, do so, and if it bumps into a
 749 * following group, expand this group to include it. Return 0 on success or -1
 750 * if g cannot be slid down.
 751 */
 752static int group_slide_down(xdfile_t *xdf, struct xdlgroup *g)
 753{
 754	if (g->end < xdf->nrec &&
 755	    recs_match(&xdf->recs[g->start], &xdf->recs[g->end])) {
 756		xdf->changed[g->start++] = false;
 757		xdf->changed[g->end++] = true;
 758
 759		while (xdf->changed[g->end])
 760			g->end++;
 761
 762		return 0;
 763	} else {
 764		return -1;
 765	}
 766}
 767
 768/*
 769 * If g can be slid toward the beginning of the file, do so, and if it bumps
 770 * into a previous group, expand this group to include it. Return 0 on success
 771 * or -1 if g cannot be slid up.
 772 */
 773static int group_slide_up(xdfile_t *xdf, struct xdlgroup *g)
 774{
 775	if (g->start > 0 &&
 776	    recs_match(&xdf->recs[g->start - 1], &xdf->recs[g->end - 1])) {
 777		xdf->changed[--g->start] = true;
 778		xdf->changed[--g->end] = false;
 779
 780		while (xdf->changed[g->start - 1])
 781			g->start--;
 782
 783		return 0;
 784	} else {
 785		return -1;
 786	}
 787}
 788
/*
 * Move back and forward change groups for a consistent and pretty diff output.
 * This also helps in finding joinable change groups and reducing the diff
 * size.
 *
 * xdf is the file whose groups are compacted; xdfo is the other file, whose
 * groups are only walked in step with those of xdf. flags is checked for
 * XDF_INDENT_HEURISTIC. Always returns 0; a loss of synchronization between
 * the two files is reported through BUG().
 */
int xdl_change_compact(xdfile_t *xdf, xdfile_t *xdfo, long flags) {
	struct xdlgroup g, go;
	long earliest_end, end_matching_other;
	long groupsize;

	group_init(xdf, &g);
	group_init(xdfo, &go);

	while (1) {
		/*
		 * If the group is empty in the to-be-compacted file, skip it:
		 */
		if (g.end == g.start)
			goto next;

		/*
		 * Now shift the change up and then down as far as possible in
		 * each direction. If it bumps into any other changes, merge
		 * them. Repeat until a pass no longer changes the group size.
		 */
		do {
			groupsize = g.end - g.start;

			/*
			 * Keep track of the last "end" index that causes this
			 * group to align with a group of changed lines in the
			 * other file. -1 indicates that we haven't found such
			 * a match yet:
			 */
			end_matching_other = -1;

			/* Shift the group backward as much as possible: */
			while (!group_slide_up(xdf, &g))
				if (group_previous(xdfo, &go))
					BUG("group sync broken sliding up");

			/*
			 * This is the highest that this group can be shifted.
			 * Record its end index:
			 */
			earliest_end = g.end;

			if (go.end > go.start)
				end_matching_other = g.end;

			/* Now shift the group forward as far as possible: */
			while (1) {
				if (group_slide_down(xdf, &g))
					break;
				if (group_next(xdfo, &go))
					BUG("group sync broken sliding down");

				if (go.end > go.start)
					end_matching_other = g.end;
			}
		} while (groupsize != g.end - g.start);

		/*
		 * If the group can be shifted, then we can possibly use this
		 * freedom to produce a more intuitive diff.
		 *
		 * The group is currently shifted as far down as possible, so
		 * the heuristics below only have to handle upwards shifts.
		 */

		if (g.end == earliest_end) {
			/* no shifting was possible */
		} else if (end_matching_other != -1) {
			/*
			 * Move the possibly merged group of changes back to
			 * line up with the last group of changes from the
			 * other file that it can align with.
			 */
			while (go.end == go.start) {
				if (group_slide_up(xdf, &g))
					BUG("match disappeared");
				if (group_previous(xdfo, &go))
					BUG("group sync broken sliding to match");
			}
		} else if (flags & XDF_INDENT_HEURISTIC) {
			/*
			 * Indent heuristic: a group of pure add/delete lines
			 * implies two splits, one between the end of the
			 * "before" context and the start of the group, and
			 * another between the end of the group and the
			 * beginning of the "after" context. Some splits are
			 * aesthetically better and some are worse. We compute
			 * a badness "score" for each split, and add the scores
			 * for the two splits to define a "score" for each
			 * position that the group can be shifted to. Then we
			 * pick the shift with the lowest score.
			 */
			long shift, best_shift = -1;
			struct split_score best_score;

			/*
			 * Start at the highest candidate end position allowed
			 * by earliest_end, the group size and the maximum
			 * sliding distance.
			 */
			shift = earliest_end;
			if (g.end - groupsize - 1 > shift)
				shift = g.end - groupsize - 1;
			if (g.end - INDENT_HEURISTIC_MAX_SLIDING > shift)
				shift = g.end - INDENT_HEURISTIC_MAX_SLIDING;
			for (; shift <= g.end; shift++) {
				struct split_measurement m;
				struct split_score score = {0, 0};

				/* Score the split below and the split above the group. */
				measure_split(xdf, shift, &m);
				score_add_split(&m, &score);
				measure_split(xdf, shift - groupsize, &m);
				score_add_split(&m, &score);
				/*
				 * best_score is only read once best_shift has
				 * been set; on a tie the later shift wins.
				 */
				if (best_shift == -1 ||
				    score_cmp(&score, &best_score) <= 0) {
					best_score.effective_indent = score.effective_indent;
					best_score.penalty = score.penalty;
					best_shift = shift;
				}
			}

			/* Slide the group back up to the best position found. */
			while (g.end > best_shift) {
				if (group_slide_up(xdf, &g))
					BUG("best shift unreached");
				if (group_previous(xdfo, &go))
					BUG("group sync broken sliding to blank line");
			}
		}

	next:
		/* Move past the just-processed group: */
		if (group_next(xdf, &g))
			break;
		if (group_next(xdfo, &go))
			BUG("group sync broken moving to next group");
	}

	/* The other file must have run out of groups at the same time. */
	if (!group_next(xdfo, &go))
		BUG("group sync broken at end of file");

	return 0;
}
 931
 932
 933int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr) {
 934	xdchange_t *cscr = NULL, *xch;
 935	bool *changed1 = xe->xdf1.changed, *changed2 = xe->xdf2.changed;
 936	long i1, i2, l1, l2;
 937
 938	/*
 939	 * Trivial. Collects "groups" of changes and creates an edit script.
 940	 */
 941	for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 || i2 >= 0; i1--, i2--)
 942		if (changed1[i1 - 1] || changed2[i2 - 1]) {
 943			for (l1 = i1; changed1[i1 - 1]; i1--);
 944			for (l2 = i2; changed2[i2 - 1]; i2--);
 945
 946			if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
 947				xdl_free_script(cscr);
 948				return -1;
 949			}
 950			cscr = xch;
 951		}
 952
 953	*xscr = cscr;
 954
 955	return 0;
 956}
 957
 958
 959void xdl_free_script(xdchange_t *xscr) {
 960	xdchange_t *xch;
 961
 962	while ((xch = xscr) != NULL) {
 963		xscr = xscr->next;
 964		xdl_free(xch);
 965	}
 966}
 967
 968static int xdl_call_hunk_func(xdfenv_t *xe UNUSED, xdchange_t *xscr, xdemitcb_t *ecb,
 969			      xdemitconf_t const *xecfg)
 970{
 971	xdchange_t *xch, *xche;
 972
 973	for (xch = xscr; xch; xch = xche->next) {
 974		xche = xdl_get_hunk(&xch, xecfg);
 975		if (!xch)
 976			break;
 977		if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
 978				     xch->i2, xche->i2 + xche->chg2 - xch->i2,
 979				     ecb->priv) < 0)
 980			return -1;
 981	}
 982	return 0;
 983}
 984
 985static void xdl_mark_ignorable_lines(xdchange_t *xscr, xdfenv_t *xe, long flags)
 986{
 987	xdchange_t *xch;
 988
 989	for (xch = xscr; xch; xch = xch->next) {
 990		int ignore = 1;
 991		xrecord_t *rec;
 992		long i;
 993
 994		rec = &xe->xdf1.recs[xch->i1];
 995		for (i = 0; i < xch->chg1 && ignore; i++)
 996			ignore = xdl_blankline(rec[i].ptr, rec[i].size, flags);
 997
 998		rec = &xe->xdf2.recs[xch->i2];
 999		for (i = 0; i < xch->chg2 && ignore; i++)
1000			ignore = xdl_blankline(rec[i].ptr, rec[i].size, flags);
1001
1002		xch->ignore = ignore;
1003	}
1004}
1005
1006static int record_matches_regex(xrecord_t *rec, xpparam_t const *xpp) {
1007	regmatch_t regmatch;
1008	size_t i;
1009
1010	for (i = 0; i < xpp->ignore_regex_nr; i++)
1011		if (!regexec_buf(xpp->ignore_regex[i], rec->ptr, rec->size, 1,
1012				 &regmatch, 0))
1013			return 1;
1014
1015	return 0;
1016}
1017
1018static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe,
1019				     xpparam_t const *xpp)
1020{
1021	xdchange_t *xch;
1022
1023	for (xch = xscr; xch; xch = xch->next) {
1024		xrecord_t *rec;
1025		int ignore = 1;
1026		long i;
1027
1028		/*
1029		 * Do not override --ignore-blank-lines.
1030		 */
1031		if (xch->ignore)
1032			continue;
1033
1034		rec = &xe->xdf1.recs[xch->i1];
1035		for (i = 0; i < xch->chg1 && ignore; i++)
1036			ignore = record_matches_regex(&rec[i], xpp);
1037
1038		rec = &xe->xdf2.recs[xch->i2];
1039		for (i = 0; i < xch->chg2 && ignore; i++)
1040			ignore = record_matches_regex(&rec[i], xpp);
1041
1042		xch->ignore = ignore;
1043	}
1044}
1045
1046int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
1047	     xdemitconf_t const *xecfg, xdemitcb_t *ecb) {
1048	xdchange_t *xscr;
1049	xdfenv_t xe;
1050	emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;
1051
1052	if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {
1053
1054		return -1;
1055	}
1056	if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 ||
1057	    xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 ||
1058	    xdl_build_script(&xe, &xscr) < 0) {
1059
1060		xdl_free_env(&xe);
1061		return -1;
1062	}
1063	if (xscr) {
1064		if (xpp->flags & XDF_IGNORE_BLANK_LINES)
1065			xdl_mark_ignorable_lines(xscr, &xe, xpp->flags);
1066
1067		if (xpp->ignore_regex)
1068			xdl_mark_ignorable_regex(xscr, &xe, xpp);
1069
1070		if (ef(&xe, xscr, ecb, xecfg) < 0) {
1071
1072			xdl_free_script(xscr);
1073			xdl_free_env(&xe);
1074			return -1;
1075		}
1076		xdl_free_script(xscr);
1077	}
1078	xdl_free_env(&xe);
1079
1080	return 0;
1081}