lib/src/diff.rs at globpattern · ilyagr.bsky.social/jj

ilyagr.bsky.social / jj
fork atom
just playing with tangled
fork atom
jj / lib / src / diff.rs
at globpattern 1776 lines 62 kB view raw
wrap content
Ilya Grigoriev cleanup: enable `unused_trait_names` clippy lint and run `clippy --fix` 1y ago
acaedc33
   1// Copyright 2021 The Jujutsu Authors
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7// https://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15#![allow(missing_docs)]
  16
  17use std::collections::BTreeMap;
  18use std::hash::BuildHasher;
  19use std::hash::Hash;
  20use std::hash::Hasher;
  21use std::hash::RandomState;
  22use std::iter;
  23use std::ops::Range;
  24use std::slice;
  25
  26use bstr::BStr;
  27use hashbrown::HashTable;
  28use itertools::Itertools as _;
  29use smallvec::smallvec;
  30use smallvec::SmallVec;
  31
  32pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
  33    text.split_inclusive(|b| *b == b'\n')
  34        .scan(0, |total, line| {
  35            let start = *total;
  36            *total += line.len();
  37            Some(start..*total)
  38        })
  39        .collect()
  40}
  41
  42fn is_word_byte(b: u8) -> bool {
  43    // TODO: Make this configurable (probably higher up in the call stack)
  44    matches!(
  45        b,
  46        // Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are
  47        // treated as a single unit.
  48        b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff'
  49    )
  50}
  51
  52pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
  53    let mut word_ranges = vec![];
  54    let mut word_start_pos = 0;
  55    let mut in_word = false;
  56    for (i, b) in text.iter().enumerate() {
  57        if in_word && !is_word_byte(*b) {
  58            in_word = false;
  59            word_ranges.push(word_start_pos..i);
  60            word_start_pos = i;
  61        } else if !in_word && is_word_byte(*b) {
  62            in_word = true;
  63            word_start_pos = i;
  64        }
  65    }
  66    if in_word && word_start_pos < text.len() {
  67        word_ranges.push(word_start_pos..text.len());
  68    }
  69    word_ranges
  70}
  71
  72pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
  73    text.iter()
  74        .positions(|b| !is_word_byte(*b))
  75        .map(|i| i..i + 1)
  76        .collect()
  77}
  78
  79fn bytes_ignore_all_whitespace(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> {
  80    text.iter().copied().filter(|b| !b.is_ascii_whitespace())
  81}
  82
  83fn bytes_ignore_whitespace_amount(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> {
  84    let mut prev_was_space = false;
  85    text.iter().filter_map(move |&b| {
  86        let was_space = prev_was_space;
  87        let is_space = b.is_ascii_whitespace();
  88        prev_was_space = is_space;
  89        match (was_space, is_space) {
  90            (_, false) => Some(b),
  91            (false, true) => Some(b' '),
  92            (true, true) => None,
  93        }
  94    })
  95}
  96
  97fn hash_with_length_suffix<I, H>(data: I, state: &mut H)
  98where
  99    I: IntoIterator,
 100    I::Item: Hash,
 101    H: Hasher,
 102{
 103    let mut len: usize = 0;
 104    for d in data {
 105        d.hash(state);
 106        len += 1;
 107    }
 108    state.write_usize(len);
 109}
 110
 111/// Compares byte sequences based on a certain equivalence property.
 112///
 113/// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison
 114/// object for the following reasons:
 115///
 116/// a. If it were newtype, a generic `wrap` function would be needed. It
 117///    couldn't be expressed as a simple closure:
 118///    `for<'a> Fn(&'a [u8]) -> ???<'a>`
 119/// b. Dynamic comparison object can be implemented intuitively. For example,
 120///    `pattern: &Regex` would have to be copied to all newtype instances if it
 121///    were newtype.
 122/// c. Hash values can be cached if hashing is controlled externally.
 123pub trait CompareBytes {
 124    /// Returns true if `left` and `right` are equivalent.
 125    fn eq(&self, left: &[u8], right: &[u8]) -> bool;
 126
 127    /// Generates hash which respects the following property:
 128    /// `eq(left, right) => hash(left) == hash(right)`
 129    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
 130}
 131
 132// An instance might have e.g. Regex pattern, which can't be trivially copied.
 133// Such comparison object can be passed by reference.
 134impl<C: CompareBytes + ?Sized> CompareBytes for &C {
 135    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
 136        <C as CompareBytes>::eq(self, left, right)
 137    }
 138
 139    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
 140        <C as CompareBytes>::hash(self, text, state);
 141    }
 142}
 143
 144/// Compares byte sequences literally.
 145#[derive(Clone, Debug, Default)]
 146pub struct CompareBytesExactly;
 147
 148impl CompareBytes for CompareBytesExactly {
 149    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
 150        left == right
 151    }
 152
 153    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
 154        text.hash(state);
 155    }
 156}
 157
 158/// Compares byte sequences ignoring any whitespace occurrences.
 159#[derive(Clone, Debug, Default)]
 160pub struct CompareBytesIgnoreAllWhitespace;
 161
 162impl CompareBytes for CompareBytesIgnoreAllWhitespace {
 163    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
 164        bytes_ignore_all_whitespace(left).eq(bytes_ignore_all_whitespace(right))
 165    }
 166
 167    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
 168        hash_with_length_suffix(bytes_ignore_all_whitespace(text), state);
 169    }
 170}
 171
 172/// Compares byte sequences ignoring changes in whitespace amount.
 173#[derive(Clone, Debug, Default)]
 174pub struct CompareBytesIgnoreWhitespaceAmount;
 175
 176impl CompareBytes for CompareBytesIgnoreWhitespaceAmount {
 177    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
 178        bytes_ignore_whitespace_amount(left).eq(bytes_ignore_whitespace_amount(right))
 179    }
 180
 181    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
 182        hash_with_length_suffix(bytes_ignore_whitespace_amount(text), state);
 183    }
 184}
 185
 186// Not implementing Eq because the text should be compared by WordComparator.
 187#[derive(Clone, Copy, Debug)]
 188struct HashedWord<'input> {
 189    hash: u64,
 190    text: &'input BStr,
 191}
 192
 193/// Compares words (or tokens) under a certain hasher configuration.
 194#[derive(Clone, Debug, Default)]
 195struct WordComparator<C, S> {
 196    compare: C,
 197    hash_builder: S,
 198}
 199
 200impl<C: CompareBytes> WordComparator<C, RandomState> {
 201    fn new(compare: C) -> Self {
 202        WordComparator {
 203            compare,
 204            // TODO: switch to ahash for better performance?
 205            hash_builder: RandomState::new(),
 206        }
 207    }
 208}
 209
 210impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
 211    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
 212        self.compare.eq(left, right)
 213    }
 214
 215    fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool {
 216        left.hash == right.hash && self.compare.eq(left.text, right.text)
 217    }
 218
 219    fn hash_one(&self, text: &[u8]) -> u64 {
 220        let mut state = self.hash_builder.build_hasher();
 221        self.compare.hash(text, &mut state);
 222        state.finish()
 223    }
 224}
 225
 226/// Index in a list of word (or token) ranges in `DiffSource`.
 227#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 228struct WordPosition(usize);
 229
 230/// Index in a list of word (or token) ranges in `LocalDiffSource`.
 231#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 232struct LocalWordPosition(usize);
 233
 234#[derive(Clone, Debug)]
 235struct DiffSource<'input, 'aux> {
 236    text: &'input BStr,
 237    ranges: &'aux [Range<usize>],
 238    hashes: Vec<u64>,
 239}
 240
 241impl<'input, 'aux> DiffSource<'input, 'aux> {
 242    fn new<T: AsRef<[u8]> + ?Sized, C: CompareBytes, S: BuildHasher>(
 243        text: &'input T,
 244        ranges: &'aux [Range<usize>],
 245        comp: &WordComparator<C, S>,
 246    ) -> Self {
 247        let text = BStr::new(text);
 248        let hashes = ranges
 249            .iter()
 250            .map(|range| comp.hash_one(&text[range.clone()]))
 251            .collect();
 252        DiffSource {
 253            text,
 254            ranges,
 255            hashes,
 256        }
 257    }
 258
 259    fn local(&self) -> LocalDiffSource<'input, '_> {
 260        LocalDiffSource {
 261            text: self.text,
 262            ranges: self.ranges,
 263            hashes: &self.hashes,
 264            global_offset: WordPosition(0),
 265        }
 266    }
 267
 268    fn range_at(&self, position: WordPosition) -> Range<usize> {
 269        self.ranges[position.0].clone()
 270    }
 271}
 272
 273#[derive(Clone, Debug)]
 274struct LocalDiffSource<'input, 'aux> {
 275    text: &'input BStr,
 276    ranges: &'aux [Range<usize>],
 277    hashes: &'aux [u64],
 278    /// The number of preceding word ranges excluded from the self `ranges`.
 279    global_offset: WordPosition,
 280}
 281
 282impl<'input> LocalDiffSource<'input, '_> {
 283    fn narrowed(&self, positions: Range<LocalWordPosition>) -> Self {
 284        LocalDiffSource {
 285            text: self.text,
 286            ranges: &self.ranges[positions.start.0..positions.end.0],
 287            hashes: &self.hashes[positions.start.0..positions.end.0],
 288            global_offset: self.map_to_global(positions.start),
 289        }
 290    }
 291
 292    fn map_to_global(&self, position: LocalWordPosition) -> WordPosition {
 293        WordPosition(self.global_offset.0 + position.0)
 294    }
 295
 296    fn hashed_words(
 297        &self,
 298    ) -> impl DoubleEndedIterator<Item = HashedWord<'input>> + ExactSizeIterator + use<'_, 'input>
 299    {
 300        iter::zip(self.ranges, self.hashes).map(|(range, &hash)| {
 301            let text = &self.text[range.clone()];
 302            HashedWord { hash, text }
 303        })
 304    }
 305}
 306
 307struct Histogram<'input> {
 308    word_to_positions: HashTable<HistogramEntry<'input>>,
 309}
 310
 311// Many of the words are unique. We can inline up to 2 word positions (16 bytes
 312// on 64-bit platform) in SmallVec for free.
 313type HistogramEntry<'input> = (HashedWord<'input>, SmallVec<[LocalWordPosition; 2]>);
 314
 315impl<'input> Histogram<'input> {
 316    fn calculate<C: CompareBytes, S: BuildHasher>(
 317        source: &LocalDiffSource<'input, '_>,
 318        comp: &WordComparator<C, S>,
 319        max_occurrences: usize,
 320    ) -> Self {
 321        let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new();
 322        for (i, word) in source.hashed_words().enumerate() {
 323            let pos = LocalWordPosition(i);
 324            word_to_positions
 325                .entry(
 326                    word.hash,
 327                    |(w, _)| comp.eq(w.text, word.text),
 328                    |(w, _)| w.hash,
 329                )
 330                .and_modify(|(_, positions)| {
 331                    // Allow one more than max_occurrences, so we can later skip
 332                    // those with more than max_occurrences
 333                    if positions.len() <= max_occurrences {
 334                        positions.push(pos);
 335                    }
 336                })
 337                .or_insert_with(|| (word, smallvec![pos]));
 338        }
 339        Histogram { word_to_positions }
 340    }
 341
 342    fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> {
 343        let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new();
 344        for entry in &self.word_to_positions {
 345            let (_, positions) = entry;
 346            let entries = count_to_entries.entry(positions.len()).or_default();
 347            entries.push(entry);
 348        }
 349        count_to_entries
 350    }
 351
 352    fn positions_by_word<C: CompareBytes, S: BuildHasher>(
 353        &self,
 354        word: HashedWord<'input>,
 355        comp: &WordComparator<C, S>,
 356    ) -> Option<&[LocalWordPosition]> {
 357        let (_, positions) = self
 358            .word_to_positions
 359            .find(word.hash, |(w, _)| comp.eq(w.text, word.text))?;
 360        Some(positions)
 361    }
 362}
 363
 364/// Finds the LCS given a array where the value of `input[i]` indicates that
 365/// the position of element `i` in the right array is at position `input[i]` in
 366/// the left array.
 367///
 368/// For example (some have multiple valid outputs):
 369///
 370/// [0,1,2] => [(0,0),(1,1),(2,2)]
 371/// [2,1,0] => [(0,2)]
 372/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)]
 373/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(4,2),(5,5),(6,6)]
 374fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
 375    if input.is_empty() {
 376        return vec![];
 377    }
 378
 379    let mut chain = vec![(0, 0, 0); input.len()];
 380    let mut global_longest = 0;
 381    let mut global_longest_right_pos = 0;
 382    for (right_pos, &left_pos) in input.iter().enumerate() {
 383        let mut longest_from_here = 1;
 384        let mut previous_right_pos = usize::MAX;
 385        for i in (0..right_pos).rev() {
 386            let (previous_len, previous_left_pos, _) = chain[i];
 387            if previous_left_pos < left_pos {
 388                let len = previous_len + 1;
 389                if len > longest_from_here {
 390                    longest_from_here = len;
 391                    previous_right_pos = i;
 392                    if len > global_longest {
 393                        global_longest = len;
 394                        global_longest_right_pos = right_pos;
 395                        // If this is the longest chain globally so far, we cannot find a
 396                        // longer one by using a previous value, so break early.
 397                        break;
 398                    }
 399                }
 400            }
 401        }
 402        chain[right_pos] = (longest_from_here, left_pos, previous_right_pos);
 403    }
 404
 405    let mut result = vec![];
 406    let mut right_pos = global_longest_right_pos;
 407    loop {
 408        let (_, left_pos, previous_right_pos) = chain[right_pos];
 409        result.push((left_pos, right_pos));
 410        if previous_right_pos == usize::MAX {
 411            break;
 412        }
 413        right_pos = previous_right_pos;
 414    }
 415    result.reverse();
 416
 417    result
 418}
 419
 420/// Finds unchanged word (or token) positions among the ones given as
 421/// arguments. The data between those words is ignored.
 422fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>(
 423    found_positions: &mut Vec<(WordPosition, WordPosition)>,
 424    left: &LocalDiffSource,
 425    right: &LocalDiffSource,
 426    comp: &WordComparator<C, S>,
 427) {
 428    if left.ranges.is_empty() || right.ranges.is_empty() {
 429        return;
 430    }
 431
 432    // Prioritize LCS-based algorithm than leading/trailing matches
 433    let old_len = found_positions.len();
 434    collect_unchanged_words_lcs(found_positions, left, right, comp);
 435    if found_positions.len() != old_len {
 436        return;
 437    }
 438
 439    // Trim leading common ranges (i.e. grow previous unchanged region)
 440    let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words())
 441        .take_while(|&(l, r)| comp.eq_hashed(l, r))
 442        .count();
 443    let left_hashed_words = left.hashed_words().skip(common_leading_len);
 444    let right_hashed_words = right.hashed_words().skip(common_leading_len);
 445
 446    // Trim trailing common ranges (i.e. grow next unchanged region)
 447    let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev())
 448        .take_while(|&(l, r)| comp.eq_hashed(l, r))
 449        .count();
 450
 451    found_positions.extend(itertools::chain(
 452        (0..common_leading_len).map(|i| {
 453            (
 454                left.map_to_global(LocalWordPosition(i)),
 455                right.map_to_global(LocalWordPosition(i)),
 456            )
 457        }),
 458        (1..=common_trailing_len).rev().map(|i| {
 459            (
 460                left.map_to_global(LocalWordPosition(left.ranges.len() - i)),
 461                right.map_to_global(LocalWordPosition(right.ranges.len() - i)),
 462            )
 463        }),
 464    ));
 465}
 466
 467fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>(
 468    found_positions: &mut Vec<(WordPosition, WordPosition)>,
 469    left: &LocalDiffSource,
 470    right: &LocalDiffSource,
 471    comp: &WordComparator<C, S>,
 472) {
 473    let max_occurrences = 100;
 474    let left_histogram = Histogram::calculate(left, comp, max_occurrences);
 475    let left_count_to_entries = left_histogram.build_count_to_entries();
 476    if *left_count_to_entries.keys().next().unwrap() > max_occurrences {
 477        // If there are very many occurrences of all words, then we just give up.
 478        return;
 479    }
 480    let right_histogram = Histogram::calculate(right, comp, max_occurrences);
 481    // Look for words with few occurrences in `left` (could equally well have picked
 482    // `right`?). If any of them also occur in `right`, then we add the words to
 483    // the LCS.
 484    let Some(uncommon_shared_word_positions) =
 485        left_count_to_entries.values().find_map(|left_entries| {
 486            let mut both_positions = left_entries
 487                .iter()
 488                .filter_map(|&(word, left_positions)| {
 489                    let right_positions = right_histogram.positions_by_word(*word, comp)?;
 490                    (left_positions.len() == right_positions.len())
 491                        .then_some((left_positions, right_positions))
 492                })
 493                .peekable();
 494            both_positions.peek().is_some().then_some(both_positions)
 495        })
 496    else {
 497        return;
 498    };
 499
 500    // [(index into ranges, serial to identify {word, occurrence #})]
 501    let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) =
 502        uncommon_shared_word_positions
 503            .flat_map(|(lefts, rights)| iter::zip(lefts, rights))
 504            .enumerate()
 505            .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial)))
 506            .unzip();
 507    left_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
 508    right_positions.sort_unstable_by_key(|&(pos, _serial)| pos);
 509    let left_index_by_right_index: Vec<usize> = {
 510        let mut left_index_map = vec![0; left_positions.len()];
 511        for (i, &(_pos, serial)) in left_positions.iter().enumerate() {
 512            left_index_map[serial] = i;
 513        }
 514        right_positions
 515            .iter()
 516            .map(|&(_pos, serial)| left_index_map[serial])
 517            .collect()
 518    };
 519
 520    let lcs = find_lcs(&left_index_by_right_index);
 521
 522    // Produce output word positions, recursing into the modified areas between
 523    // the elements in the LCS.
 524    let mut previous_left_position = LocalWordPosition(0);
 525    let mut previous_right_position = LocalWordPosition(0);
 526    for (left_index, right_index) in lcs {
 527        let (left_position, _) = left_positions[left_index];
 528        let (right_position, _) = right_positions[right_index];
 529        collect_unchanged_words(
 530            found_positions,
 531            &left.narrowed(previous_left_position..left_position),
 532            &right.narrowed(previous_right_position..right_position),
 533            comp,
 534        );
 535        found_positions.push((
 536            left.map_to_global(left_position),
 537            right.map_to_global(right_position),
 538        ));
 539        previous_left_position = LocalWordPosition(left_position.0 + 1);
 540        previous_right_position = LocalWordPosition(right_position.0 + 1);
 541    }
 542    // Also recurse into range at end (after common ranges).
 543    collect_unchanged_words(
 544        found_positions,
 545        &left.narrowed(previous_left_position..LocalWordPosition(left.ranges.len())),
 546        &right.narrowed(previous_right_position..LocalWordPosition(right.ranges.len())),
 547        comp,
 548    );
 549}
 550
 551/// Intersects two sorted sequences of `(base, other)` word positions by
 552/// `base`. `base` positions should refer to the same source text.
 553fn intersect_unchanged_words(
 554    current_positions: Vec<(WordPosition, Vec<WordPosition>)>,
 555    new_positions: &[(WordPosition, WordPosition)],
 556) -> Vec<(WordPosition, Vec<WordPosition>)> {
 557    itertools::merge_join_by(
 558        current_positions,
 559        new_positions,
 560        |(cur_base_pos, _), (new_base_pos, _)| cur_base_pos.cmp(new_base_pos),
 561    )
 562    .filter_map(|entry| entry.both())
 563    .map(|((base_pos, mut other_positions), &(_, new_other_pos))| {
 564        other_positions.push(new_other_pos);
 565        (base_pos, other_positions)
 566    })
 567    .collect()
 568}
 569
 570#[derive(Clone, PartialEq, Eq, Debug)]
 571struct UnchangedRange {
 572    // Inline up to two sides (base + one other)
 573    base: Range<usize>,
 574    others: SmallVec<[Range<usize>; 1]>,
 575}
 576
 577impl UnchangedRange {
 578    /// Translates word positions to byte ranges in the source texts.
 579    fn from_word_positions(
 580        base_source: &DiffSource,
 581        other_sources: &[DiffSource],
 582        base_position: WordPosition,
 583        other_positions: &[WordPosition],
 584    ) -> Self {
 585        assert_eq!(other_sources.len(), other_positions.len());
 586        let base = base_source.range_at(base_position);
 587        let others = iter::zip(other_sources, other_positions)
 588            .map(|(source, pos)| source.range_at(*pos))
 589            .collect();
 590        UnchangedRange { base, others }
 591    }
 592
 593    fn is_all_empty(&self) -> bool {
 594        self.base.is_empty() && self.others.iter().all(|r| r.is_empty())
 595    }
 596}
 597
 598/// Takes any number of inputs and finds regions that are them same between all
 599/// of them.
 600#[derive(Clone, Debug)]
 601pub struct Diff<'input> {
 602    base_input: &'input BStr,
 603    other_inputs: SmallVec<[&'input BStr; 1]>,
 604    /// Sorted list of ranges of unchanged regions in bytes.
 605    ///
 606    /// The list should never be empty. The first and the last region may be
 607    /// empty if inputs start/end with changes.
 608    unchanged_regions: Vec<UnchangedRange>,
 609}
 610
 611impl<'input> Diff<'input> {
 612    pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
 613        inputs: impl IntoIterator<Item = &'input T>,
 614        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
 615        compare: impl CompareBytes,
 616    ) -> Self {
 617        let mut inputs = inputs.into_iter().map(BStr::new);
 618        let base_input = inputs.next().expect("inputs must not be empty");
 619        let other_inputs: SmallVec<[&BStr; 1]> = inputs.collect();
 620        // First tokenize each input
 621        let base_token_ranges: Vec<Range<usize>>;
 622        let other_token_ranges: Vec<Vec<Range<usize>>>;
 623        // No need to tokenize if one of the inputs is empty. Non-empty inputs
 624        // are all different as long as the tokenizer emits non-empty ranges.
 625        // This means "" and " " are different even if the compare function is
 626        // ignore-whitespace. They are tokenized as [] and [" "] respectively.
 627        if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) {
 628            base_token_ranges = vec![];
 629            other_token_ranges = std::iter::repeat_n(vec![], other_inputs.len()).collect();
 630        } else {
 631            base_token_ranges = tokenizer(base_input);
 632            other_token_ranges = other_inputs
 633                .iter()
 634                .map(|other_input| tokenizer(other_input))
 635                .collect();
 636        }
 637        Self::with_inputs_and_token_ranges(
 638            base_input,
 639            other_inputs,
 640            &base_token_ranges,
 641            &other_token_ranges,
 642            compare,
 643        )
 644    }
 645
 646    fn with_inputs_and_token_ranges(
 647        base_input: &'input BStr,
 648        other_inputs: SmallVec<[&'input BStr; 1]>,
 649        base_token_ranges: &[Range<usize>],
 650        other_token_ranges: &[Vec<Range<usize>>],
 651        compare: impl CompareBytes,
 652    ) -> Self {
 653        assert_eq!(other_inputs.len(), other_token_ranges.len());
 654        let comp = WordComparator::new(compare);
 655        let base_source = DiffSource::new(base_input, base_token_ranges, &comp);
 656        let other_sources = iter::zip(&other_inputs, other_token_ranges)
 657            .map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp))
 658            .collect_vec();
 659        let unchanged_regions = match &*other_sources {
 660            // Consider the whole range of the base input as unchanged compared
 661            // to itself.
 662            [] => {
 663                let whole_range = UnchangedRange {
 664                    base: 0..base_source.text.len(),
 665                    others: smallvec![],
 666                };
 667                vec![whole_range]
 668            }
 669            // Diff each other input against the base. Intersect the previously
 670            // found ranges with the ranges in the diff.
 671            [first_other_source, tail_other_sources @ ..] => {
 672                let mut unchanged_regions = Vec::new();
 673                // Add an empty range at the start to make life easier for hunks().
 674                unchanged_regions.push(UnchangedRange {
 675                    base: 0..0,
 676                    others: smallvec![0..0; other_inputs.len()],
 677                });
 678                let mut first_positions = Vec::new();
 679                collect_unchanged_words(
 680                    &mut first_positions,
 681                    &base_source.local(),
 682                    &first_other_source.local(),
 683                    &comp,
 684                );
 685                if tail_other_sources.is_empty() {
 686                    unchanged_regions.extend(first_positions.iter().map(
 687                        |&(base_pos, other_pos)| {
 688                            UnchangedRange::from_word_positions(
 689                                &base_source,
 690                                &other_sources,
 691                                base_pos,
 692                                &[other_pos],
 693                            )
 694                        },
 695                    ));
 696                } else {
 697                    let first_positions = first_positions
 698                        .iter()
 699                        .map(|&(base_pos, other_pos)| (base_pos, vec![other_pos]))
 700                        .collect();
 701                    let intersected_positions = tail_other_sources.iter().fold(
 702                        first_positions,
 703                        |current_positions, other_source| {
 704                            let mut new_positions = Vec::new();
 705                            collect_unchanged_words(
 706                                &mut new_positions,
 707                                &base_source.local(),
 708                                &other_source.local(),
 709                                &comp,
 710                            );
 711                            intersect_unchanged_words(current_positions, &new_positions)
 712                        },
 713                    );
 714                    unchanged_regions.extend(intersected_positions.iter().map(
 715                        |(base_pos, other_positions)| {
 716                            UnchangedRange::from_word_positions(
 717                                &base_source,
 718                                &other_sources,
 719                                *base_pos,
 720                                other_positions,
 721                            )
 722                        },
 723                    ));
 724                };
 725                // Add an empty range at the end to make life easier for hunks().
 726                unchanged_regions.push(UnchangedRange {
 727                    base: base_input.len()..base_input.len(),
 728                    others: other_inputs
 729                        .iter()
 730                        .map(|input| input.len()..input.len())
 731                        .collect(),
 732                });
 733                unchanged_regions
 734            }
 735        };
 736
 737        let mut diff = Self {
 738            base_input,
 739            other_inputs,
 740            unchanged_regions,
 741        };
 742        diff.compact_unchanged_regions();
 743        diff
 744    }
 745
 746    pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
 747        inputs: impl IntoIterator<Item = &'input T>,
 748    ) -> Self {
 749        Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly)
 750    }
 751
 752    /// Compares `inputs` line by line.
 753    pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
 754        inputs: impl IntoIterator<Item = &'input T>,
 755    ) -> Self {
 756        Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly)
 757    }
 758
 759    /// Compares `inputs` word by word.
 760    ///
 761    /// The `inputs` is usually a changed hunk (e.g. a `DiffHunk::Different`)
 762    /// that was the output from a line-by-line diff.
 763    pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>(
 764        inputs: impl IntoIterator<Item = &'input T>,
 765    ) -> Self {
 766        let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly);
 767        diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
 768        diff
 769    }
 770
 771    /// Returns iterator over matching and different texts.
 772    pub fn hunks(&self) -> DiffHunkIterator<'_, 'input> {
 773        let ranges = self.hunk_ranges();
 774        DiffHunkIterator { diff: self, ranges }
 775    }
 776
 777    /// Returns iterator over matching and different ranges in bytes.
 778    pub fn hunk_ranges(&self) -> DiffHunkRangeIterator<'_> {
 779        DiffHunkRangeIterator::new(self)
 780    }
 781
 782    /// Returns contents at the unchanged `range`.
 783    fn hunk_at<'a, 'b>(
 784        &'a self,
 785        range: &'b UnchangedRange,
 786    ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> {
 787        itertools::chain(
 788            iter::once(&self.base_input[range.base.clone()]),
 789            iter::zip(&self.other_inputs, &range.others).map(|(input, r)| &input[r.clone()]),
 790        )
 791    }
 792
 793    /// Returns contents between the `previous` ends and the `current` starts.
 794    fn hunk_between<'a, 'b>(
 795        &'a self,
 796        previous: &'b UnchangedRange,
 797        current: &'b UnchangedRange,
 798    ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> {
 799        itertools::chain(
 800            iter::once(&self.base_input[previous.base.end..current.base.start]),
 801            itertools::izip!(&self.other_inputs, &previous.others, &current.others)
 802                .map(|(input, prev, cur)| &input[prev.end..cur.start]),
 803        )
 804    }
 805
 806    /// Uses the given tokenizer to split the changed regions into smaller
 807    /// regions. Then tries to finds unchanged regions among them.
 808    pub fn refine_changed_regions(
 809        &mut self,
 810        tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
 811        compare: impl CompareBytes,
 812    ) {
 813        let mut new_unchanged_ranges = vec![self.unchanged_regions[0].clone()];
 814        for window in self.unchanged_regions.windows(2) {
 815            let [previous, current]: &[_; 2] = window.try_into().unwrap();
 816            // For the changed region between the previous region and the current one,
 817            // create a new Diff instance. Then adjust the start positions and
 818            // offsets to be valid in the context of the larger Diff instance
 819            // (`self`).
 820            let refined_diff =
 821                Diff::for_tokenizer(self.hunk_between(previous, current), &tokenizer, &compare);
 822            for refined in &refined_diff.unchanged_regions {
 823                let new_base_start = refined.base.start + previous.base.end;
 824                let new_base_end = refined.base.end + previous.base.end;
 825                let new_others = iter::zip(&refined.others, &previous.others)
 826                    .map(|(refi, prev)| (refi.start + prev.end)..(refi.end + prev.end))
 827                    .collect();
 828                new_unchanged_ranges.push(UnchangedRange {
 829                    base: new_base_start..new_base_end,
 830                    others: new_others,
 831                });
 832            }
 833            new_unchanged_ranges.push(current.clone());
 834        }
 835        self.unchanged_regions = new_unchanged_ranges;
 836        self.compact_unchanged_regions();
 837    }
 838
 839    fn compact_unchanged_regions(&mut self) {
 840        let mut compacted = vec![];
 841        let mut maybe_previous: Option<UnchangedRange> = None;
 842        for current in &self.unchanged_regions {
 843            if let Some(previous) = maybe_previous {
 844                if previous.base.end == current.base.start
 845                    && iter::zip(&previous.others, &current.others)
 846                        .all(|(prev, cur)| prev.end == cur.start)
 847                {
 848                    maybe_previous = Some(UnchangedRange {
 849                        base: previous.base.start..current.base.end,
 850                        others: iter::zip(&previous.others, &current.others)
 851                            .map(|(prev, cur)| prev.start..cur.end)
 852                            .collect(),
 853                    });
 854                    continue;
 855                }
 856                compacted.push(previous);
 857            }
 858            maybe_previous = Some(current.clone());
 859        }
 860        if let Some(previous) = maybe_previous {
 861            compacted.push(previous);
 862        }
 863        self.unchanged_regions = compacted;
 864    }
 865}
 866
 867/// Hunk texts.
 868#[derive(Clone, Debug, Eq, PartialEq)]
 869pub struct DiffHunk<'input> {
 870    pub kind: DiffHunkKind,
 871    pub contents: DiffHunkContentVec<'input>,
 872}
 873
 874impl<'input> DiffHunk<'input> {
 875    pub fn matching<T: AsRef<[u8]> + ?Sized + 'input>(
 876        contents: impl IntoIterator<Item = &'input T>,
 877    ) -> Self {
 878        DiffHunk {
 879            kind: DiffHunkKind::Matching,
 880            contents: contents.into_iter().map(BStr::new).collect(),
 881        }
 882    }
 883
 884    pub fn different<T: AsRef<[u8]> + ?Sized + 'input>(
 885        contents: impl IntoIterator<Item = &'input T>,
 886    ) -> Self {
 887        DiffHunk {
 888            kind: DiffHunkKind::Different,
 889            contents: contents.into_iter().map(BStr::new).collect(),
 890        }
 891    }
 892}
 893
 894#[derive(Clone, Copy, Debug, Eq, PartialEq)]
 895pub enum DiffHunkKind {
 896    Matching,
 897    Different,
 898}
 899
 900// Inline up to two sides
 901pub type DiffHunkContentVec<'input> = SmallVec<[&'input BStr; 2]>;
 902
 903/// Iterator over matching and different texts.
 904#[derive(Clone, Debug)]
 905pub struct DiffHunkIterator<'diff, 'input> {
 906    diff: &'diff Diff<'input>,
 907    ranges: DiffHunkRangeIterator<'diff>,
 908}
 909
 910impl<'input> Iterator for DiffHunkIterator<'_, 'input> {
 911    type Item = DiffHunk<'input>;
 912
 913    fn next(&mut self) -> Option<Self::Item> {
 914        self.ranges.next_with(
 915            |previous| {
 916                let contents = self.diff.hunk_at(previous).collect();
 917                let kind = DiffHunkKind::Matching;
 918                DiffHunk { kind, contents }
 919            },
 920            |previous, current| {
 921                let contents: DiffHunkContentVec =
 922                    self.diff.hunk_between(previous, current).collect();
 923                debug_assert!(
 924                    contents.iter().any(|content| !content.is_empty()),
 925                    "unchanged regions should have been compacted"
 926                );
 927                let kind = DiffHunkKind::Different;
 928                DiffHunk { kind, contents }
 929            },
 930        )
 931    }
 932}
 933
 934/// Hunk ranges in bytes.
 935#[derive(Clone, Debug, Eq, PartialEq)]
 936pub struct DiffHunkRange {
 937    pub kind: DiffHunkKind,
 938    pub ranges: DiffHunkRangeVec,
 939}
 940
 941// Inline up to two sides
 942pub type DiffHunkRangeVec = SmallVec<[Range<usize>; 2]>;
 943
 944/// Iterator over matching and different ranges in bytes.
 945#[derive(Clone, Debug)]
 946pub struct DiffHunkRangeIterator<'diff> {
 947    previous: &'diff UnchangedRange,
 948    unchanged_emitted: bool,
 949    unchanged_iter: slice::Iter<'diff, UnchangedRange>,
 950}
 951
 952impl<'diff> DiffHunkRangeIterator<'diff> {
 953    fn new(diff: &'diff Diff) -> Self {
 954        let mut unchanged_iter = diff.unchanged_regions.iter();
 955        let previous = unchanged_iter.next().unwrap();
 956        DiffHunkRangeIterator {
 957            previous,
 958            unchanged_emitted: previous.is_all_empty(),
 959            unchanged_iter,
 960        }
 961    }
 962
 963    fn next_with<T>(
 964        &mut self,
 965        hunk_at: impl FnOnce(&UnchangedRange) -> T,
 966        hunk_between: impl FnOnce(&UnchangedRange, &UnchangedRange) -> T,
 967    ) -> Option<T> {
 968        if !self.unchanged_emitted {
 969            self.unchanged_emitted = true;
 970            return Some(hunk_at(self.previous));
 971        }
 972        let current = self.unchanged_iter.next()?;
 973        let hunk = hunk_between(self.previous, current);
 974        self.previous = current;
 975        self.unchanged_emitted = self.previous.is_all_empty();
 976        Some(hunk)
 977    }
 978}
 979
 980impl Iterator for DiffHunkRangeIterator<'_> {
 981    type Item = DiffHunkRange;
 982
 983    fn next(&mut self) -> Option<Self::Item> {
 984        self.next_with(
 985            |previous| {
 986                let ranges = itertools::chain(iter::once(&previous.base), &previous.others)
 987                    .cloned()
 988                    .collect();
 989                let kind = DiffHunkKind::Matching;
 990                DiffHunkRange { kind, ranges }
 991            },
 992            |previous, current| {
 993                let ranges: DiffHunkRangeVec = itertools::chain(
 994                    iter::once(previous.base.end..current.base.start),
 995                    iter::zip(&previous.others, &current.others)
 996                        .map(|(prev, cur)| prev.end..cur.start),
 997                )
 998                .collect();
 999                debug_assert!(
1000                    ranges.iter().any(|range| !range.is_empty()),
1001                    "unchanged regions should have been compacted"
1002                );
1003                let kind = DiffHunkKind::Different;
1004                DiffHunkRange { kind, ranges }
1005            },
1006        )
1007    }
1008}
1009
1010/// Diffs slices of bytes.
1011///
1012/// The returned diff hunks may be any length (may span many lines or
1013/// may be only part of a line). This currently uses Histogram diff
1014/// (or maybe something similar; I'm not sure I understood the
1015/// algorithm correctly). It first diffs lines in the input and then
1016/// refines the changed ranges at the word level.
1017pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>(
1018    inputs: impl IntoIterator<Item = &'a T>,
1019) -> Vec<DiffHunk<'a>> {
1020    let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly);
1021    diff.refine_changed_regions(find_word_ranges, CompareBytesExactly);
1022    diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly);
1023    diff.hunks().collect()
1024}
1025
1026#[cfg(test)]
1027mod tests {
1028    use super::*;
1029
1030    // Extracted to a function because type inference is ambiguous due to
1031    // `impl PartialEq<aho_corasick::util::search::Span> for std::ops::Range<usize>`
1032    fn no_ranges() -> Vec<Range<usize>> {
1033        vec![]
1034    }
1035
1036    #[test]
1037    fn test_find_line_ranges_empty() {
1038        assert_eq!(find_line_ranges(b""), no_ranges());
1039    }
1040
1041    #[test]
1042    fn test_find_line_ranges_blank_line() {
1043        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
1044    }
1045
1046    #[test]
1047    fn test_find_line_ranges_missing_newline_at_eof() {
1048        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
1049    }
1050
1051    #[test]
1052    fn test_find_line_ranges_multiple_lines() {
1053        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
1054    }
1055
1056    #[test]
1057    fn test_find_word_ranges_empty() {
1058        assert_eq!(find_word_ranges(b""), no_ranges());
1059    }
1060
1061    #[test]
1062    fn test_find_word_ranges_single_word() {
1063        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
1064    }
1065
1066    #[test]
1067    fn test_find_word_ranges_no_word() {
1068        assert_eq!(find_word_ranges(b"+-*/"), no_ranges());
1069    }
1070
1071    #[test]
1072    fn test_find_word_ranges_word_then_non_word() {
1073        assert_eq!(find_word_ranges(b"Abc   "), vec![0..3]);
1074    }
1075
1076    #[test]
1077    fn test_find_word_ranges_non_word_then_word() {
1078        assert_eq!(find_word_ranges(b"   Abc"), vec![3..6]);
1079    }
1080
1081    #[test]
1082    fn test_find_word_ranges_multibyte() {
1083        assert_eq!(find_word_ranges("⊢".as_bytes()), vec![0..3]);
1084    }
1085
1086    #[test]
1087    fn test_find_lcs_empty() {
1088        let empty: Vec<(usize, usize)> = vec![];
1089        assert_eq!(find_lcs(&[]), empty);
1090    }
1091
1092    #[test]
1093    fn test_find_lcs_single_element() {
1094        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
1095    }
1096
1097    #[test]
1098    fn test_find_lcs_in_order() {
1099        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
1100    }
1101
1102    #[test]
1103    fn test_find_lcs_reverse_order() {
1104        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
1105    }
1106
1107    #[test]
1108    fn test_find_lcs_two_swapped() {
1109        assert_eq!(
1110            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
1111            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
1112        );
1113    }
1114
1115    #[test]
1116    fn test_find_lcs_element_moved_earlier() {
1117        assert_eq!(
1118            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
1119            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
1120        );
1121    }
1122
1123    #[test]
1124    fn test_find_lcs_element_moved_later() {
1125        assert_eq!(
1126            find_lcs(&[0, 1, 3, 4, 2, 5, 6]),
1127            vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)]
1128        );
1129    }
1130
1131    #[test]
1132    fn test_find_lcs_interleaved_longest_chains() {
1133        assert_eq!(
1134            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
1135            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
1136        );
1137    }
1138
1139    #[test]
1140    fn test_find_word_ranges_many_words() {
1141        assert_eq!(
1142            find_word_ranges(b"fn find_words(text: &[u8])"),
1143            vec![0..2, 3..13, 14..18, 22..24]
1144        );
1145    }
1146
1147    #[test]
1148    fn test_compare_bytes_ignore_all_whitespace() {
1149        let comp = WordComparator::new(CompareBytesIgnoreAllWhitespace);
1150        let hash = |data: &[u8]| comp.hash_one(data);
1151
1152        assert!(comp.eq(b"", b""));
1153        assert!(comp.eq(b"", b" "));
1154        assert!(comp.eq(b"\t", b"\r"));
1155        assert_eq!(hash(b""), hash(b""));
1156        assert_eq!(hash(b""), hash(b" "));
1157        assert_eq!(hash(b""), hash(b"\t"));
1158        assert_eq!(hash(b""), hash(b"\r"));
1159
1160        assert!(comp.eq(b"ab", b" a  b\t"));
1161        assert_eq!(hash(b"ab"), hash(b" a  b\t"));
1162
1163        assert!(!comp.eq(b"a", b""));
1164        assert!(!comp.eq(b"a", b" "));
1165        assert!(!comp.eq(b"a", b"ab"));
1166        assert!(!comp.eq(b"ab", b"ba"));
1167    }
1168
1169    #[test]
1170    fn test_compare_bytes_ignore_whitespace_amount() {
1171        let comp = WordComparator::new(CompareBytesIgnoreWhitespaceAmount);
1172        let hash = |data: &[u8]| comp.hash_one(data);
1173
1174        assert!(comp.eq(b"", b""));
1175        assert!(comp.eq(b"\n", b" \n"));
1176        assert!(comp.eq(b"\t", b"\r"));
1177        assert_eq!(hash(b""), hash(b""));
1178        assert_eq!(hash(b" "), hash(b"\n"));
1179        assert_eq!(hash(b" "), hash(b" \n"));
1180        assert_eq!(hash(b" "), hash(b"\t"));
1181        assert_eq!(hash(b" "), hash(b"\r"));
1182
1183        assert!(comp.eq(b"a b c\n", b"a  b\tc\r\n"));
1184        assert_eq!(hash(b"a b c\n"), hash(b"a  b\tc\r\n"));
1185
1186        assert!(!comp.eq(b"", b" "));
1187        assert!(!comp.eq(b"a", b""));
1188        assert!(!comp.eq(b"a", b" "));
1189        assert!(!comp.eq(b"a", b"a "));
1190        assert!(!comp.eq(b"a", b" a"));
1191        assert!(!comp.eq(b"a", b"ab"));
1192        assert!(!comp.eq(b"ab", b"ba"));
1193        assert!(!comp.eq(b"ab", b"a b"));
1194    }
1195
1196    fn unchanged_ranges(
1197        (left_text, left_ranges): (&[u8], &[Range<usize>]),
1198        (right_text, right_ranges): (&[u8], &[Range<usize>]),
1199    ) -> Vec<(Range<usize>, Range<usize>)> {
1200        let comp = WordComparator::new(CompareBytesExactly);
1201        let left = DiffSource::new(left_text, left_ranges, &comp);
1202        let right = DiffSource::new(right_text, right_ranges, &comp);
1203        let mut positions = Vec::new();
1204        collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp);
1205        positions
1206            .into_iter()
1207            .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos)))
1208            .collect()
1209    }
1210
1211    #[test]
1212    fn test_unchanged_ranges_insert_in_middle() {
1213        assert_eq!(
1214            unchanged_ranges(
1215                (b"a b b c", &[0..1, 2..3, 4..5, 6..7]),
1216                (b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
1217            ),
1218            vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)]
1219        );
1220    }
1221
1222    #[test]
1223    fn test_unchanged_ranges_non_unique_removed() {
1224        // We used to consider the first two "a" in the first input to match the two
1225        // "a"s in the second input. We no longer do.
1226        assert_eq!(
1227            unchanged_ranges(
1228                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1229                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
1230            ),
1231            vec![(0..1, 0..1)]
1232        );
1233        assert_eq!(
1234            unchanged_ranges(
1235                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1236                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
1237            ),
1238            vec![(6..7, 6..7)]
1239        );
1240        assert_eq!(
1241            unchanged_ranges(
1242                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1243                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
1244            ),
1245            vec![]
1246        );
1247        assert_eq!(
1248            unchanged_ranges(
1249                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1250                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
1251            ),
1252            vec![(0..1, 0..1), (6..7, 6..7)]
1253        );
1254    }
1255
1256    #[test]
1257    fn test_unchanged_ranges_non_unique_added() {
1258        // We used to consider the first two "a" in the first input to match the two
1259        // "a"s in the second input. We no longer do.
1260        assert_eq!(
1261            unchanged_ranges(
1262                (b"a b a c", &[0..1, 2..3, 4..5, 6..7]),
1263                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1264            ),
1265            vec![(0..1, 0..1)]
1266        );
1267        assert_eq!(
1268            unchanged_ranges(
1269                (b"b a c a", &[0..1, 2..3, 4..5, 6..7]),
1270                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1271            ),
1272            vec![(6..7, 6..7)]
1273        );
1274        assert_eq!(
1275            unchanged_ranges(
1276                (b"b a a c", &[0..1, 2..3, 4..5, 6..7]),
1277                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1278            ),
1279            vec![]
1280        );
1281        assert_eq!(
1282            unchanged_ranges(
1283                (b"a b c a", &[0..1, 2..3, 4..5, 6..7]),
1284                (b"a a a a", &[0..1, 2..3, 4..5, 6..7]),
1285            ),
1286            vec![(0..1, 0..1), (6..7, 6..7)]
1287        );
1288    }
1289
1290    #[test]
1291    fn test_unchanged_ranges_recursion_needed() {
1292        // "|" matches first, then "b" matches within the left/right range.
1293        assert_eq!(
1294            unchanged_ranges(
1295                (b"a b | b", &[0..1, 2..3, 4..5, 6..7]),
1296                (b"b c d |", &[0..1, 2..3, 4..5, 6..7]),
1297            ),
1298            vec![(2..3, 0..1), (4..5, 6..7)]
1299        );
1300        assert_eq!(
1301            unchanged_ranges(
1302                (b"| b c d", &[0..1, 2..3, 4..5, 6..7]),
1303                (b"b | a b", &[0..1, 2..3, 4..5, 6..7]),
1304            ),
1305            vec![(0..1, 2..3), (2..3, 6..7)]
1306        );
1307        // "|" matches first, then the middle range is trimmed.
1308        assert_eq!(
1309            unchanged_ranges(
1310                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
1311                (b"| b b |", &[0..1, 2..3, 4..5, 6..7]),
1312            ),
1313            vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)]
1314        );
1315        assert_eq!(
1316            unchanged_ranges(
1317                (b"| c c |", &[0..1, 2..3, 4..5, 6..7]),
1318                (b"| b c |", &[0..1, 2..3, 4..5, 6..7]),
1319            ),
1320            vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)]
1321        );
1322        // "|" matches first, then "a", then "b".
1323        assert_eq!(
1324            unchanged_ranges(
1325                (b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]),
1326                (b"b a b |", &[0..1, 2..3, 4..5, 6..7]),
1327            ),
1328            vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)]
1329        );
1330        assert_eq!(
1331            unchanged_ranges(
1332                (b"| b a b", &[0..1, 2..3, 4..5, 6..7]),
1333                (b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]),
1334            ),
1335            vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)]
1336        );
1337    }
1338
1339    #[test]
1340    fn test_diff_single_input() {
1341        assert_eq!(diff(["abc"]), vec![DiffHunk::matching(["abc"])]);
1342    }
1343
1344    #[test]
1345    fn test_diff_some_empty_inputs() {
1346        // All empty
1347        assert_eq!(diff([""]), vec![]);
1348        assert_eq!(diff(["", ""]), vec![]);
1349        assert_eq!(diff(["", "", ""]), vec![]);
1350
1351        // One empty
1352        assert_eq!(diff(["a b", ""]), vec![DiffHunk::different(["a b", ""])]);
1353        assert_eq!(diff(["", "a b"]), vec![DiffHunk::different(["", "a b"])]);
1354
1355        // One empty, two match
1356        assert_eq!(
1357            diff(["a b", "", "a b"]),
1358            vec![DiffHunk::different(["a b", "", "a b"])]
1359        );
1360        assert_eq!(
1361            diff(["", "a b", "a b"]),
1362            vec![DiffHunk::different(["", "a b", "a b"])]
1363        );
1364
1365        // Two empty, one differs
1366        assert_eq!(
1367            diff(["a b", "", ""]),
1368            vec![DiffHunk::different(["a b", "", ""])]
1369        );
1370        assert_eq!(
1371            diff(["", "a b", ""]),
1372            vec![DiffHunk::different(["", "a b", ""])]
1373        );
1374    }
1375
1376    #[test]
1377    fn test_diff_two_inputs_one_different() {
1378        assert_eq!(
1379            diff(["a b c", "a X c"]),
1380            vec![
1381                DiffHunk::matching(["a "].repeat(2)),
1382                DiffHunk::different(["b", "X"]),
1383                DiffHunk::matching([" c"].repeat(2)),
1384            ]
1385        );
1386    }
1387
1388    #[test]
1389    fn test_diff_multiple_inputs_one_different() {
1390        assert_eq!(
1391            diff(["a b c", "a X c", "a b c"]),
1392            vec![
1393                DiffHunk::matching(["a "].repeat(3)),
1394                DiffHunk::different(["b", "X", "b"]),
1395                DiffHunk::matching([" c"].repeat(3)),
1396            ]
1397        );
1398    }
1399
1400    #[test]
1401    fn test_diff_multiple_inputs_all_different() {
1402        assert_eq!(
1403            diff(["a b c", "a X c", "a c X"]),
1404            vec![
1405                DiffHunk::matching(["a "].repeat(3)),
1406                DiffHunk::different(["b ", "X ", ""]),
1407                DiffHunk::matching(["c"].repeat(3)),
1408                DiffHunk::different(["", "", " X"]),
1409            ]
1410        );
1411    }
1412
1413    #[test]
1414    fn test_diff_for_tokenizer_compacted() {
1415        // Tests that unchanged regions are compacted when using for_tokenizer()
1416        let diff = Diff::for_tokenizer(
1417            ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
1418            find_line_ranges,
1419            CompareBytesExactly,
1420        );
1421        assert_eq!(
1422            diff.hunks().collect_vec(),
1423            vec![
1424                DiffHunk::matching(["a\nb\nc\n"].repeat(2)),
1425                DiffHunk::different(["d\n", "X\n"]),
1426                DiffHunk::matching(["e\nf\ng"].repeat(2)),
1427            ]
1428        );
1429    }
1430
1431    #[test]
1432    fn test_diff_nothing_in_common() {
1433        assert_eq!(
1434            diff(["aaa", "bb"]),
1435            vec![DiffHunk::different(["aaa", "bb"])]
1436        );
1437    }
1438
1439    #[test]
1440    fn test_diff_insert_in_middle() {
1441        assert_eq!(
1442            diff(["a z", "a S z"]),
1443            vec![
1444                DiffHunk::matching(["a "].repeat(2)),
1445                DiffHunk::different(["", "S "]),
1446                DiffHunk::matching(["z"].repeat(2)),
1447            ]
1448        );
1449    }
1450
1451    #[test]
1452    fn test_diff_no_unique_middle_flips() {
1453        assert_eq!(
1454            diff(["a R R S S z", "a S S R R z"]),
1455            vec![
1456                DiffHunk::matching(["a "].repeat(2)),
1457                DiffHunk::different(["R R ", ""]),
1458                DiffHunk::matching(["S S "].repeat(2)),
1459                DiffHunk::different(["", "R R "]),
1460                DiffHunk::matching(["z"].repeat(2))
1461            ],
1462        );
1463    }
1464
1465    #[test]
1466    fn test_diff_recursion_needed() {
1467        assert_eq!(
1468            diff([
1469                "a q x q y q z q b q y q x q c",
1470                "a r r x q y z q b y q x r r c",
1471            ]),
1472            vec![
1473                DiffHunk::matching(["a "].repeat(2)),
1474                DiffHunk::different(["q", "r"]),
1475                DiffHunk::matching([" "].repeat(2)),
1476                DiffHunk::different(["", "r "]),
1477                DiffHunk::matching(["x q y "].repeat(2)),
1478                DiffHunk::different(["q ", ""]),
1479                DiffHunk::matching(["z q b "].repeat(2)),
1480                DiffHunk::different(["q ", ""]),
1481                DiffHunk::matching(["y q x "].repeat(2)),
1482                DiffHunk::different(["q", "r"]),
1483                DiffHunk::matching([" "].repeat(2)),
1484                DiffHunk::different(["", "r "]),
1485                DiffHunk::matching(["c"].repeat(2)),
1486            ]
1487        );
1488    }
1489
1490    #[test]
1491    fn test_diff_ignore_all_whitespace() {
1492        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
1493            let diff =
1494                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace);
1495            diff.hunks().collect()
1496        }
1497
1498        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
1499        assert_eq!(
1500            diff(["a\n", " a\r\n"]),
1501            vec![DiffHunk::matching(["a\n", " a\r\n"])]
1502        );
1503        assert_eq!(
1504            diff(["a\n", " a\nb"]),
1505            vec![
1506                DiffHunk::matching(["a\n", " a\n"]),
1507                DiffHunk::different(["", "b"]),
1508            ]
1509        );
1510
1511        // No LCS matches, so trim leading/trailing common lines
1512        assert_eq!(
1513            diff(["a\nc\n", " a\n a\n"]),
1514            vec![
1515                DiffHunk::matching(["a\n", " a\n"]),
1516                DiffHunk::different(["c\n", " a\n"]),
1517            ]
1518        );
1519        assert_eq!(
1520            diff(["c\na\n", " a\n a\n"]),
1521            vec![
1522                DiffHunk::different(["c\n", " a\n"]),
1523                DiffHunk::matching(["a\n", " a\n"]),
1524            ]
1525        );
1526    }
1527
1528    #[test]
1529    fn test_diff_ignore_whitespace_amount() {
1530        fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> {
1531            let diff =
1532                Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount);
1533            diff.hunks().collect()
1534        }
1535
1536        assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]);
1537        // whitespace at line end is ignored
1538        assert_eq!(
1539            diff(["a\n", "a\r\n"]),
1540            vec![DiffHunk::matching(["a\n", "a\r\n"])]
1541        );
1542        // but whitespace at line start isn't
1543        assert_eq!(
1544            diff(["a\n", " a\n"]),
1545            vec![DiffHunk::different(["a\n", " a\n"])]
1546        );
1547        assert_eq!(
1548            diff(["a\n", "a \nb"]),
1549            vec![
1550                DiffHunk::matching(["a\n", "a \n"]),
1551                DiffHunk::different(["", "b"]),
1552            ]
1553        );
1554    }
1555
1556    #[test]
1557    fn test_diff_hunk_iterator() {
1558        let diff = Diff::by_word(["a b c", "a XX c", "a b "]);
1559        assert_eq!(
1560            diff.hunks().collect_vec(),
1561            vec![
1562                DiffHunk::matching(["a "].repeat(3)),
1563                DiffHunk::different(["b", "XX", "b"]),
1564                DiffHunk::matching([" "].repeat(3)),
1565                DiffHunk::different(["c", "c", ""]),
1566            ]
1567        );
1568        assert_eq!(
1569            diff.hunk_ranges().collect_vec(),
1570            vec![
1571                DiffHunkRange {
1572                    kind: DiffHunkKind::Matching,
1573                    ranges: smallvec![0..2, 0..2, 0..2],
1574                },
1575                DiffHunkRange {
1576                    kind: DiffHunkKind::Different,
1577                    ranges: smallvec![2..3, 2..4, 2..3],
1578                },
1579                DiffHunkRange {
1580                    kind: DiffHunkKind::Matching,
1581                    ranges: smallvec![3..4, 4..5, 3..4],
1582                },
1583                DiffHunkRange {
1584                    kind: DiffHunkKind::Different,
1585                    ranges: smallvec![4..5, 5..6, 4..4],
1586                },
1587            ]
1588        );
1589    }
1590
1591    #[test]
1592    fn test_diff_real_case_write_fmt() {
1593        // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
1594        // need for recursion into the range at the end: after splitting at "Arguments"
1595        // and "formatter", the region at the end has the unique words "write_fmt"
1596        // and "fmt", but we forgot to recurse into that region, so we ended up
1597        // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)".
1598        #[rustfmt::skip]
1599        assert_eq!(
1600            diff([
1601                "    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n        self.styler().write_fmt(fmt).unwrap()\n",
1602                "    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n        self.styler().write_fmt(fmt)\n"
1603            ]),
1604            vec![
1605                DiffHunk::matching(["    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "].repeat(2)),
1606                DiffHunk::different(["", "-> io::Result<()> "]),
1607                DiffHunk::matching(["{\n        self.styler().write_fmt(fmt)"].repeat(2)),
1608                DiffHunk::different([".unwrap()", ""]),
1609                DiffHunk::matching(["\n"].repeat(2))
1610            ]
1611        );
1612    }
1613
1614    #[test]
1615    fn test_diff_real_case_gitgit_read_tree_c() {
1616        // This is the diff from commit e497ea2a9b in the git.git repo
1617        #[rustfmt::skip]
1618        assert_eq!(
1619            diff([
1620                r##"/*
1621 * GIT - The information manager from hell
1622 *
1623 * Copyright (C) Linus Torvalds, 2005
1624 */
1625#include "#cache.h"
1626
1627static int unpack(unsigned char *sha1)
1628{
1629	void *buffer;
1630	unsigned long size;
1631	char type[20];
1632
1633	buffer = read_sha1_file(sha1, type, &size);
1634	if (!buffer)
1635		usage("unable to read sha1 file");
1636	if (strcmp(type, "tree"))
1637		usage("expected a 'tree' node");
1638	while (size) {
1639		int len = strlen(buffer)+1;
1640		unsigned char *sha1 = buffer + len;
1641		char *path = strchr(buffer, ' ')+1;
1642		unsigned int mode;
1643		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
1644			usage("corrupt 'tree' file");
1645		buffer = sha1 + 20;
1646		size -= len + 20;
1647		printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1));
1648	}
1649	return 0;
1650}
1651
1652int main(int argc, char **argv)
1653{
1654	int fd;
1655	unsigned char sha1[20];
1656
1657	if (argc != 2)
1658		usage("read-tree <key>");
1659	if (get_sha1_hex(argv[1], sha1) < 0)
1660		usage("read-tree <key>");
1661	sha1_file_directory = getenv(DB_ENVIRONMENT);
1662	if (!sha1_file_directory)
1663		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
1664	if (unpack(sha1) < 0)
1665		usage("unpack failed");
1666	return 0;
1667}
1668"##,
1669                r##"/*
1670 * GIT - The information manager from hell
1671 *
1672 * Copyright (C) Linus Torvalds, 2005
1673 */
1674#include "#cache.h"
1675
1676static void create_directories(const char *path)
1677{
1678	int len = strlen(path);
1679	char *buf = malloc(len + 1);
1680	const char *slash = path;
1681
1682	while ((slash = strchr(slash+1, '/')) != NULL) {
1683		len = slash - path;
1684		memcpy(buf, path, len);
1685		buf[len] = 0;
1686		mkdir(buf, 0700);
1687	}
1688}
1689
1690static int create_file(const char *path)
1691{
1692	int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
1693	if (fd < 0) {
1694		if (errno == ENOENT) {
1695			create_directories(path);
1696			fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);
1697		}
1698	}
1699	return fd;
1700}
1701
1702static int unpack(unsigned char *sha1)
1703{
1704	void *buffer;
1705	unsigned long size;
1706	char type[20];
1707
1708	buffer = read_sha1_file(sha1, type, &size);
1709	if (!buffer)
1710		usage("unable to read sha1 file");
1711	if (strcmp(type, "tree"))
1712		usage("expected a 'tree' node");
1713	while (size) {
1714		int len = strlen(buffer)+1;
1715		unsigned char *sha1 = buffer + len;
1716		char *path = strchr(buffer, ' ')+1;
1717		char *data;
1718		unsigned long filesize;
1719		unsigned int mode;
1720		int fd;
1721
1722		if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1)
1723			usage("corrupt 'tree' file");
1724		buffer = sha1 + 20;
1725		size -= len + 20;
1726		data = read_sha1_file(sha1, type, &filesize);
1727		if (!data || strcmp(type, "blob"))
1728			usage("tree file refers to bad file data");
1729		fd = create_file(path);
1730		if (fd < 0)
1731			usage("unable to create file");
1732		if (write(fd, data, filesize) != filesize)
1733			usage("unable to write file");
1734		fchmod(fd, mode);
1735		close(fd);
1736		free(data);
1737	}
1738	return 0;
1739}
1740
1741int main(int argc, char **argv)
1742{
1743	int fd;
1744	unsigned char sha1[20];
1745
1746	if (argc != 2)
1747		usage("read-tree <key>");
1748	if (get_sha1_hex(argv[1], sha1) < 0)
1749		usage("read-tree <key>");
1750	sha1_file_directory = getenv(DB_ENVIRONMENT);
1751	if (!sha1_file_directory)
1752		sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
1753	if (unpack(sha1) < 0)
1754		usage("unpack failed");
1755	return 0;
1756}
1757"##,
1758            ]),
1759            vec![
1760               DiffHunk::matching(["/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"].repeat(2)),
1761               DiffHunk::different(["", "static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]),
1762               DiffHunk::matching(["static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"].repeat(2)),
1763               DiffHunk::different(["", "\t\tchar *data;\n\t\tunsigned long filesize;\n"]),
1764               DiffHunk::matching(["\t\tunsigned int mode;\n"].repeat(2)),
1765               DiffHunk::different(["", "\t\tint fd;\n\n"]),
1766               DiffHunk::matching(["\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"].repeat(2)),
1767               DiffHunk::different(["printf(\"%o %s (%s)\\n\", mode, path,", "data ="]),
1768               DiffHunk::matching([" "].repeat(2)),
1769               DiffHunk::different(["sha1_to_hex", "read_sha1_file"]),
1770               DiffHunk::matching(["(sha1"].repeat(2)),
1771               DiffHunk::different([")", ", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data"]),
1772               DiffHunk::matching([");\n\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n"].repeat(2)),
1773            ]
1774        );
1775    }
1776}