just playing with tangled
at globpattern 1776 lines 62 kB view raw
1// Copyright 2021 The Jujutsu Authors 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// https://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15#![allow(missing_docs)] 16 17use std::collections::BTreeMap; 18use std::hash::BuildHasher; 19use std::hash::Hash; 20use std::hash::Hasher; 21use std::hash::RandomState; 22use std::iter; 23use std::ops::Range; 24use std::slice; 25 26use bstr::BStr; 27use hashbrown::HashTable; 28use itertools::Itertools as _; 29use smallvec::smallvec; 30use smallvec::SmallVec; 31 32pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> { 33 text.split_inclusive(|b| *b == b'\n') 34 .scan(0, |total, line| { 35 let start = *total; 36 *total += line.len(); 37 Some(start..*total) 38 }) 39 .collect() 40} 41 42fn is_word_byte(b: u8) -> bool { 43 // TODO: Make this configurable (probably higher up in the call stack) 44 matches!( 45 b, 46 // Count 0x80..0xff as word bytes so multi-byte UTF-8 chars are 47 // treated as a single unit. 48 b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'\x80'..=b'\xff' 49 ) 50} 51 52pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> { 53 let mut word_ranges = vec![]; 54 let mut word_start_pos = 0; 55 let mut in_word = false; 56 for (i, b) in text.iter().enumerate() { 57 if in_word && !is_word_byte(*b) { 58 in_word = false; 59 word_ranges.push(word_start_pos..i); 60 word_start_pos = i; 61 } else if !in_word && is_word_byte(*b) { 62 in_word = true; 63 word_start_pos = i; 64 } 65 } 66 if in_word && word_start_pos < text.len() { 67 word_ranges.push(word_start_pos..text.len()); 68 } 69 word_ranges 70} 71 72pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> { 73 text.iter() 74 .positions(|b| !is_word_byte(*b)) 75 .map(|i| i..i + 1) 76 .collect() 77} 78 79fn bytes_ignore_all_whitespace(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> { 80 text.iter().copied().filter(|b| !b.is_ascii_whitespace()) 81} 82 83fn bytes_ignore_whitespace_amount(text: &[u8]) -> impl Iterator<Item = u8> + use<'_> { 84 let mut prev_was_space = false; 85 text.iter().filter_map(move |&b| { 86 let was_space = prev_was_space; 87 let is_space = b.is_ascii_whitespace(); 88 prev_was_space = is_space; 89 match (was_space, is_space) { 90 (_, false) => Some(b), 91 (false, true) => Some(b' '), 92 (true, true) => None, 93 } 94 }) 95} 96 97fn hash_with_length_suffix<I, H>(data: I, state: &mut H) 98where 99 I: IntoIterator, 100 I::Item: Hash, 101 H: Hasher, 102{ 103 let mut len: usize = 0; 104 for d in data { 105 d.hash(state); 106 len += 1; 107 } 108 state.write_usize(len); 109} 110 111/// Compares byte sequences based on a certain equivalence property. 112/// 113/// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison 114/// object for the following reasons: 115/// 116/// a. If it were newtype, a generic `wrap` function would be needed. It 117/// couldn't be expressed as a simple closure: 118/// `for<'a> Fn(&'a [u8]) -> ???<'a>` 119/// b. Dynamic comparison object can be implemented intuitively. For example, 120/// `pattern: &Regex` would have to be copied to all newtype instances if it 121/// were newtype. 122/// c. Hash values can be cached if hashing is controlled externally. 123pub trait CompareBytes { 124 /// Returns true if `left` and `right` are equivalent. 125 fn eq(&self, left: &[u8], right: &[u8]) -> bool; 126 127 /// Generates hash which respects the following property: 128 /// `eq(left, right) => hash(left) == hash(right)` 129 fn hash<H: Hasher>(&self, text: &[u8], state: &mut H); 130} 131 132// An instance might have e.g. Regex pattern, which can't be trivially copied. 133// Such comparison object can be passed by reference. 134impl<C: CompareBytes + ?Sized> CompareBytes for &C { 135 fn eq(&self, left: &[u8], right: &[u8]) -> bool { 136 <C as CompareBytes>::eq(self, left, right) 137 } 138 139 fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) { 140 <C as CompareBytes>::hash(self, text, state); 141 } 142} 143 144/// Compares byte sequences literally. 145#[derive(Clone, Debug, Default)] 146pub struct CompareBytesExactly; 147 148impl CompareBytes for CompareBytesExactly { 149 fn eq(&self, left: &[u8], right: &[u8]) -> bool { 150 left == right 151 } 152 153 fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) { 154 text.hash(state); 155 } 156} 157 158/// Compares byte sequences ignoring any whitespace occurrences. 159#[derive(Clone, Debug, Default)] 160pub struct CompareBytesIgnoreAllWhitespace; 161 162impl CompareBytes for CompareBytesIgnoreAllWhitespace { 163 fn eq(&self, left: &[u8], right: &[u8]) -> bool { 164 bytes_ignore_all_whitespace(left).eq(bytes_ignore_all_whitespace(right)) 165 } 166 167 fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) { 168 hash_with_length_suffix(bytes_ignore_all_whitespace(text), state); 169 } 170} 171 172/// Compares byte sequences ignoring changes in whitespace amount. 173#[derive(Clone, Debug, Default)] 174pub struct CompareBytesIgnoreWhitespaceAmount; 175 176impl CompareBytes for CompareBytesIgnoreWhitespaceAmount { 177 fn eq(&self, left: &[u8], right: &[u8]) -> bool { 178 bytes_ignore_whitespace_amount(left).eq(bytes_ignore_whitespace_amount(right)) 179 } 180 181 fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) { 182 hash_with_length_suffix(bytes_ignore_whitespace_amount(text), state); 183 } 184} 185 186// Not implementing Eq because the text should be compared by WordComparator. 187#[derive(Clone, Copy, Debug)] 188struct HashedWord<'input> { 189 hash: u64, 190 text: &'input BStr, 191} 192 193/// Compares words (or tokens) under a certain hasher configuration. 194#[derive(Clone, Debug, Default)] 195struct WordComparator<C, S> { 196 compare: C, 197 hash_builder: S, 198} 199 200impl<C: CompareBytes> WordComparator<C, RandomState> { 201 fn new(compare: C) -> Self { 202 WordComparator { 203 compare, 204 // TODO: switch to ahash for better performance? 205 hash_builder: RandomState::new(), 206 } 207 } 208} 209 210impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> { 211 fn eq(&self, left: &[u8], right: &[u8]) -> bool { 212 self.compare.eq(left, right) 213 } 214 215 fn eq_hashed(&self, left: HashedWord<'_>, right: HashedWord<'_>) -> bool { 216 left.hash == right.hash && self.compare.eq(left.text, right.text) 217 } 218 219 fn hash_one(&self, text: &[u8]) -> u64 { 220 let mut state = self.hash_builder.build_hasher(); 221 self.compare.hash(text, &mut state); 222 state.finish() 223 } 224} 225 226/// Index in a list of word (or token) ranges in `DiffSource`. 227#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 228struct WordPosition(usize); 229 230/// Index in a list of word (or token) ranges in `LocalDiffSource`. 231#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] 232struct LocalWordPosition(usize); 233 234#[derive(Clone, Debug)] 235struct DiffSource<'input, 'aux> { 236 text: &'input BStr, 237 ranges: &'aux [Range<usize>], 238 hashes: Vec<u64>, 239} 240 241impl<'input, 'aux> DiffSource<'input, 'aux> { 242 fn new<T: AsRef<[u8]> + ?Sized, C: CompareBytes, S: BuildHasher>( 243 text: &'input T, 244 ranges: &'aux [Range<usize>], 245 comp: &WordComparator<C, S>, 246 ) -> Self { 247 let text = BStr::new(text); 248 let hashes = ranges 249 .iter() 250 .map(|range| comp.hash_one(&text[range.clone()])) 251 .collect(); 252 DiffSource { 253 text, 254 ranges, 255 hashes, 256 } 257 } 258 259 fn local(&self) -> LocalDiffSource<'input, '_> { 260 LocalDiffSource { 261 text: self.text, 262 ranges: self.ranges, 263 hashes: &self.hashes, 264 global_offset: WordPosition(0), 265 } 266 } 267 268 fn range_at(&self, position: WordPosition) -> Range<usize> { 269 self.ranges[position.0].clone() 270 } 271} 272 273#[derive(Clone, Debug)] 274struct LocalDiffSource<'input, 'aux> { 275 text: &'input BStr, 276 ranges: &'aux [Range<usize>], 277 hashes: &'aux [u64], 278 /// The number of preceding word ranges excluded from the self `ranges`. 279 global_offset: WordPosition, 280} 281 282impl<'input> LocalDiffSource<'input, '_> { 283 fn narrowed(&self, positions: Range<LocalWordPosition>) -> Self { 284 LocalDiffSource { 285 text: self.text, 286 ranges: &self.ranges[positions.start.0..positions.end.0], 287 hashes: &self.hashes[positions.start.0..positions.end.0], 288 global_offset: self.map_to_global(positions.start), 289 } 290 } 291 292 fn map_to_global(&self, position: LocalWordPosition) -> WordPosition { 293 WordPosition(self.global_offset.0 + position.0) 294 } 295 296 fn hashed_words( 297 &self, 298 ) -> impl DoubleEndedIterator<Item = HashedWord<'input>> + ExactSizeIterator + use<'_, 'input> 299 { 300 iter::zip(self.ranges, self.hashes).map(|(range, &hash)| { 301 let text = &self.text[range.clone()]; 302 HashedWord { hash, text } 303 }) 304 } 305} 306 307struct Histogram<'input> { 308 word_to_positions: HashTable<HistogramEntry<'input>>, 309} 310 311// Many of the words are unique. We can inline up to 2 word positions (16 bytes 312// on 64-bit platform) in SmallVec for free. 313type HistogramEntry<'input> = (HashedWord<'input>, SmallVec<[LocalWordPosition; 2]>); 314 315impl<'input> Histogram<'input> { 316 fn calculate<C: CompareBytes, S: BuildHasher>( 317 source: &LocalDiffSource<'input, '_>, 318 comp: &WordComparator<C, S>, 319 max_occurrences: usize, 320 ) -> Self { 321 let mut word_to_positions: HashTable<HistogramEntry> = HashTable::new(); 322 for (i, word) in source.hashed_words().enumerate() { 323 let pos = LocalWordPosition(i); 324 word_to_positions 325 .entry( 326 word.hash, 327 |(w, _)| comp.eq(w.text, word.text), 328 |(w, _)| w.hash, 329 ) 330 .and_modify(|(_, positions)| { 331 // Allow one more than max_occurrences, so we can later skip 332 // those with more than max_occurrences 333 if positions.len() <= max_occurrences { 334 positions.push(pos); 335 } 336 }) 337 .or_insert_with(|| (word, smallvec![pos])); 338 } 339 Histogram { word_to_positions } 340 } 341 342 fn build_count_to_entries(&self) -> BTreeMap<usize, Vec<&HistogramEntry<'input>>> { 343 let mut count_to_entries: BTreeMap<usize, Vec<_>> = BTreeMap::new(); 344 for entry in &self.word_to_positions { 345 let (_, positions) = entry; 346 let entries = count_to_entries.entry(positions.len()).or_default(); 347 entries.push(entry); 348 } 349 count_to_entries 350 } 351 352 fn positions_by_word<C: CompareBytes, S: BuildHasher>( 353 &self, 354 word: HashedWord<'input>, 355 comp: &WordComparator<C, S>, 356 ) -> Option<&[LocalWordPosition]> { 357 let (_, positions) = self 358 .word_to_positions 359 .find(word.hash, |(w, _)| comp.eq(w.text, word.text))?; 360 Some(positions) 361 } 362} 363 364/// Finds the LCS given a array where the value of `input[i]` indicates that 365/// the position of element `i` in the right array is at position `input[i]` in 366/// the left array. 367/// 368/// For example (some have multiple valid outputs): 369/// 370/// [0,1,2] => [(0,0),(1,1),(2,2)] 371/// [2,1,0] => [(0,2)] 372/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)] 373/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(4,2),(5,5),(6,6)] 374fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> { 375 if input.is_empty() { 376 return vec![]; 377 } 378 379 let mut chain = vec![(0, 0, 0); input.len()]; 380 let mut global_longest = 0; 381 let mut global_longest_right_pos = 0; 382 for (right_pos, &left_pos) in input.iter().enumerate() { 383 let mut longest_from_here = 1; 384 let mut previous_right_pos = usize::MAX; 385 for i in (0..right_pos).rev() { 386 let (previous_len, previous_left_pos, _) = chain[i]; 387 if previous_left_pos < left_pos { 388 let len = previous_len + 1; 389 if len > longest_from_here { 390 longest_from_here = len; 391 previous_right_pos = i; 392 if len > global_longest { 393 global_longest = len; 394 global_longest_right_pos = right_pos; 395 // If this is the longest chain globally so far, we cannot find a 396 // longer one by using a previous value, so break early. 397 break; 398 } 399 } 400 } 401 } 402 chain[right_pos] = (longest_from_here, left_pos, previous_right_pos); 403 } 404 405 let mut result = vec![]; 406 let mut right_pos = global_longest_right_pos; 407 loop { 408 let (_, left_pos, previous_right_pos) = chain[right_pos]; 409 result.push((left_pos, right_pos)); 410 if previous_right_pos == usize::MAX { 411 break; 412 } 413 right_pos = previous_right_pos; 414 } 415 result.reverse(); 416 417 result 418} 419 420/// Finds unchanged word (or token) positions among the ones given as 421/// arguments. The data between those words is ignored. 422fn collect_unchanged_words<C: CompareBytes, S: BuildHasher>( 423 found_positions: &mut Vec<(WordPosition, WordPosition)>, 424 left: &LocalDiffSource, 425 right: &LocalDiffSource, 426 comp: &WordComparator<C, S>, 427) { 428 if left.ranges.is_empty() || right.ranges.is_empty() { 429 return; 430 } 431 432 // Prioritize LCS-based algorithm than leading/trailing matches 433 let old_len = found_positions.len(); 434 collect_unchanged_words_lcs(found_positions, left, right, comp); 435 if found_positions.len() != old_len { 436 return; 437 } 438 439 // Trim leading common ranges (i.e. grow previous unchanged region) 440 let common_leading_len = iter::zip(left.hashed_words(), right.hashed_words()) 441 .take_while(|&(l, r)| comp.eq_hashed(l, r)) 442 .count(); 443 let left_hashed_words = left.hashed_words().skip(common_leading_len); 444 let right_hashed_words = right.hashed_words().skip(common_leading_len); 445 446 // Trim trailing common ranges (i.e. grow next unchanged region) 447 let common_trailing_len = iter::zip(left_hashed_words.rev(), right_hashed_words.rev()) 448 .take_while(|&(l, r)| comp.eq_hashed(l, r)) 449 .count(); 450 451 found_positions.extend(itertools::chain( 452 (0..common_leading_len).map(|i| { 453 ( 454 left.map_to_global(LocalWordPosition(i)), 455 right.map_to_global(LocalWordPosition(i)), 456 ) 457 }), 458 (1..=common_trailing_len).rev().map(|i| { 459 ( 460 left.map_to_global(LocalWordPosition(left.ranges.len() - i)), 461 right.map_to_global(LocalWordPosition(right.ranges.len() - i)), 462 ) 463 }), 464 )); 465} 466 467fn collect_unchanged_words_lcs<C: CompareBytes, S: BuildHasher>( 468 found_positions: &mut Vec<(WordPosition, WordPosition)>, 469 left: &LocalDiffSource, 470 right: &LocalDiffSource, 471 comp: &WordComparator<C, S>, 472) { 473 let max_occurrences = 100; 474 let left_histogram = Histogram::calculate(left, comp, max_occurrences); 475 let left_count_to_entries = left_histogram.build_count_to_entries(); 476 if *left_count_to_entries.keys().next().unwrap() > max_occurrences { 477 // If there are very many occurrences of all words, then we just give up. 478 return; 479 } 480 let right_histogram = Histogram::calculate(right, comp, max_occurrences); 481 // Look for words with few occurrences in `left` (could equally well have picked 482 // `right`?). If any of them also occur in `right`, then we add the words to 483 // the LCS. 484 let Some(uncommon_shared_word_positions) = 485 left_count_to_entries.values().find_map(|left_entries| { 486 let mut both_positions = left_entries 487 .iter() 488 .filter_map(|&(word, left_positions)| { 489 let right_positions = right_histogram.positions_by_word(*word, comp)?; 490 (left_positions.len() == right_positions.len()) 491 .then_some((left_positions, right_positions)) 492 }) 493 .peekable(); 494 both_positions.peek().is_some().then_some(both_positions) 495 }) 496 else { 497 return; 498 }; 499 500 // [(index into ranges, serial to identify {word, occurrence #})] 501 let (mut left_positions, mut right_positions): (Vec<_>, Vec<_>) = 502 uncommon_shared_word_positions 503 .flat_map(|(lefts, rights)| iter::zip(lefts, rights)) 504 .enumerate() 505 .map(|(serial, (&left_pos, &right_pos))| ((left_pos, serial), (right_pos, serial))) 506 .unzip(); 507 left_positions.sort_unstable_by_key(|&(pos, _serial)| pos); 508 right_positions.sort_unstable_by_key(|&(pos, _serial)| pos); 509 let left_index_by_right_index: Vec<usize> = { 510 let mut left_index_map = vec![0; left_positions.len()]; 511 for (i, &(_pos, serial)) in left_positions.iter().enumerate() { 512 left_index_map[serial] = i; 513 } 514 right_positions 515 .iter() 516 .map(|&(_pos, serial)| left_index_map[serial]) 517 .collect() 518 }; 519 520 let lcs = find_lcs(&left_index_by_right_index); 521 522 // Produce output word positions, recursing into the modified areas between 523 // the elements in the LCS. 524 let mut previous_left_position = LocalWordPosition(0); 525 let mut previous_right_position = LocalWordPosition(0); 526 for (left_index, right_index) in lcs { 527 let (left_position, _) = left_positions[left_index]; 528 let (right_position, _) = right_positions[right_index]; 529 collect_unchanged_words( 530 found_positions, 531 &left.narrowed(previous_left_position..left_position), 532 &right.narrowed(previous_right_position..right_position), 533 comp, 534 ); 535 found_positions.push(( 536 left.map_to_global(left_position), 537 right.map_to_global(right_position), 538 )); 539 previous_left_position = LocalWordPosition(left_position.0 + 1); 540 previous_right_position = LocalWordPosition(right_position.0 + 1); 541 } 542 // Also recurse into range at end (after common ranges). 543 collect_unchanged_words( 544 found_positions, 545 &left.narrowed(previous_left_position..LocalWordPosition(left.ranges.len())), 546 &right.narrowed(previous_right_position..LocalWordPosition(right.ranges.len())), 547 comp, 548 ); 549} 550 551/// Intersects two sorted sequences of `(base, other)` word positions by 552/// `base`. `base` positions should refer to the same source text. 553fn intersect_unchanged_words( 554 current_positions: Vec<(WordPosition, Vec<WordPosition>)>, 555 new_positions: &[(WordPosition, WordPosition)], 556) -> Vec<(WordPosition, Vec<WordPosition>)> { 557 itertools::merge_join_by( 558 current_positions, 559 new_positions, 560 |(cur_base_pos, _), (new_base_pos, _)| cur_base_pos.cmp(new_base_pos), 561 ) 562 .filter_map(|entry| entry.both()) 563 .map(|((base_pos, mut other_positions), &(_, new_other_pos))| { 564 other_positions.push(new_other_pos); 565 (base_pos, other_positions) 566 }) 567 .collect() 568} 569 570#[derive(Clone, PartialEq, Eq, Debug)] 571struct UnchangedRange { 572 // Inline up to two sides (base + one other) 573 base: Range<usize>, 574 others: SmallVec<[Range<usize>; 1]>, 575} 576 577impl UnchangedRange { 578 /// Translates word positions to byte ranges in the source texts. 579 fn from_word_positions( 580 base_source: &DiffSource, 581 other_sources: &[DiffSource], 582 base_position: WordPosition, 583 other_positions: &[WordPosition], 584 ) -> Self { 585 assert_eq!(other_sources.len(), other_positions.len()); 586 let base = base_source.range_at(base_position); 587 let others = iter::zip(other_sources, other_positions) 588 .map(|(source, pos)| source.range_at(*pos)) 589 .collect(); 590 UnchangedRange { base, others } 591 } 592 593 fn is_all_empty(&self) -> bool { 594 self.base.is_empty() && self.others.iter().all(|r| r.is_empty()) 595 } 596} 597 598/// Takes any number of inputs and finds regions that are them same between all 599/// of them. 600#[derive(Clone, Debug)] 601pub struct Diff<'input> { 602 base_input: &'input BStr, 603 other_inputs: SmallVec<[&'input BStr; 1]>, 604 /// Sorted list of ranges of unchanged regions in bytes. 605 /// 606 /// The list should never be empty. The first and the last region may be 607 /// empty if inputs start/end with changes. 608 unchanged_regions: Vec<UnchangedRange>, 609} 610 611impl<'input> Diff<'input> { 612 pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>( 613 inputs: impl IntoIterator<Item = &'input T>, 614 tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>, 615 compare: impl CompareBytes, 616 ) -> Self { 617 let mut inputs = inputs.into_iter().map(BStr::new); 618 let base_input = inputs.next().expect("inputs must not be empty"); 619 let other_inputs: SmallVec<[&BStr; 1]> = inputs.collect(); 620 // First tokenize each input 621 let base_token_ranges: Vec<Range<usize>>; 622 let other_token_ranges: Vec<Vec<Range<usize>>>; 623 // No need to tokenize if one of the inputs is empty. Non-empty inputs 624 // are all different as long as the tokenizer emits non-empty ranges. 625 // This means "" and " " are different even if the compare function is 626 // ignore-whitespace. They are tokenized as [] and [" "] respectively. 627 if base_input.is_empty() || other_inputs.iter().any(|input| input.is_empty()) { 628 base_token_ranges = vec![]; 629 other_token_ranges = std::iter::repeat_n(vec![], other_inputs.len()).collect(); 630 } else { 631 base_token_ranges = tokenizer(base_input); 632 other_token_ranges = other_inputs 633 .iter() 634 .map(|other_input| tokenizer(other_input)) 635 .collect(); 636 } 637 Self::with_inputs_and_token_ranges( 638 base_input, 639 other_inputs, 640 &base_token_ranges, 641 &other_token_ranges, 642 compare, 643 ) 644 } 645 646 fn with_inputs_and_token_ranges( 647 base_input: &'input BStr, 648 other_inputs: SmallVec<[&'input BStr; 1]>, 649 base_token_ranges: &[Range<usize>], 650 other_token_ranges: &[Vec<Range<usize>>], 651 compare: impl CompareBytes, 652 ) -> Self { 653 assert_eq!(other_inputs.len(), other_token_ranges.len()); 654 let comp = WordComparator::new(compare); 655 let base_source = DiffSource::new(base_input, base_token_ranges, &comp); 656 let other_sources = iter::zip(&other_inputs, other_token_ranges) 657 .map(|(input, token_ranges)| DiffSource::new(input, token_ranges, &comp)) 658 .collect_vec(); 659 let unchanged_regions = match &*other_sources { 660 // Consider the whole range of the base input as unchanged compared 661 // to itself. 662 [] => { 663 let whole_range = UnchangedRange { 664 base: 0..base_source.text.len(), 665 others: smallvec![], 666 }; 667 vec![whole_range] 668 } 669 // Diff each other input against the base. Intersect the previously 670 // found ranges with the ranges in the diff. 671 [first_other_source, tail_other_sources @ ..] => { 672 let mut unchanged_regions = Vec::new(); 673 // Add an empty range at the start to make life easier for hunks(). 674 unchanged_regions.push(UnchangedRange { 675 base: 0..0, 676 others: smallvec![0..0; other_inputs.len()], 677 }); 678 let mut first_positions = Vec::new(); 679 collect_unchanged_words( 680 &mut first_positions, 681 &base_source.local(), 682 &first_other_source.local(), 683 &comp, 684 ); 685 if tail_other_sources.is_empty() { 686 unchanged_regions.extend(first_positions.iter().map( 687 |&(base_pos, other_pos)| { 688 UnchangedRange::from_word_positions( 689 &base_source, 690 &other_sources, 691 base_pos, 692 &[other_pos], 693 ) 694 }, 695 )); 696 } else { 697 let first_positions = first_positions 698 .iter() 699 .map(|&(base_pos, other_pos)| (base_pos, vec![other_pos])) 700 .collect(); 701 let intersected_positions = tail_other_sources.iter().fold( 702 first_positions, 703 |current_positions, other_source| { 704 let mut new_positions = Vec::new(); 705 collect_unchanged_words( 706 &mut new_positions, 707 &base_source.local(), 708 &other_source.local(), 709 &comp, 710 ); 711 intersect_unchanged_words(current_positions, &new_positions) 712 }, 713 ); 714 unchanged_regions.extend(intersected_positions.iter().map( 715 |(base_pos, other_positions)| { 716 UnchangedRange::from_word_positions( 717 &base_source, 718 &other_sources, 719 *base_pos, 720 other_positions, 721 ) 722 }, 723 )); 724 }; 725 // Add an empty range at the end to make life easier for hunks(). 726 unchanged_regions.push(UnchangedRange { 727 base: base_input.len()..base_input.len(), 728 others: other_inputs 729 .iter() 730 .map(|input| input.len()..input.len()) 731 .collect(), 732 }); 733 unchanged_regions 734 } 735 }; 736 737 let mut diff = Self { 738 base_input, 739 other_inputs, 740 unchanged_regions, 741 }; 742 diff.compact_unchanged_regions(); 743 diff 744 } 745 746 pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>( 747 inputs: impl IntoIterator<Item = &'input T>, 748 ) -> Self { 749 Diff::for_tokenizer(inputs, |_| vec![], CompareBytesExactly) 750 } 751 752 /// Compares `inputs` line by line. 753 pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>( 754 inputs: impl IntoIterator<Item = &'input T>, 755 ) -> Self { 756 Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly) 757 } 758 759 /// Compares `inputs` word by word. 760 /// 761 /// The `inputs` is usually a changed hunk (e.g. a `DiffHunk::Different`) 762 /// that was the output from a line-by-line diff. 763 pub fn by_word<T: AsRef<[u8]> + ?Sized + 'input>( 764 inputs: impl IntoIterator<Item = &'input T>, 765 ) -> Self { 766 let mut diff = Diff::for_tokenizer(inputs, find_word_ranges, CompareBytesExactly); 767 diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly); 768 diff 769 } 770 771 /// Returns iterator over matching and different texts. 772 pub fn hunks(&self) -> DiffHunkIterator<'_, 'input> { 773 let ranges = self.hunk_ranges(); 774 DiffHunkIterator { diff: self, ranges } 775 } 776 777 /// Returns iterator over matching and different ranges in bytes. 778 pub fn hunk_ranges(&self) -> DiffHunkRangeIterator<'_> { 779 DiffHunkRangeIterator::new(self) 780 } 781 782 /// Returns contents at the unchanged `range`. 783 fn hunk_at<'a, 'b>( 784 &'a self, 785 range: &'b UnchangedRange, 786 ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> { 787 itertools::chain( 788 iter::once(&self.base_input[range.base.clone()]), 789 iter::zip(&self.other_inputs, &range.others).map(|(input, r)| &input[r.clone()]), 790 ) 791 } 792 793 /// Returns contents between the `previous` ends and the `current` starts. 794 fn hunk_between<'a, 'b>( 795 &'a self, 796 previous: &'b UnchangedRange, 797 current: &'b UnchangedRange, 798 ) -> impl Iterator<Item = &'input BStr> + use<'a, 'b, 'input> { 799 itertools::chain( 800 iter::once(&self.base_input[previous.base.end..current.base.start]), 801 itertools::izip!(&self.other_inputs, &previous.others, &current.others) 802 .map(|(input, prev, cur)| &input[prev.end..cur.start]), 803 ) 804 } 805 806 /// Uses the given tokenizer to split the changed regions into smaller 807 /// regions. Then tries to finds unchanged regions among them. 808 pub fn refine_changed_regions( 809 &mut self, 810 tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>, 811 compare: impl CompareBytes, 812 ) { 813 let mut new_unchanged_ranges = vec![self.unchanged_regions[0].clone()]; 814 for window in self.unchanged_regions.windows(2) { 815 let [previous, current]: &[_; 2] = window.try_into().unwrap(); 816 // For the changed region between the previous region and the current one, 817 // create a new Diff instance. Then adjust the start positions and 818 // offsets to be valid in the context of the larger Diff instance 819 // (`self`). 820 let refined_diff = 821 Diff::for_tokenizer(self.hunk_between(previous, current), &tokenizer, &compare); 822 for refined in &refined_diff.unchanged_regions { 823 let new_base_start = refined.base.start + previous.base.end; 824 let new_base_end = refined.base.end + previous.base.end; 825 let new_others = iter::zip(&refined.others, &previous.others) 826 .map(|(refi, prev)| (refi.start + prev.end)..(refi.end + prev.end)) 827 .collect(); 828 new_unchanged_ranges.push(UnchangedRange { 829 base: new_base_start..new_base_end, 830 others: new_others, 831 }); 832 } 833 new_unchanged_ranges.push(current.clone()); 834 } 835 self.unchanged_regions = new_unchanged_ranges; 836 self.compact_unchanged_regions(); 837 } 838 839 fn compact_unchanged_regions(&mut self) { 840 let mut compacted = vec![]; 841 let mut maybe_previous: Option<UnchangedRange> = None; 842 for current in &self.unchanged_regions { 843 if let Some(previous) = maybe_previous { 844 if previous.base.end == current.base.start 845 && iter::zip(&previous.others, &current.others) 846 .all(|(prev, cur)| prev.end == cur.start) 847 { 848 maybe_previous = Some(UnchangedRange { 849 base: previous.base.start..current.base.end, 850 others: iter::zip(&previous.others, &current.others) 851 .map(|(prev, cur)| prev.start..cur.end) 852 .collect(), 853 }); 854 continue; 855 } 856 compacted.push(previous); 857 } 858 maybe_previous = Some(current.clone()); 859 } 860 if let Some(previous) = maybe_previous { 861 compacted.push(previous); 862 } 863 self.unchanged_regions = compacted; 864 } 865} 866 867/// Hunk texts. 868#[derive(Clone, Debug, Eq, PartialEq)] 869pub struct DiffHunk<'input> { 870 pub kind: DiffHunkKind, 871 pub contents: DiffHunkContentVec<'input>, 872} 873 874impl<'input> DiffHunk<'input> { 875 pub fn matching<T: AsRef<[u8]> + ?Sized + 'input>( 876 contents: impl IntoIterator<Item = &'input T>, 877 ) -> Self { 878 DiffHunk { 879 kind: DiffHunkKind::Matching, 880 contents: contents.into_iter().map(BStr::new).collect(), 881 } 882 } 883 884 pub fn different<T: AsRef<[u8]> + ?Sized + 'input>( 885 contents: impl IntoIterator<Item = &'input T>, 886 ) -> Self { 887 DiffHunk { 888 kind: DiffHunkKind::Different, 889 contents: contents.into_iter().map(BStr::new).collect(), 890 } 891 } 892} 893 894#[derive(Clone, Copy, Debug, Eq, PartialEq)] 895pub enum DiffHunkKind { 896 Matching, 897 Different, 898} 899 900// Inline up to two sides 901pub type DiffHunkContentVec<'input> = SmallVec<[&'input BStr; 2]>; 902 903/// Iterator over matching and different texts. 904#[derive(Clone, Debug)] 905pub struct DiffHunkIterator<'diff, 'input> { 906 diff: &'diff Diff<'input>, 907 ranges: DiffHunkRangeIterator<'diff>, 908} 909 910impl<'input> Iterator for DiffHunkIterator<'_, 'input> { 911 type Item = DiffHunk<'input>; 912 913 fn next(&mut self) -> Option<Self::Item> { 914 self.ranges.next_with( 915 |previous| { 916 let contents = self.diff.hunk_at(previous).collect(); 917 let kind = DiffHunkKind::Matching; 918 DiffHunk { kind, contents } 919 }, 920 |previous, current| { 921 let contents: DiffHunkContentVec = 922 self.diff.hunk_between(previous, current).collect(); 923 debug_assert!( 924 contents.iter().any(|content| !content.is_empty()), 925 "unchanged regions should have been compacted" 926 ); 927 let kind = DiffHunkKind::Different; 928 DiffHunk { kind, contents } 929 }, 930 ) 931 } 932} 933 934/// Hunk ranges in bytes. 935#[derive(Clone, Debug, Eq, PartialEq)] 936pub struct DiffHunkRange { 937 pub kind: DiffHunkKind, 938 pub ranges: DiffHunkRangeVec, 939} 940 941// Inline up to two sides 942pub type DiffHunkRangeVec = SmallVec<[Range<usize>; 2]>; 943 944/// Iterator over matching and different ranges in bytes. 945#[derive(Clone, Debug)] 946pub struct DiffHunkRangeIterator<'diff> { 947 previous: &'diff UnchangedRange, 948 unchanged_emitted: bool, 949 unchanged_iter: slice::Iter<'diff, UnchangedRange>, 950} 951 952impl<'diff> DiffHunkRangeIterator<'diff> { 953 fn new(diff: &'diff Diff) -> Self { 954 let mut unchanged_iter = diff.unchanged_regions.iter(); 955 let previous = unchanged_iter.next().unwrap(); 956 DiffHunkRangeIterator { 957 previous, 958 unchanged_emitted: previous.is_all_empty(), 959 unchanged_iter, 960 } 961 } 962 963 fn next_with<T>( 964 &mut self, 965 hunk_at: impl FnOnce(&UnchangedRange) -> T, 966 hunk_between: impl FnOnce(&UnchangedRange, &UnchangedRange) -> T, 967 ) -> Option<T> { 968 if !self.unchanged_emitted { 969 self.unchanged_emitted = true; 970 return Some(hunk_at(self.previous)); 971 } 972 let current = self.unchanged_iter.next()?; 973 let hunk = hunk_between(self.previous, current); 974 self.previous = current; 975 self.unchanged_emitted = self.previous.is_all_empty(); 976 Some(hunk) 977 } 978} 979 980impl Iterator for DiffHunkRangeIterator<'_> { 981 type Item = DiffHunkRange; 982 983 fn next(&mut self) -> Option<Self::Item> { 984 self.next_with( 985 |previous| { 986 let ranges = itertools::chain(iter::once(&previous.base), &previous.others) 987 .cloned() 988 .collect(); 989 let kind = DiffHunkKind::Matching; 990 DiffHunkRange { kind, ranges } 991 }, 992 |previous, current| { 993 let ranges: DiffHunkRangeVec = itertools::chain( 994 iter::once(previous.base.end..current.base.start), 995 iter::zip(&previous.others, &current.others) 996 .map(|(prev, cur)| prev.end..cur.start), 997 ) 998 .collect(); 999 debug_assert!( 1000 ranges.iter().any(|range| !range.is_empty()), 1001 "unchanged regions should have been compacted" 1002 ); 1003 let kind = DiffHunkKind::Different; 1004 DiffHunkRange { kind, ranges } 1005 }, 1006 ) 1007 } 1008} 1009 1010/// Diffs slices of bytes. 1011/// 1012/// The returned diff hunks may be any length (may span many lines or 1013/// may be only part of a line). This currently uses Histogram diff 1014/// (or maybe something similar; I'm not sure I understood the 1015/// algorithm correctly). It first diffs lines in the input and then 1016/// refines the changed ranges at the word level. 1017pub fn diff<'a, T: AsRef<[u8]> + ?Sized + 'a>( 1018 inputs: impl IntoIterator<Item = &'a T>, 1019) -> Vec<DiffHunk<'a>> { 1020 let mut diff = Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesExactly); 1021 diff.refine_changed_regions(find_word_ranges, CompareBytesExactly); 1022 diff.refine_changed_regions(find_nonword_ranges, CompareBytesExactly); 1023 diff.hunks().collect() 1024} 1025 1026#[cfg(test)] 1027mod tests { 1028 use super::*; 1029 1030 // Extracted to a function because type inference is ambiguous due to 1031 // `impl PartialEq<aho_corasick::util::search::Span> for std::ops::Range<usize>` 1032 fn no_ranges() -> Vec<Range<usize>> { 1033 vec![] 1034 } 1035 1036 #[test] 1037 fn test_find_line_ranges_empty() { 1038 assert_eq!(find_line_ranges(b""), no_ranges()); 1039 } 1040 1041 #[test] 1042 fn test_find_line_ranges_blank_line() { 1043 assert_eq!(find_line_ranges(b"\n"), vec![0..1]); 1044 } 1045 1046 #[test] 1047 fn test_find_line_ranges_missing_newline_at_eof() { 1048 assert_eq!(find_line_ranges(b"foo"), vec![0..3]); 1049 } 1050 1051 #[test] 1052 fn test_find_line_ranges_multiple_lines() { 1053 assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]); 1054 } 1055 1056 #[test] 1057 fn test_find_word_ranges_empty() { 1058 assert_eq!(find_word_ranges(b""), no_ranges()); 1059 } 1060 1061 #[test] 1062 fn test_find_word_ranges_single_word() { 1063 assert_eq!(find_word_ranges(b"Abc"), vec![0..3]); 1064 } 1065 1066 #[test] 1067 fn test_find_word_ranges_no_word() { 1068 assert_eq!(find_word_ranges(b"+-*/"), no_ranges()); 1069 } 1070 1071 #[test] 1072 fn test_find_word_ranges_word_then_non_word() { 1073 assert_eq!(find_word_ranges(b"Abc "), vec![0..3]); 1074 } 1075 1076 #[test] 1077 fn test_find_word_ranges_non_word_then_word() { 1078 assert_eq!(find_word_ranges(b" Abc"), vec![3..6]); 1079 } 1080 1081 #[test] 1082 fn test_find_word_ranges_multibyte() { 1083 assert_eq!(find_word_ranges("".as_bytes()), vec![0..3]); 1084 } 1085 1086 #[test] 1087 fn test_find_lcs_empty() { 1088 let empty: Vec<(usize, usize)> = vec![]; 1089 assert_eq!(find_lcs(&[]), empty); 1090 } 1091 1092 #[test] 1093 fn test_find_lcs_single_element() { 1094 assert_eq!(find_lcs(&[0]), vec![(0, 0)]); 1095 } 1096 1097 #[test] 1098 fn test_find_lcs_in_order() { 1099 assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]); 1100 } 1101 1102 #[test] 1103 fn test_find_lcs_reverse_order() { 1104 assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]); 1105 } 1106 1107 #[test] 1108 fn test_find_lcs_two_swapped() { 1109 assert_eq!( 1110 find_lcs(&[0, 1, 4, 3, 2, 5, 6]), 1111 vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)] 1112 ); 1113 } 1114 1115 #[test] 1116 fn test_find_lcs_element_moved_earlier() { 1117 assert_eq!( 1118 find_lcs(&[0, 1, 4, 2, 3, 5, 6]), 1119 vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)] 1120 ); 1121 } 1122 1123 #[test] 1124 fn test_find_lcs_element_moved_later() { 1125 assert_eq!( 1126 find_lcs(&[0, 1, 3, 4, 2, 5, 6]), 1127 vec![(0, 0), (1, 1), (3, 2), (4, 3), (5, 5), (6, 6)] 1128 ); 1129 } 1130 1131 #[test] 1132 fn test_find_lcs_interleaved_longest_chains() { 1133 assert_eq!( 1134 find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]), 1135 vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)] 1136 ); 1137 } 1138 1139 #[test] 1140 fn test_find_word_ranges_many_words() { 1141 assert_eq!( 1142 find_word_ranges(b"fn find_words(text: &[u8])"), 1143 vec![0..2, 3..13, 14..18, 22..24] 1144 ); 1145 } 1146 1147 #[test] 1148 fn test_compare_bytes_ignore_all_whitespace() { 1149 let comp = WordComparator::new(CompareBytesIgnoreAllWhitespace); 1150 let hash = |data: &[u8]| comp.hash_one(data); 1151 1152 assert!(comp.eq(b"", b"")); 1153 assert!(comp.eq(b"", b" ")); 1154 assert!(comp.eq(b"\t", b"\r")); 1155 assert_eq!(hash(b""), hash(b"")); 1156 assert_eq!(hash(b""), hash(b" ")); 1157 assert_eq!(hash(b""), hash(b"\t")); 1158 assert_eq!(hash(b""), hash(b"\r")); 1159 1160 assert!(comp.eq(b"ab", b" a b\t")); 1161 assert_eq!(hash(b"ab"), hash(b" a b\t")); 1162 1163 assert!(!comp.eq(b"a", b"")); 1164 assert!(!comp.eq(b"a", b" ")); 1165 assert!(!comp.eq(b"a", b"ab")); 1166 assert!(!comp.eq(b"ab", b"ba")); 1167 } 1168 1169 #[test] 1170 fn test_compare_bytes_ignore_whitespace_amount() { 1171 let comp = WordComparator::new(CompareBytesIgnoreWhitespaceAmount); 1172 let hash = |data: &[u8]| comp.hash_one(data); 1173 1174 assert!(comp.eq(b"", b"")); 1175 assert!(comp.eq(b"\n", b" \n")); 1176 assert!(comp.eq(b"\t", b"\r")); 1177 assert_eq!(hash(b""), hash(b"")); 1178 assert_eq!(hash(b" "), hash(b"\n")); 1179 assert_eq!(hash(b" "), hash(b" \n")); 1180 assert_eq!(hash(b" "), hash(b"\t")); 1181 assert_eq!(hash(b" "), hash(b"\r")); 1182 1183 assert!(comp.eq(b"a b c\n", b"a b\tc\r\n")); 1184 assert_eq!(hash(b"a b c\n"), hash(b"a b\tc\r\n")); 1185 1186 assert!(!comp.eq(b"", b" ")); 1187 assert!(!comp.eq(b"a", b"")); 1188 assert!(!comp.eq(b"a", b" ")); 1189 assert!(!comp.eq(b"a", b"a ")); 1190 assert!(!comp.eq(b"a", b" a")); 1191 assert!(!comp.eq(b"a", b"ab")); 1192 assert!(!comp.eq(b"ab", b"ba")); 1193 assert!(!comp.eq(b"ab", b"a b")); 1194 } 1195 1196 fn unchanged_ranges( 1197 (left_text, left_ranges): (&[u8], &[Range<usize>]), 1198 (right_text, right_ranges): (&[u8], &[Range<usize>]), 1199 ) -> Vec<(Range<usize>, Range<usize>)> { 1200 let comp = WordComparator::new(CompareBytesExactly); 1201 let left = DiffSource::new(left_text, left_ranges, &comp); 1202 let right = DiffSource::new(right_text, right_ranges, &comp); 1203 let mut positions = Vec::new(); 1204 collect_unchanged_words(&mut positions, &left.local(), &right.local(), &comp); 1205 positions 1206 .into_iter() 1207 .map(|(left_pos, right_pos)| (left.range_at(left_pos), right.range_at(right_pos))) 1208 .collect() 1209 } 1210 1211 #[test] 1212 fn test_unchanged_ranges_insert_in_middle() { 1213 assert_eq!( 1214 unchanged_ranges( 1215 (b"a b b c", &[0..1, 2..3, 4..5, 6..7]), 1216 (b"a b X b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), 1217 ), 1218 vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)] 1219 ); 1220 } 1221 1222 #[test] 1223 fn test_unchanged_ranges_non_unique_removed() { 1224 // We used to consider the first two "a" in the first input to match the two 1225 // "a"s in the second input. We no longer do. 1226 assert_eq!( 1227 unchanged_ranges( 1228 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1229 (b"a b a c", &[0..1, 2..3, 4..5, 6..7]), 1230 ), 1231 vec![(0..1, 0..1)] 1232 ); 1233 assert_eq!( 1234 unchanged_ranges( 1235 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1236 (b"b a c a", &[0..1, 2..3, 4..5, 6..7]), 1237 ), 1238 vec![(6..7, 6..7)] 1239 ); 1240 assert_eq!( 1241 unchanged_ranges( 1242 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1243 (b"b a a c", &[0..1, 2..3, 4..5, 6..7]), 1244 ), 1245 vec![] 1246 ); 1247 assert_eq!( 1248 unchanged_ranges( 1249 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1250 (b"a b c a", &[0..1, 2..3, 4..5, 6..7]), 1251 ), 1252 vec![(0..1, 0..1), (6..7, 6..7)] 1253 ); 1254 } 1255 1256 #[test] 1257 fn test_unchanged_ranges_non_unique_added() { 1258 // We used to consider the first two "a" in the first input to match the two 1259 // "a"s in the second input. We no longer do. 1260 assert_eq!( 1261 unchanged_ranges( 1262 (b"a b a c", &[0..1, 2..3, 4..5, 6..7]), 1263 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1264 ), 1265 vec![(0..1, 0..1)] 1266 ); 1267 assert_eq!( 1268 unchanged_ranges( 1269 (b"b a c a", &[0..1, 2..3, 4..5, 6..7]), 1270 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1271 ), 1272 vec![(6..7, 6..7)] 1273 ); 1274 assert_eq!( 1275 unchanged_ranges( 1276 (b"b a a c", &[0..1, 2..3, 4..5, 6..7]), 1277 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1278 ), 1279 vec![] 1280 ); 1281 assert_eq!( 1282 unchanged_ranges( 1283 (b"a b c a", &[0..1, 2..3, 4..5, 6..7]), 1284 (b"a a a a", &[0..1, 2..3, 4..5, 6..7]), 1285 ), 1286 vec![(0..1, 0..1), (6..7, 6..7)] 1287 ); 1288 } 1289 1290 #[test] 1291 fn test_unchanged_ranges_recursion_needed() { 1292 // "|" matches first, then "b" matches within the left/right range. 1293 assert_eq!( 1294 unchanged_ranges( 1295 (b"a b | b", &[0..1, 2..3, 4..5, 6..7]), 1296 (b"b c d |", &[0..1, 2..3, 4..5, 6..7]), 1297 ), 1298 vec![(2..3, 0..1), (4..5, 6..7)] 1299 ); 1300 assert_eq!( 1301 unchanged_ranges( 1302 (b"| b c d", &[0..1, 2..3, 4..5, 6..7]), 1303 (b"b | a b", &[0..1, 2..3, 4..5, 6..7]), 1304 ), 1305 vec![(0..1, 2..3), (2..3, 6..7)] 1306 ); 1307 // "|" matches first, then the middle range is trimmed. 1308 assert_eq!( 1309 unchanged_ranges( 1310 (b"| b c |", &[0..1, 2..3, 4..5, 6..7]), 1311 (b"| b b |", &[0..1, 2..3, 4..5, 6..7]), 1312 ), 1313 vec![(0..1, 0..1), (2..3, 2..3), (6..7, 6..7)] 1314 ); 1315 assert_eq!( 1316 unchanged_ranges( 1317 (b"| c c |", &[0..1, 2..3, 4..5, 6..7]), 1318 (b"| b c |", &[0..1, 2..3, 4..5, 6..7]), 1319 ), 1320 vec![(0..1, 0..1), (4..5, 4..5), (6..7, 6..7)] 1321 ); 1322 // "|" matches first, then "a", then "b". 1323 assert_eq!( 1324 unchanged_ranges( 1325 (b"a b c | a", &[0..1, 2..3, 4..5, 6..7, 8..9]), 1326 (b"b a b |", &[0..1, 2..3, 4..5, 6..7]), 1327 ), 1328 vec![(0..1, 2..3), (2..3, 4..5), (6..7, 6..7)] 1329 ); 1330 assert_eq!( 1331 unchanged_ranges( 1332 (b"| b a b", &[0..1, 2..3, 4..5, 6..7]), 1333 (b"a | a b c", &[0..1, 2..3, 4..5, 6..7, 8..9]), 1334 ), 1335 vec![(0..1, 2..3), (4..5, 4..5), (6..7, 6..7)] 1336 ); 1337 } 1338 1339 #[test] 1340 fn test_diff_single_input() { 1341 assert_eq!(diff(["abc"]), vec![DiffHunk::matching(["abc"])]); 1342 } 1343 1344 #[test] 1345 fn test_diff_some_empty_inputs() { 1346 // All empty 1347 assert_eq!(diff([""]), vec![]); 1348 assert_eq!(diff(["", ""]), vec![]); 1349 assert_eq!(diff(["", "", ""]), vec![]); 1350 1351 // One empty 1352 assert_eq!(diff(["a b", ""]), vec![DiffHunk::different(["a b", ""])]); 1353 assert_eq!(diff(["", "a b"]), vec![DiffHunk::different(["", "a b"])]); 1354 1355 // One empty, two match 1356 assert_eq!( 1357 diff(["a b", "", "a b"]), 1358 vec![DiffHunk::different(["a b", "", "a b"])] 1359 ); 1360 assert_eq!( 1361 diff(["", "a b", "a b"]), 1362 vec![DiffHunk::different(["", "a b", "a b"])] 1363 ); 1364 1365 // Two empty, one differs 1366 assert_eq!( 1367 diff(["a b", "", ""]), 1368 vec![DiffHunk::different(["a b", "", ""])] 1369 ); 1370 assert_eq!( 1371 diff(["", "a b", ""]), 1372 vec![DiffHunk::different(["", "a b", ""])] 1373 ); 1374 } 1375 1376 #[test] 1377 fn test_diff_two_inputs_one_different() { 1378 assert_eq!( 1379 diff(["a b c", "a X c"]), 1380 vec![ 1381 DiffHunk::matching(["a "].repeat(2)), 1382 DiffHunk::different(["b", "X"]), 1383 DiffHunk::matching([" c"].repeat(2)), 1384 ] 1385 ); 1386 } 1387 1388 #[test] 1389 fn test_diff_multiple_inputs_one_different() { 1390 assert_eq!( 1391 diff(["a b c", "a X c", "a b c"]), 1392 vec![ 1393 DiffHunk::matching(["a "].repeat(3)), 1394 DiffHunk::different(["b", "X", "b"]), 1395 DiffHunk::matching([" c"].repeat(3)), 1396 ] 1397 ); 1398 } 1399 1400 #[test] 1401 fn test_diff_multiple_inputs_all_different() { 1402 assert_eq!( 1403 diff(["a b c", "a X c", "a c X"]), 1404 vec![ 1405 DiffHunk::matching(["a "].repeat(3)), 1406 DiffHunk::different(["b ", "X ", ""]), 1407 DiffHunk::matching(["c"].repeat(3)), 1408 DiffHunk::different(["", "", " X"]), 1409 ] 1410 ); 1411 } 1412 1413 #[test] 1414 fn test_diff_for_tokenizer_compacted() { 1415 // Tests that unchanged regions are compacted when using for_tokenizer() 1416 let diff = Diff::for_tokenizer( 1417 ["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"], 1418 find_line_ranges, 1419 CompareBytesExactly, 1420 ); 1421 assert_eq!( 1422 diff.hunks().collect_vec(), 1423 vec![ 1424 DiffHunk::matching(["a\nb\nc\n"].repeat(2)), 1425 DiffHunk::different(["d\n", "X\n"]), 1426 DiffHunk::matching(["e\nf\ng"].repeat(2)), 1427 ] 1428 ); 1429 } 1430 1431 #[test] 1432 fn test_diff_nothing_in_common() { 1433 assert_eq!( 1434 diff(["aaa", "bb"]), 1435 vec![DiffHunk::different(["aaa", "bb"])] 1436 ); 1437 } 1438 1439 #[test] 1440 fn test_diff_insert_in_middle() { 1441 assert_eq!( 1442 diff(["a z", "a S z"]), 1443 vec![ 1444 DiffHunk::matching(["a "].repeat(2)), 1445 DiffHunk::different(["", "S "]), 1446 DiffHunk::matching(["z"].repeat(2)), 1447 ] 1448 ); 1449 } 1450 1451 #[test] 1452 fn test_diff_no_unique_middle_flips() { 1453 assert_eq!( 1454 diff(["a R R S S z", "a S S R R z"]), 1455 vec![ 1456 DiffHunk::matching(["a "].repeat(2)), 1457 DiffHunk::different(["R R ", ""]), 1458 DiffHunk::matching(["S S "].repeat(2)), 1459 DiffHunk::different(["", "R R "]), 1460 DiffHunk::matching(["z"].repeat(2)) 1461 ], 1462 ); 1463 } 1464 1465 #[test] 1466 fn test_diff_recursion_needed() { 1467 assert_eq!( 1468 diff([ 1469 "a q x q y q z q b q y q x q c", 1470 "a r r x q y z q b y q x r r c", 1471 ]), 1472 vec![ 1473 DiffHunk::matching(["a "].repeat(2)), 1474 DiffHunk::different(["q", "r"]), 1475 DiffHunk::matching([" "].repeat(2)), 1476 DiffHunk::different(["", "r "]), 1477 DiffHunk::matching(["x q y "].repeat(2)), 1478 DiffHunk::different(["q ", ""]), 1479 DiffHunk::matching(["z q b "].repeat(2)), 1480 DiffHunk::different(["q ", ""]), 1481 DiffHunk::matching(["y q x "].repeat(2)), 1482 DiffHunk::different(["q", "r"]), 1483 DiffHunk::matching([" "].repeat(2)), 1484 DiffHunk::different(["", "r "]), 1485 DiffHunk::matching(["c"].repeat(2)), 1486 ] 1487 ); 1488 } 1489 1490 #[test] 1491 fn test_diff_ignore_all_whitespace() { 1492 fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> { 1493 let diff = 1494 Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreAllWhitespace); 1495 diff.hunks().collect() 1496 } 1497 1498 assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]); 1499 assert_eq!( 1500 diff(["a\n", " a\r\n"]), 1501 vec![DiffHunk::matching(["a\n", " a\r\n"])] 1502 ); 1503 assert_eq!( 1504 diff(["a\n", " a\nb"]), 1505 vec![ 1506 DiffHunk::matching(["a\n", " a\n"]), 1507 DiffHunk::different(["", "b"]), 1508 ] 1509 ); 1510 1511 // No LCS matches, so trim leading/trailing common lines 1512 assert_eq!( 1513 diff(["a\nc\n", " a\n a\n"]), 1514 vec![ 1515 DiffHunk::matching(["a\n", " a\n"]), 1516 DiffHunk::different(["c\n", " a\n"]), 1517 ] 1518 ); 1519 assert_eq!( 1520 diff(["c\na\n", " a\n a\n"]), 1521 vec![ 1522 DiffHunk::different(["c\n", " a\n"]), 1523 DiffHunk::matching(["a\n", " a\n"]), 1524 ] 1525 ); 1526 } 1527 1528 #[test] 1529 fn test_diff_ignore_whitespace_amount() { 1530 fn diff(inputs: [&str; 2]) -> Vec<DiffHunk<'_>> { 1531 let diff = 1532 Diff::for_tokenizer(inputs, find_line_ranges, CompareBytesIgnoreWhitespaceAmount); 1533 diff.hunks().collect() 1534 } 1535 1536 assert_eq!(diff(["", "\n"]), vec![DiffHunk::different(["", "\n"])]); 1537 // whitespace at line end is ignored 1538 assert_eq!( 1539 diff(["a\n", "a\r\n"]), 1540 vec![DiffHunk::matching(["a\n", "a\r\n"])] 1541 ); 1542 // but whitespace at line start isn't 1543 assert_eq!( 1544 diff(["a\n", " a\n"]), 1545 vec![DiffHunk::different(["a\n", " a\n"])] 1546 ); 1547 assert_eq!( 1548 diff(["a\n", "a \nb"]), 1549 vec![ 1550 DiffHunk::matching(["a\n", "a \n"]), 1551 DiffHunk::different(["", "b"]), 1552 ] 1553 ); 1554 } 1555 1556 #[test] 1557 fn test_diff_hunk_iterator() { 1558 let diff = Diff::by_word(["a b c", "a XX c", "a b "]); 1559 assert_eq!( 1560 diff.hunks().collect_vec(), 1561 vec![ 1562 DiffHunk::matching(["a "].repeat(3)), 1563 DiffHunk::different(["b", "XX", "b"]), 1564 DiffHunk::matching([" "].repeat(3)), 1565 DiffHunk::different(["c", "c", ""]), 1566 ] 1567 ); 1568 assert_eq!( 1569 diff.hunk_ranges().collect_vec(), 1570 vec![ 1571 DiffHunkRange { 1572 kind: DiffHunkKind::Matching, 1573 ranges: smallvec![0..2, 0..2, 0..2], 1574 }, 1575 DiffHunkRange { 1576 kind: DiffHunkKind::Different, 1577 ranges: smallvec![2..3, 2..4, 2..3], 1578 }, 1579 DiffHunkRange { 1580 kind: DiffHunkKind::Matching, 1581 ranges: smallvec![3..4, 4..5, 3..4], 1582 }, 1583 DiffHunkRange { 1584 kind: DiffHunkKind::Different, 1585 ranges: smallvec![4..5, 5..6, 4..4], 1586 }, 1587 ] 1588 ); 1589 } 1590 1591 #[test] 1592 fn test_diff_real_case_write_fmt() { 1593 // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the 1594 // need for recursion into the range at the end: after splitting at "Arguments" 1595 // and "formatter", the region at the end has the unique words "write_fmt" 1596 // and "fmt", but we forgot to recurse into that region, so we ended up 1597 // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)". 1598 #[rustfmt::skip] 1599 assert_eq!( 1600 diff([ 1601 " pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n self.styler().write_fmt(fmt).unwrap()\n", 1602 " pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n" 1603 ]), 1604 vec![ 1605 DiffHunk::matching([" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) "].repeat(2)), 1606 DiffHunk::different(["", "-> io::Result<()> "]), 1607 DiffHunk::matching(["{\n self.styler().write_fmt(fmt)"].repeat(2)), 1608 DiffHunk::different([".unwrap()", ""]), 1609 DiffHunk::matching(["\n"].repeat(2)) 1610 ] 1611 ); 1612 } 1613 1614 #[test] 1615 fn test_diff_real_case_gitgit_read_tree_c() { 1616 // This is the diff from commit e497ea2a9b in the git.git repo 1617 #[rustfmt::skip] 1618 assert_eq!( 1619 diff([ 1620 r##"/* 1621 * GIT - The information manager from hell 1622 * 1623 * Copyright (C) Linus Torvalds, 2005 1624 */ 1625#include "#cache.h" 1626 1627static int unpack(unsigned char *sha1) 1628{ 1629 void *buffer; 1630 unsigned long size; 1631 char type[20]; 1632 1633 buffer = read_sha1_file(sha1, type, &size); 1634 if (!buffer) 1635 usage("unable to read sha1 file"); 1636 if (strcmp(type, "tree")) 1637 usage("expected a 'tree' node"); 1638 while (size) { 1639 int len = strlen(buffer)+1; 1640 unsigned char *sha1 = buffer + len; 1641 char *path = strchr(buffer, ' ')+1; 1642 unsigned int mode; 1643 if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1) 1644 usage("corrupt 'tree' file"); 1645 buffer = sha1 + 20; 1646 size -= len + 20; 1647 printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1)); 1648 } 1649 return 0; 1650} 1651 1652int main(int argc, char **argv) 1653{ 1654 int fd; 1655 unsigned char sha1[20]; 1656 1657 if (argc != 2) 1658 usage("read-tree <key>"); 1659 if (get_sha1_hex(argv[1], sha1) < 0) 1660 usage("read-tree <key>"); 1661 sha1_file_directory = getenv(DB_ENVIRONMENT); 1662 if (!sha1_file_directory) 1663 sha1_file_directory = DEFAULT_DB_ENVIRONMENT; 1664 if (unpack(sha1) < 0) 1665 usage("unpack failed"); 1666 return 0; 1667} 1668"##, 1669 r##"/* 1670 * GIT - The information manager from hell 1671 * 1672 * Copyright (C) Linus Torvalds, 2005 1673 */ 1674#include "#cache.h" 1675 1676static void create_directories(const char *path) 1677{ 1678 int len = strlen(path); 1679 char *buf = malloc(len + 1); 1680 const char *slash = path; 1681 1682 while ((slash = strchr(slash+1, '/')) != NULL) { 1683 len = slash - path; 1684 memcpy(buf, path, len); 1685 buf[len] = 0; 1686 mkdir(buf, 0700); 1687 } 1688} 1689 1690static int create_file(const char *path) 1691{ 1692 int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600); 1693 if (fd < 0) { 1694 if (errno == ENOENT) { 1695 create_directories(path); 1696 fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600); 1697 } 1698 } 1699 return fd; 1700} 1701 1702static int unpack(unsigned char *sha1) 1703{ 1704 void *buffer; 1705 unsigned long size; 1706 char type[20]; 1707 1708 buffer = read_sha1_file(sha1, type, &size); 1709 if (!buffer) 1710 usage("unable to read sha1 file"); 1711 if (strcmp(type, "tree")) 1712 usage("expected a 'tree' node"); 1713 while (size) { 1714 int len = strlen(buffer)+1; 1715 unsigned char *sha1 = buffer + len; 1716 char *path = strchr(buffer, ' ')+1; 1717 char *data; 1718 unsigned long filesize; 1719 unsigned int mode; 1720 int fd; 1721 1722 if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1) 1723 usage("corrupt 'tree' file"); 1724 buffer = sha1 + 20; 1725 size -= len + 20; 1726 data = read_sha1_file(sha1, type, &filesize); 1727 if (!data || strcmp(type, "blob")) 1728 usage("tree file refers to bad file data"); 1729 fd = create_file(path); 1730 if (fd < 0) 1731 usage("unable to create file"); 1732 if (write(fd, data, filesize) != filesize) 1733 usage("unable to write file"); 1734 fchmod(fd, mode); 1735 close(fd); 1736 free(data); 1737 } 1738 return 0; 1739} 1740 1741int main(int argc, char **argv) 1742{ 1743 int fd; 1744 unsigned char sha1[20]; 1745 1746 if (argc != 2) 1747 usage("read-tree <key>"); 1748 if (get_sha1_hex(argv[1], sha1) < 0) 1749 usage("read-tree <key>"); 1750 sha1_file_directory = getenv(DB_ENVIRONMENT); 1751 if (!sha1_file_directory) 1752 sha1_file_directory = DEFAULT_DB_ENVIRONMENT; 1753 if (unpack(sha1) < 0) 1754 usage("unpack failed"); 1755 return 0; 1756} 1757"##, 1758 ]), 1759 vec![ 1760 DiffHunk::matching(["/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\n"].repeat(2)), 1761 DiffHunk::different(["", "static void create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\n"]), 1762 DiffHunk::matching(["static int unpack(unsigned char *sha1)\n{\n\tvoid *buffer;\n\tunsigned long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1;\n"].repeat(2)), 1763 DiffHunk::different(["", "\t\tchar *data;\n\t\tunsigned long filesize;\n"]), 1764 DiffHunk::matching(["\t\tunsigned int mode;\n"].repeat(2)), 1765 DiffHunk::different(["", "\t\tint fd;\n\n"]), 1766 DiffHunk::matching(["\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n\t\t"].repeat(2)), 1767 DiffHunk::different(["printf(\"%o %s (%s)\\n\", mode, path,", "data ="]), 1768 DiffHunk::matching([" "].repeat(2)), 1769 DiffHunk::different(["sha1_to_hex", "read_sha1_file"]), 1770 DiffHunk::matching(["(sha1"].repeat(2)), 1771 DiffHunk::different([")", ", type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, mode);\n\t\tclose(fd);\n\t\tfree(data"]), 1772 DiffHunk::matching([");\n\t}\n\treturn 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree <key>\");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree <key>\");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n"].repeat(2)), 1773 ] 1774 ); 1775 } 1776}