blob: 733c5ade02615c80fda7ead4376d154c513493c1 [file] [log] [blame]
//! Myers' diff algorithm.
//!
//! * time: `O((N+M)D)`
//! * space `O(N+M)`
//!
//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
//! describing it.
//!
//! The implementation of this algorithm is based on the implementation by
//! Brandon Williams.
//!
//! # Heuristics
//!
//! At present this implementation of Myers' does not implement any more advanced
//! heuristics that would solve some pathological cases. For instance passing two
//! large and completely distinct sequences to the algorithm will make it spin
//! without making reasonable progress. Currently the only protection in the
//! library against this is to pass a deadline to the diffing algorithm.
//!
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
use std::ops::{Index, IndexMut, Range};
use std::time::Instant;
use crate::algorithms::utils::{common_prefix_len, common_suffix_len, is_empty_range};
use crate::algorithms::DiffHook;
/// Myers' diff algorithm.
///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
pub fn diff<Old, New, D>(
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
diff_deadline(d, old, old_range, new, new_range, None)
}
/// Myers' diff algorithm with deadline.
///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
///
/// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to an approximation.
pub fn diff_deadline<Old, New, D>(
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
let max_d = max_d(old_range.len(), new_range.len());
let mut vb = V::new(max_d);
let mut vf = V::new(max_d);
conquer(
d, old, old_range, new, new_range, &mut vf, &mut vb, deadline,
)?;
d.finish()
}
// A D-path is a path which starts at (0,0) that has exactly D non-diagonal
// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge
// and then a possibly empty sequence of diagonal edges called a snake.
/// `V` contains the endpoints of the furthest reaching `D-paths`. For each
/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because
/// `y` can be computed from `x - k`. In other words, `V` is an array of integers
/// where `V[k]` contains the row index of the endpoint of the furthest reaching
/// path in diagonal `k`.
///
/// We can't use a traditional Vec to represent `V` since we use `k` as an index
/// and it can take on negative values. So instead `V` is represented as a
/// light-weight wrapper around a Vec plus an `offset` which is the maximum value
/// `k` can take on in order to map negative `k`'s back to a value >= 0.
#[derive(Debug)]
struct V {
offset: isize,
v: Vec<usize>, // Look into initializing this to -1 and storing isize
}
impl V {
fn new(max_d: usize) -> Self {
Self {
offset: max_d as isize,
v: vec![0; 2 * max_d],
}
}
fn len(&self) -> usize {
self.v.len()
}
}
impl Index<isize> for V {
type Output = usize;
fn index(&self, index: isize) -> &Self::Output {
&self.v[(index + self.offset) as usize]
}
}
impl IndexMut<isize> for V {
fn index_mut(&mut self, index: isize) -> &mut Self::Output {
&mut self.v[(index + self.offset) as usize]
}
}
fn max_d(len1: usize, len2: usize) -> usize {
// XXX look into reducing the need to have the additional '+ 1'
(len1 + len2 + 1) / 2 + 1
}
#[inline(always)]
fn split_at(range: Range<usize>, at: usize) -> (Range<usize>, Range<usize>) {
(range.start..at, at..range.end)
}
/// A `Snake` is a sequence of diagonal edges in the edit graph. Normally
/// a snake has a start end end point (and it is possible for a snake to have
/// a length of zero, meaning the start and end points are the same) however
/// we do not need the end point which is why it's not implemented here.
///
/// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes
/// some of which may be empty. The divide step requires finding the ceil(D/2) +
/// 1 or middle snake of an optimal D-path. The idea for doing so is to
/// simultaneously run the basic algorithm in both the forward and reverse
/// directions until furthest reaching forward and reverse paths starting at
/// opposing corners 'overlap'.
fn find_middle_snake<Old, New>(
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
vf: &mut V,
vb: &mut V,
deadline: Option<Instant>,
) -> Option<(usize, usize)>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
New::Output: PartialEq<Old::Output>,
{
let n = old_range.len();
let m = new_range.len();
// By Lemma 1 in the paper, the optimal edit script length is odd or even as
// `delta` is odd or even.
let delta = n as isize - m as isize;
let odd = delta & 1 == 1;
// The initial point at (0, -1)
vf[1] = 0;
// The initial point at (N, M+1)
vb[1] = 0;
// We only need to explore ceil(D/2) + 1
let d_max = max_d(n, m);
assert!(vf.len() >= d_max);
assert!(vb.len() >= d_max);
for d in 0..d_max as isize {
// are we running for too long?
if let Some(deadline) = deadline {
if Instant::now() > deadline {
break;
}
}
// Forward path
for k in (-d..=d).rev().step_by(2) {
let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
vf[k + 1]
} else {
vf[k - 1] + 1
};
let y = (x as isize - k) as usize;
// The coordinate of the start of a snake
let (x0, y0) = (x, y);
// While these sequences are identical, keep moving through the
// graph with no cost
if x < old_range.len() && y < new_range.len() {
let advance = common_prefix_len(
old,
old_range.start + x..old_range.end,
new,
new_range.start + y..new_range.end,
);
x += advance;
}
// This is the new best x value
vf[k] = x;
// Only check for connections from the forward search when N - M is
// odd and when there is a reciprocal k line coming from the other
// direction.
if odd && (k - delta).abs() <= (d - 1) {
// TODO optimize this so we don't have to compare against n
if vf[k] + vb[-(k - delta)] >= n {
// Return the snake
return Some((x0 + old_range.start, y0 + new_range.start));
}
}
}
// Backward path
for k in (-d..=d).rev().step_by(2) {
let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) {
vb[k + 1]
} else {
vb[k - 1] + 1
};
let mut y = (x as isize - k) as usize;
// The coordinate of the start of a snake
if x < n && y < m {
let advance = common_suffix_len(
old,
old_range.start..old_range.start + n - x,
new,
new_range.start..new_range.start + m - y,
);
x += advance;
y += advance;
}
// This is the new best x value
vb[k] = x;
if !odd && (k - delta).abs() <= d {
// TODO optimize this so we don't have to compare against n
if vb[k] + vf[-(k - delta)] >= n {
// Return the snake
return Some((n - x + old_range.start, m - y + new_range.start));
}
}
}
// TODO: Maybe there's an opportunity to optimize and bail early?
}
// deadline reached
None
}
#[allow(clippy::too_many_arguments)]
fn conquer<Old, New, D>(
d: &mut D,
old: &Old,
mut old_range: Range<usize>,
new: &New,
mut new_range: Range<usize>,
vf: &mut V,
vb: &mut V,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
// Check for common prefix
let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone());
if common_prefix_len > 0 {
d.equal(old_range.start, new_range.start, common_prefix_len)?;
}
old_range.start += common_prefix_len;
new_range.start += common_prefix_len;
// Check for common suffix
let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone());
let common_suffix = (
old_range.end - common_suffix_len,
new_range.end - common_suffix_len,
);
old_range.end -= common_suffix_len;
new_range.end -= common_suffix_len;
if is_empty_range(&old_range) && is_empty_range(&new_range) {
// Do nothing
} else if is_empty_range(&new_range) {
d.delete(old_range.start, old_range.len(), new_range.start)?;
} else if is_empty_range(&old_range) {
d.insert(old_range.start, new_range.start, new_range.len())?;
} else if let Some((x_start, y_start)) = find_middle_snake(
old,
old_range.clone(),
new,
new_range.clone(),
vf,
vb,
deadline,
) {
let (old_a, old_b) = split_at(old_range, x_start);
let (new_a, new_b) = split_at(new_range, y_start);
conquer(d, old, old_a, new, new_a, vf, vb, deadline)?;
conquer(d, old, old_b, new, new_b, vf, vb, deadline)?;
} else {
d.delete(
old_range.start,
old_range.end - old_range.start,
new_range.start,
)?;
d.insert(
old_range.start,
new_range.start,
new_range.end - new_range.start,
)?;
}
if common_suffix_len > 0 {
d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?;
}
Ok(())
}
#[test]
fn test_find_middle_snake() {
let a = &b"ABCABBA"[..];
let b = &b"CBABAC"[..];
let max_d = max_d(a.len(), b.len());
let mut vf = V::new(max_d);
let mut vb = V::new(max_d);
let (x_start, y_start) =
find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap();
assert_eq!(x_start, 4);
assert_eq!(y_start, 1);
}
#[test]
fn test_diff() {
let a: &[usize] = &[0, 1, 2, 3, 4];
let b: &[usize] = &[0, 1, 2, 9, 4];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops());
}
#[test]
fn test_contiguous() {
let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5];
let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops());
}
#[test]
fn test_pat() {
let a: &[usize] = &[0, 1, 3, 4, 5];
let b: &[usize] = &[0, 1, 4, 5, 8, 9];
let mut d = crate::algorithms::Capture::new();
diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.ops());
}
#[test]
fn test_deadline_reached() {
use std::ops::Index;
use std::time::Duration;
let a = (0..100).collect::<Vec<_>>();
let mut b = (0..100).collect::<Vec<_>>();
b[10] = 99;
b[50] = 99;
b[25] = 99;
struct SlowIndex<'a>(&'a [usize]);
impl<'a> Index<usize> for SlowIndex<'a> {
type Output = usize;
fn index(&self, index: usize) -> &Self::Output {
std::thread::sleep(Duration::from_millis(1));
&self.0[index]
}
}
let slow_a = SlowIndex(&a);
let slow_b = SlowIndex(&b);
// don't give it enough time to do anything interesting
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff_deadline(
&mut d,
&slow_a,
0..a.len(),
&slow_b,
0..b.len(),
Some(Instant::now() + Duration::from_millis(50)),
)
.unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops());
}
#[test]
fn test_finish_called() {
struct HasRunFinish(bool);
impl DiffHook for HasRunFinish {
type Error = ();
fn finish(&mut self) -> Result<(), Self::Error> {
self.0 = true;
Ok(())
}
}
let mut d = HasRunFinish(false);
let slice = &[1, 2];
let slice2 = &[1, 2, 3];
diff(&mut d, slice, 0..slice.len(), slice2, 0..slice2.len()).unwrap();
assert!(d.0);
let mut d = HasRunFinish(false);
let slice = &[1, 2];
diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap();
assert!(d.0);
let mut d = HasRunFinish(false);
let slice: &[u8] = &[];
diff(&mut d, slice, 0..slice.len(), slice, 0..slice.len()).unwrap();
assert!(d.0);
}