blob: 1442e54ca53de074fd375d7a112d84924f753c4a [file] [log] [blame]
//! ### Deviation
//!
//! Note that the algorithm implemented here is in many ways different from what `git` does.
//!
//! - it's less sophisticated and doesn't use any ranking of candidates. Instead, it picks the first possible match.
//! - the set used for copy-detection is probably smaller by default.
use std::ops::Range;
use bstr::BStr;
use gix_object::tree::{EntryKind, EntryMode};
use crate::{
blob::{platform::prepare_diff::Operation, DiffLineStats, ResourceKind},
rewrites::{CopySource, Outcome, Tracker},
Rewrites,
};
/// The kind of a change.
#[derive(Debug, Copy, Clone, Ord, PartialOrd, PartialEq, Eq)]
pub enum ChangeKind {
/// The change represents the *deletion* of an item.
Deletion,
/// The change represents the *modification* of an item.
Modification,
/// The change represents the *addition* of an item.
Addition,
}
/// A trait providing all functionality to abstract over the concept of a change, as seen by the [`Tracker`].
pub trait Change: Clone {
/// Return the hash of this change for identification.
///
/// Note that this is the id of the object as stored in `git`, i.e. it must have gone through workspace
/// conversions.
fn id(&self) -> &gix_hash::oid;
/// Return the kind of this change.
fn kind(&self) -> ChangeKind;
/// Return more information about the kind of entry affected by this change.
fn entry_mode(&self) -> EntryMode;
/// Return the id of the change along with its mode.
fn id_and_entry_mode(&self) -> (&gix_hash::oid, EntryMode);
}
/// A set of tracked items allows to figure out their relations by figuring out their similarity.
pub(crate) struct Item<T> {
/// The underlying raw change
change: T,
/// That slice into the backing for paths.
path: Range<usize>,
/// If true, this item was already emitted, i.e. seen by the caller.
emitted: bool,
}
impl<T: Change> Item<T> {
fn location<'a>(&self, backing: &'a [u8]) -> &'a BStr {
backing[self.path.clone()].as_ref()
}
fn entry_mode_compatible(&self, mode: EntryMode) -> bool {
use EntryKind::*;
matches!(
(mode.kind(), self.change.entry_mode().kind()),
(Blob | BlobExecutable, Blob | BlobExecutable) | (Link, Link)
)
}
fn is_source_for_destination_of(&self, kind: visit::SourceKind, dest_item_mode: EntryMode) -> bool {
self.entry_mode_compatible(dest_item_mode)
&& match kind {
visit::SourceKind::Rename => !self.emitted && matches!(self.change.kind(), ChangeKind::Deletion),
visit::SourceKind::Copy => {
matches!(self.change.kind(), ChangeKind::Modification)
}
}
}
}
/// A module with types used in the user-callback in [Tracker::emit()](crate::rewrites::Tracker::emit()).
pub mod visit {
use bstr::BStr;
use gix_object::tree::EntryMode;
use crate::blob::DiffLineStats;
/// The source of a rewrite, rename or copy.
#[derive(Debug, Clone, PartialEq, PartialOrd)]
pub struct Source<'a> {
/// The kind of entry.
pub entry_mode: EntryMode,
/// The hash of the state of the source as seen in the object database.
pub id: gix_hash::ObjectId,
/// Further specify what kind of source this is.
pub kind: SourceKind,
/// The repository-relative location of this entry.
pub location: &'a BStr,
/// If this is a rewrite, indicate how many lines would need to change to turn this source into the destination.
pub diff: Option<DiffLineStats>,
}
/// Further identify the kind of [Source].
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub enum SourceKind {
/// This is the source of an entry that was renamed, as `source` was renamed to `destination`.
Rename,
/// This is the source of a copy, as `source` was copied into `destination`.
Copy,
}
/// A change along with a location.
#[derive(Clone)]
pub struct Destination<'a, T: Clone> {
/// The change at the given `location`.
pub change: T,
/// The repository-relative location of this destination.
pub location: &'a BStr,
}
}
///
pub mod emit {
/// The error returned by [Tracker::emit()](super::Tracker::emit()).
#[derive(Debug, thiserror::Error)]
#[allow(missing_docs)]
pub enum Error {
#[error("Could not find blob for similarity checking")]
FindExistingBlob(#[from] gix_object::find::existing_object::Error),
#[error("Could not obtain exhaustive item set to use as possible sources for copy detection")]
GetItemsForExhaustiveCopyDetection(#[source] Box<dyn std::error::Error + Send + Sync>),
#[error(transparent)]
SetResource(#[from] crate::blob::platform::set_resource::Error),
#[error(transparent)]
PrepareDiff(#[from] crate::blob::platform::prepare_diff::Error),
}
}
/// Lifecycle
impl<T: Change> Tracker<T> {
/// Create a new instance with `rewrites` configuration.
pub fn new(rewrites: Rewrites) -> Self {
Tracker {
items: vec![],
path_backing: vec![],
rewrites,
}
}
}
/// build state and find matches.
impl<T: Change> Tracker<T> {
/// We may refuse the push if that information isn't needed for what we have to track.
pub fn try_push_change(&mut self, change: T, location: &BStr) -> Option<T> {
if !change.entry_mode().is_blob_or_symlink() {
return Some(change);
}
let keep = match (self.rewrites.copies, change.kind()) {
(Some(_find_copies), _) => true,
(None, ChangeKind::Modification { .. }) => false,
(None, _) => true,
};
if !keep {
return Some(change);
}
let start = self.path_backing.len();
self.path_backing.extend_from_slice(location);
self.items.push(Item {
path: start..self.path_backing.len(),
change,
emitted: false,
});
None
}
/// Can only be called once effectively as it alters its own state to assure each item is only emitted once.
///
/// `cb(destination, source)` is called for each item, either with `Some(source)` if it's
/// the destination of a copy or rename, or with `None` for source if no relation to other
/// items in the tracked set exist, which is like saying 'no rename or rewrite or copy' happened.
///
/// `objects` is used to access blob data for similarity checks if required and is taken directly from the object database.
/// Worktree filters and text conversions will be applied afterwards automatically. Note that object-caching *should not*
/// be enabled as caching is implemented by `diff_cache`, after all, the blob that's actually diffed is going
/// through conversion steps.
///
/// `diff_cache` is a way to retain a cache of resources that are prepared for rapid diffing, and it also controls
/// the diff-algorithm (provided no user-algorithm is set).
/// Note that we control a few options of `diff_cache` to assure it will ignore external commands.
/// Note that we do not control how the `diff_cache` converts resources, it's left to the caller to decide
/// if it should look at what's stored in `git`, or in the working tree, along with all diff-specific conversions.
///
/// `push_source_tree(push_fn: push(change, location))` is a function that is called when the entire tree of the source
/// should be added as modifications by calling `push` repeatedly to use for perfect copy tracking. Note that `push`
/// will panic if `change` is not a modification, and it's valid to not call `push` at all.
pub fn emit<PushSourceTreeFn, E>(
&mut self,
mut cb: impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
diff_cache: &mut crate::blob::Platform,
objects: &impl gix_object::FindObjectOrHeader,
mut push_source_tree: PushSourceTreeFn,
) -> Result<Outcome, emit::Error>
where
PushSourceTreeFn: FnMut(&mut dyn FnMut(T, &BStr)) -> Result<(), E>,
E: std::error::Error + Send + Sync + 'static,
{
diff_cache.options.skip_internal_diff_if_external_is_configured = false;
fn by_id_and_location<T: Change>(a: &Item<T>, b: &Item<T>) -> std::cmp::Ordering {
a.change
.id()
.cmp(b.change.id())
.then_with(|| a.path.start.cmp(&b.path.start).then(a.path.end.cmp(&b.path.end)))
}
self.items.sort_by(by_id_and_location);
let mut out = Outcome {
options: self.rewrites,
..Default::default()
};
self.match_pairs_of_kind(
visit::SourceKind::Rename,
&mut cb,
self.rewrites.percentage,
&mut out,
diff_cache,
objects,
)?;
if let Some(copies) = self.rewrites.copies {
self.match_pairs_of_kind(
visit::SourceKind::Copy,
&mut cb,
copies.percentage,
&mut out,
diff_cache,
objects,
)?;
match copies.source {
CopySource::FromSetOfModifiedFiles => {}
CopySource::FromSetOfModifiedFilesAndAllSources => {
push_source_tree(&mut |change, location| {
assert!(
self.try_push_change(change, location).is_none(),
"we must accept every change"
);
// make sure these aren't viable to be emitted anymore.
self.items.last_mut().expect("just pushed").emitted = true;
})
.map_err(|err| emit::Error::GetItemsForExhaustiveCopyDetection(Box::new(err)))?;
self.items.sort_by(by_id_and_location);
self.match_pairs_of_kind(
visit::SourceKind::Copy,
&mut cb,
copies.percentage,
&mut out,
diff_cache,
objects,
)?;
}
}
}
self.items
.sort_by(|a, b| a.location(&self.path_backing).cmp(b.location(&self.path_backing)));
for item in self.items.drain(..).filter(|item| !item.emitted) {
if cb(
visit::Destination {
location: item.location(&self.path_backing),
change: item.change,
},
None,
) == crate::tree::visit::Action::Cancel
{
break;
}
}
Ok(out)
}
}
impl<T: Change> Tracker<T> {
fn match_pairs_of_kind(
&mut self,
kind: visit::SourceKind,
cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
percentage: Option<f32>,
out: &mut Outcome,
diff_cache: &mut crate::blob::Platform,
objects: &impl gix_object::FindObjectOrHeader,
) -> Result<(), emit::Error> {
// we try to cheaply reduce the set of possibilities first, before possibly looking more exhaustively.
let needs_second_pass = !needs_exact_match(percentage);
if self.match_pairs(cb, None /* by identity */, kind, out, diff_cache, objects)?
== crate::tree::visit::Action::Cancel
{
return Ok(());
}
if needs_second_pass {
let is_limited = if self.rewrites.limit == 0 {
false
} else {
let (num_src, num_dst) =
estimate_involved_items(self.items.iter().map(|item| (item.emitted, item.change.kind())), kind);
let permutations = num_src * num_dst;
if permutations > self.rewrites.limit {
match kind {
visit::SourceKind::Rename => {
out.num_similarity_checks_skipped_for_rename_tracking_due_to_limit = permutations;
}
visit::SourceKind::Copy => {
out.num_similarity_checks_skipped_for_copy_tracking_due_to_limit = permutations;
}
}
true
} else {
false
}
};
if !is_limited {
self.match_pairs(cb, percentage, kind, out, diff_cache, objects)?;
}
}
Ok(())
}
fn match_pairs(
&mut self,
cb: &mut impl FnMut(visit::Destination<'_, T>, Option<visit::Source<'_>>) -> crate::tree::visit::Action,
percentage: Option<f32>,
kind: visit::SourceKind,
stats: &mut Outcome,
diff_cache: &mut crate::blob::Platform,
objects: &impl gix_object::FindObjectOrHeader,
) -> Result<crate::tree::visit::Action, emit::Error> {
let mut dest_ofs = 0;
while let Some((mut dest_idx, dest)) = self.items[dest_ofs..].iter().enumerate().find_map(|(idx, item)| {
(!item.emitted && matches!(item.change.kind(), ChangeKind::Addition)).then_some((idx, item))
}) {
dest_idx += dest_ofs;
dest_ofs = dest_idx + 1;
let src = find_match(
&self.items,
dest,
dest_idx,
percentage,
kind,
stats,
objects,
diff_cache,
&self.path_backing,
)?
.map(|(src_idx, src, diff)| {
let (id, entry_mode) = src.change.id_and_entry_mode();
let id = id.to_owned();
let location = src.location(&self.path_backing);
(
visit::Source {
entry_mode,
id,
kind,
location,
diff,
},
src_idx,
)
});
if src.is_none() {
continue;
}
let location = dest.location(&self.path_backing);
let change = dest.change.clone();
let dest = visit::Destination { change, location };
self.items[dest_idx].emitted = true;
if let Some(src_idx) = src.as_ref().map(|t| t.1) {
self.items[src_idx].emitted = true;
}
if cb(dest, src.map(|t| t.0)) == crate::tree::visit::Action::Cancel {
return Ok(crate::tree::visit::Action::Cancel);
}
}
Ok(crate::tree::visit::Action::Continue)
}
}
/// Returns the amount of viable sources and destinations for `items` as eligible for the given `kind` of operation.
fn estimate_involved_items(
items: impl IntoIterator<Item = (bool, ChangeKind)>,
kind: visit::SourceKind,
) -> (usize, usize) {
items
.into_iter()
.filter(|(emitted, _)| match kind {
visit::SourceKind::Rename => !*emitted,
visit::SourceKind::Copy => true,
})
.fold((0, 0), |(mut src, mut dest), (emitted, change_kind)| {
match change_kind {
ChangeKind::Addition => {
if kind == visit::SourceKind::Rename || !emitted {
dest += 1;
}
}
ChangeKind::Deletion => {
if kind == visit::SourceKind::Rename {
src += 1
}
}
ChangeKind::Modification => {
if kind == visit::SourceKind::Copy {
src += 1
}
}
}
(src, dest)
})
}
fn needs_exact_match(percentage: Option<f32>) -> bool {
percentage.map_or(true, |p| p >= 1.0)
}
/// <`src_idx`, src, possibly diff stat>
type SourceTuple<'a, T> = (usize, &'a Item<T>, Option<DiffLineStats>);
/// Find `item` in our set of items ignoring `item_idx` to avoid finding ourselves, by similarity indicated by `percentage`.
/// The latter can be `None` or `Some(x)` where `x>=1` for identity, and anything else for similarity.
/// We also ignore emitted items entirely.
/// Use `kind` to indicate what kind of match we are looking for, which might be deletions matching an `item` addition, or
/// any non-deletion otherwise.
/// Note that we always try to find by identity first even if a percentage is given as it's much faster and may reduce the set
/// of items to be searched.
#[allow(clippy::too_many_arguments)]
fn find_match<'a, T: Change>(
items: &'a [Item<T>],
item: &Item<T>,
item_idx: usize,
percentage: Option<f32>,
kind: visit::SourceKind,
stats: &mut Outcome,
objects: &impl gix_object::FindObjectOrHeader,
diff_cache: &mut crate::blob::Platform,
path_backing: &[u8],
) -> Result<Option<SourceTuple<'a, T>>, emit::Error> {
let (item_id, item_mode) = item.change.id_and_entry_mode();
if needs_exact_match(percentage) || item_mode.is_link() {
let first_idx = items.partition_point(|a| a.change.id() < item_id);
let range = match items.get(first_idx..).map(|items| {
let end = items
.iter()
.position(|a| a.change.id() != item_id)
.map_or(items.len(), |idx| first_idx + idx);
first_idx..end
}) {
Some(range) => range,
None => return Ok(None),
};
if range.is_empty() {
return Ok(None);
}
let res = items[range.clone()].iter().enumerate().find_map(|(mut src_idx, src)| {
src_idx += range.start;
(src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode)).then_some((src_idx, src, None))
});
if let Some(src) = res {
return Ok(Some(src));
}
} else {
let mut has_new = false;
let percentage = percentage.expect("it's set to something below 1.0 and we assured this");
debug_assert!(
item_mode.is_blob(),
"symlinks are matched exactly, and trees aren't used here"
);
for (can_idx, src) in items
.iter()
.enumerate()
.filter(|(src_idx, src)| *src_idx != item_idx && src.is_source_for_destination_of(kind, item_mode))
{
if !has_new {
diff_cache.set_resource(
item_id.to_owned(),
item_mode.kind(),
item.location(path_backing),
ResourceKind::NewOrDestination,
objects,
)?;
has_new = true;
}
let (src_id, src_mode) = src.change.id_and_entry_mode();
diff_cache.set_resource(
src_id.to_owned(),
src_mode.kind(),
src.location(path_backing),
ResourceKind::OldOrSource,
objects,
)?;
let prep = diff_cache.prepare_diff()?;
stats.num_similarity_checks += 1;
match prep.operation {
Operation::InternalDiff { algorithm } => {
let tokens =
crate::blob::intern::InternedInput::new(prep.old.intern_source(), prep.new.intern_source());
let counts = crate::blob::diff(
algorithm,
&tokens,
crate::blob::sink::Counter::new(diff::Statistics {
removed_bytes: 0,
input: &tokens,
}),
);
let old_data_len = prep.old.data.as_slice().unwrap_or_default().len();
let new_data_len = prep.new.data.as_slice().unwrap_or_default().len();
let similarity = (old_data_len - counts.wrapped) as f32 / old_data_len.max(new_data_len) as f32;
if similarity >= percentage {
return Ok(Some((
can_idx,
src,
DiffLineStats {
removals: counts.removals,
insertions: counts.insertions,
before: tokens.before.len().try_into().expect("interner handles only u32"),
after: tokens.after.len().try_into().expect("interner handles only u32"),
similarity,
}
.into(),
)));
}
}
Operation::ExternalCommand { .. } => {
unreachable!("we have disabled this possibility with an option")
}
Operation::SourceOrDestinationIsBinary => {
// TODO: figure out if git does more here
}
};
}
}
Ok(None)
}
mod diff {
use std::ops::Range;
pub struct Statistics<'a, 'data> {
pub removed_bytes: usize,
pub input: &'a crate::blob::intern::InternedInput<&'data [u8]>,
}
impl<'a, 'data> crate::blob::Sink for Statistics<'a, 'data> {
type Out = usize;
fn process_change(&mut self, before: Range<u32>, _after: Range<u32>) {
self.removed_bytes = self.input.before[before.start as usize..before.end as usize]
.iter()
.map(|token| self.input.interner[*token].len())
.sum();
}
fn finish(self) -> Self::Out {
self.removed_bytes
}
}
}
#[cfg(test)]
mod estimate_involved_items {
use super::estimate_involved_items;
use crate::rewrites::tracker::{visit::SourceKind, ChangeKind};
#[test]
fn renames_count_unemitted_as_sources_and_destinations() {
let items = [
(false, ChangeKind::Addition),
(true, ChangeKind::Deletion),
(true, ChangeKind::Deletion),
];
assert_eq!(
estimate_involved_items(items, SourceKind::Rename),
(0, 1),
"here we only have one eligible source, hence nothing to do"
);
assert_eq!(
estimate_involved_items(items.into_iter().map(|t| (false, t.1)), SourceKind::Rename),
(2, 1),
"now we have more possibilities as renames count un-emitted deletions as source"
);
}
#[test]
fn copies_do_not_count_additions_as_sources() {
let items = [
(false, ChangeKind::Addition),
(true, ChangeKind::Addition),
(true, ChangeKind::Deletion),
];
assert_eq!(
estimate_involved_items(items, SourceKind::Copy),
(0, 1),
"one addition as source, the other isn't counted as it's emitted, nor is it considered a copy-source.\
deletions don't count"
);
}
#[test]
fn copies_count_modifications_as_sources() {
let items = [
(false, ChangeKind::Addition),
(true, ChangeKind::Modification),
(false, ChangeKind::Modification),
];
assert_eq!(
estimate_involved_items(items, SourceKind::Copy),
(2, 1),
"any modifications is a valid source, emitted or not"
);
}
}