use filetime::FileTime;

use crate::{entry, extension, Entry, State, Version};

mod entries;
/// Decoding of the index file header.
pub mod header;

mod error {

    use crate::{decode, extension};

    /// The error returned by [`State::from_bytes()`][crate::State::from_bytes()].
    #[derive(Debug, thiserror::Error)]
    #[allow(missing_docs)]
    pub enum Error {
        #[error(transparent)]
        Header(#[from] decode::header::Error),
        #[error("Could not parse entry at index {index}")]
        Entry { index: u32 },
| #[error("Mandatory extension wasn't implemented or malformed.")] |
        Extension(#[from] extension::decode::Error),
        #[error("Index trailer should have been {expected} bytes long, but was {actual}")]
        UnexpectedTrailerLength { expected: usize, actual: usize },
        #[error("Shared index checksum was {actual_checksum} but should have been {expected_checksum}")]
        ChecksumMismatch {
            actual_checksum: gix_hash::ObjectId,
            expected_checksum: gix_hash::ObjectId,
        },
    }
}
pub use error::Error;
use gix_features::parallel::InOrderIter;

use crate::util::read_u32;

/// Options to define how to decode an index state [from bytes][State::from_bytes()].
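///
/// # Example
///
/// A minimal sketch of filling in options; the values shown are illustrative, not recommendations.
///
/// ```
/// use gix_index::decode::Options;
///
/// let opts = Options {
///     // Allow up to 4 threads; `None` would use all logical cores.
///     thread_limit: Some(4),
///     ..Default::default()
/// };
/// # let _ = opts;
/// ```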
#[derive(Default, Clone, Copy)]
pub struct Options {
    /// The maximum number of threads to use. If `Some(N)` with `N > 0`, use no more than `N` threads.
    /// If `Some(0)` or `None`, use as many threads as there are logical cores.
    ///
    /// This applies to loading extensions in parallel with entries if the common `EOIE` extension is available.
    /// It also allows using multiple threads for loading entries if the `IEOT` extension is present.
    pub thread_limit: Option<usize>,
    /// The minimum size, in bytes, at which extensions will be loaded in their own thread, assuming
    /// enough threads (`num_threads`) are available.
    /// If set to 0, for example, extensions will always be read in their own thread if enough threads are available.
    pub min_extension_block_in_bytes_for_threading: usize,
    /// The expected checksum of this index, used when it is read as part of a `link` extension.
    ///
    /// We will abort reading this file if it doesn't match.
    pub expected_checksum: Option<gix_hash::ObjectId>,
}

impl State {
    /// Decode an index state from `data` and store `timestamp` in the resulting instance for pass-through,
    /// assuming `object_hash` to be used throughout the file. Also return the stored checksum over all bytes
    /// in `data`, or `None` if none was written due to `index.skipHash`.
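    ///
    /// # Example
    ///
    /// A minimal sketch of decoding an index read from disk; the path and hash kind are illustrative.
    ///
    /// ```no_run
    /// use filetime::FileTime;
    ///
    /// let path = ".git/index";
    /// let data = std::fs::read(path)?;
    /// let timestamp = FileTime::from_last_modification_time(&std::fs::metadata(path)?);
    /// let (state, checksum) =
    ///     gix_index::State::from_bytes(&data, timestamp, gix_hash::Kind::Sha1, Default::default())?;
    /// # let _ = (state, checksum);
    /// # Ok::<_, Box<dyn std::error::Error>>(())
    /// ```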
    pub fn from_bytes(
        data: &[u8],
        timestamp: FileTime,
        object_hash: gix_hash::Kind,
        Options {
            thread_limit,
            min_extension_block_in_bytes_for_threading,
            expected_checksum,
        }: Options,
    ) -> Result<(Self, Option<gix_hash::ObjectId>), Error> {
        let _span = gix_features::trace::detail!("gix_index::State::from_bytes()");
        let (version, num_entries, post_header_data) = header::decode(data, object_hash)?;
        let start_of_extensions = extension::end_of_index_entry::decode(data, object_hash);

        let mut num_threads = gix_features::parallel::num_threads(thread_limit);
        let path_backing_buffer_size = entries::estimate_path_storage_requirements_in_bytes(
            num_entries,
            data.len(),
            start_of_extensions,
            object_hash,
            version,
        );

        let (entries, ext, data) = match start_of_extensions {
            Some(offset) if num_threads > 1 => {
                let extensions_data = &data[offset..];
                let index_offsets_table = extension::index_entry_offset_table::find(extensions_data, object_hash);
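                // Run entry and extension decoding on separate threads: extensions start at `offset`,
                // and an IEOT table, if present, allows splitting entry decoding further below.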
                let (entries_res, ext_res) = gix_features::parallel::threads(|scope| {
                    let extension_loading =
                        (extensions_data.len() > min_extension_block_in_bytes_for_threading).then({
                            num_threads -= 1;
                            || {
                                gix_features::parallel::build_thread()
                                    .name("gix-index.from_bytes.load-extensions".into())
                                    .spawn_scoped(scope, || extension::decode::all(extensions_data, object_hash))
                                    .expect("valid name")
                            }
                        });
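                    // With an IEOT table, split the entry offsets into one chunk per remaining thread;
                    // each chunk decodes into its own entry list and path backing, merged in order below.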
                    let entries_res = match index_offsets_table {
                        Some(entry_offsets) => {
                            let chunk_size = (entry_offsets.len() as f32 / num_threads as f32).ceil() as usize;
                            let num_chunks = entry_offsets.chunks(chunk_size).count();
                            let mut threads = Vec::with_capacity(num_chunks);
                            for (id, chunks) in entry_offsets.chunks(chunk_size).enumerate() {
                                let chunks = chunks.to_vec();
                                threads.push(
                                    gix_features::parallel::build_thread()
                                        .name(format!("gix-index.from_bytes.read-entries.{id}"))
                                        .spawn_scoped(scope, move || {
                                            let num_entries_for_chunks =
                                                chunks.iter().map(|c| c.num_entries).sum::<u32>() as usize;
                                            let mut entries = Vec::with_capacity(num_entries_for_chunks);
                                            let path_backing_buffer_size_for_chunks =
                                                entries::estimate_path_storage_requirements_in_bytes(
                                                    num_entries_for_chunks as u32,
                                                    data.len() / num_chunks,
                                                    start_of_extensions.map(|ofs| ofs / num_chunks),
                                                    object_hash,
                                                    version,
                                                );
                                            let mut path_backing =
                                                Vec::with_capacity(path_backing_buffer_size_for_chunks);
                                            let mut is_sparse = false;
                                            for offset in chunks {
                                                let (
                                                    entries::Outcome {
                                                        is_sparse: chunk_is_sparse,
                                                    },
                                                    _data,
                                                ) = entries::chunk(
                                                    &data[offset.from_beginning_of_file as usize..],
                                                    &mut entries,
                                                    &mut path_backing,
                                                    offset.num_entries,
                                                    object_hash,
                                                    version,
                                                )?;
                                                is_sparse |= chunk_is_sparse;
                                            }
                                            Ok::<_, Error>((
                                                id,
                                                EntriesOutcome {
                                                    entries,
                                                    path_backing,
                                                    is_sparse,
                                                },
                                            ))
                                        })
                                        .expect("valid name"),
                                );
                            }
                            let mut results =
                                InOrderIter::from(threads.into_iter().map(|thread| thread.join().unwrap()));
                            let mut acc = results.next().expect("have at least one result, one per thread");
                            // We explicitly don't adjust the reserve in `acc` and rather allow for more copying
                            // to happen as vectors grow, to keep the peak memory size low.
                            // NOTE: one day, we might use a memory pool for paths. We could encode the block of memory
                            // in some bytes in the path offset. That way there is more indirection/slower access
                            // to the path, but it would save time here.
                            // As it stands, `git` is definitely more efficient at this and probably uses less memory too.
                            // Maybe benchmarks can tell if that is noticeable later at 200/400GB/s memory bandwidth, or maybe just
                            // 100GB/s on a single core.
                            while let (Ok(lhs), Some(res)) = (acc.as_mut(), results.next()) {
                                match res {
                                    Ok(rhs) => {
                                        lhs.is_sparse |= rhs.is_sparse;
                                        let ofs = lhs.path_backing.len();
                                        lhs.path_backing.extend(rhs.path_backing);
                                        lhs.entries.extend(rhs.entries.into_iter().map(|mut e| {
                                            e.path.start += ofs;
                                            e.path.end += ofs;
                                            e
                                        }));
                                    }
                                    Err(err) => {
                                        acc = Err(err);
                                    }
                                }
                            }
                            acc.map(|acc| (acc, &data[data.len() - object_hash.len_in_bytes()..]))
                        }
                        None => entries(
                            post_header_data,
                            path_backing_buffer_size,
                            num_entries,
                            object_hash,
                            version,
                        ),
                    };
                    let ext_res = extension_loading.map_or_else(
                        || extension::decode::all(extensions_data, object_hash),
                        |thread| thread.join().unwrap(),
                    );
                    (entries_res, ext_res)
                });
                let (ext, data) = ext_res?;
                (entries_res?.0, ext, data)
            }
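            // Single-threaded path: there is no EOIE extension, or only one thread is available.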
            None | Some(_) => {
                let (entries, data) = entries(
                    post_header_data,
                    path_backing_buffer_size,
                    num_entries,
                    object_hash,
                    version,
                )?;
                let (ext, data) = extension::decode::all(data, object_hash)?;
                (entries, ext, data)
            }
        };

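        // Whatever remains after the extensions must be the trailing checksum.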
        if data.len() != object_hash.len_in_bytes() {
            return Err(Error::UnexpectedTrailerLength {
                expected: object_hash.len_in_bytes(),
                actual: data.len(),
            });
        }

        let checksum = gix_hash::ObjectId::from_bytes_or_panic(data);
        let checksum = (!checksum.is_null()).then_some(checksum);
        if let Some((expected_checksum, actual_checksum)) = expected_checksum.zip(checksum) {
            if actual_checksum != expected_checksum {
                return Err(Error::ChecksumMismatch {
                    actual_checksum,
                    expected_checksum,
                });
            }
        }
        let EntriesOutcome {
            entries,
            path_backing,
            mut is_sparse,
        } = entries;
        let extension::decode::Outcome {
            tree,
            link,
            resolve_undo,
            untracked,
            fs_monitor,
            is_sparse: is_sparse_from_ext, // a marker is needed in case there are no directories
        } = ext;
        is_sparse |= is_sparse_from_ext;

        Ok((
            State {
                object_hash,
                timestamp,
                version,
                entries,
                path_backing,
                is_sparse,

                tree,
                link,
                resolve_undo,
                untracked,
                fs_monitor,
            },
            checksum,
        ))
    }
}

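/// The accumulated result of decoding all index entries, with entry paths stored out-of-band in `path_backing`.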
struct EntriesOutcome {
    pub entries: Vec<Entry>,
    pub path_backing: Vec<u8>,
    pub is_sparse: bool,
}

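/// Decode all `num_entries` entries from `post_header_data` on the current thread,
/// returning the outcome along with the remaining bytes.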
fn entries(
    post_header_data: &[u8],
    path_backing_buffer_size: usize,
    num_entries: u32,
    object_hash: gix_hash::Kind,
    version: Version,
) -> Result<(EntriesOutcome, &[u8]), Error> {
    let mut entries = Vec::with_capacity(num_entries as usize);
    let mut path_backing = Vec::with_capacity(path_backing_buffer_size);
    entries::chunk(
        post_header_data,
        &mut entries,
        &mut path_backing,
        num_entries,
        object_hash,
        version,
    )
    .map(|(entries::Outcome { is_sparse }, data): (entries::Outcome, &[u8])| {
        (
            EntriesOutcome {
                entries,
                path_backing,
                is_sparse,
            },
            data,
        )
    })
}

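/// Decode a `stat_data` record: ctime and mtime (seconds and nanosecond fractions each),
/// followed by `dev`, `ino`, `uid`, `gid` and `size`, each read as a big-endian `u32`.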
pub(crate) fn stat(data: &[u8]) -> Option<(entry::Stat, &[u8])> {
    let (ctime_secs, data) = read_u32(data)?;
    let (ctime_nsecs, data) = read_u32(data)?;
    let (mtime_secs, data) = read_u32(data)?;
    let (mtime_nsecs, data) = read_u32(data)?;
    let (dev, data) = read_u32(data)?;
    let (ino, data) = read_u32(data)?;
    let (uid, data) = read_u32(data)?;
    let (gid, data) = read_u32(data)?;
    let (size, data) = read_u32(data)?;
    Some((
        entry::Stat {
            mtime: entry::stat::Time {
                secs: mtime_secs,
                nsecs: mtime_nsecs,
            },
            ctime: entry::stat::Time {
                secs: ctime_secs,
                nsecs: ctime_nsecs,
            },
            dev,
            ino,
            uid,
            gid,
            size,
        },
        data,
    ))
}