Snap for 10453563 from a00441b1ee7ea275114309268c00e3b701b118f7 to mainline-permission-release

Change-Id: If2d27eac261945f37eb7cacb515f04e62a655065
diff --git a/.cargo_vcs_info.json b/.cargo_vcs_info.json
index c8aa4e4..34dd103 100644
--- a/.cargo_vcs_info.json
+++ b/.cargo_vcs_info.json
@@ -1,5 +1,6 @@
 {
   "git": {
-    "sha1": "ebcb09b1dc53211c6b5abdf4dc5b40e4bcd0a965"
-  }
-}
+    "sha1": "6236214d717694917e77aa1c16d91176b9bc2fff"
+  },
+  "path_in_vcs": ""
+}
\ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 1f36aa2..1a7835a 100644
--- a/Android.bp
+++ b/Android.bp
@@ -42,16 +42,17 @@
     host_supported: true,
     crate_name: "rayon",
     cargo_env_compat: true,
-    cargo_pkg_version: "1.5.1",
+    cargo_pkg_version: "1.7.0",
     srcs: ["src/lib.rs"],
-    edition: "2018",
-    cfgs: [
-        "min_const_generics",
-        "step_by",
-    ],
+    edition: "2021",
     rustlibs: [
-        "libcrossbeam_deque",
         "libeither",
         "librayon_core",
     ],
+    apex_available: [
+        "//apex_available:platform",
+        "//apex_available:anyapex",
+    ],
+    product_available: true,
+    vendor_available: true,
 }
diff --git a/Cargo.toml b/Cargo.toml
index 3b8921d..9b0e7b6 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,49 +3,50 @@
 # When uploading crates to the registry Cargo will automatically
 # "normalize" Cargo.toml files for maximal compatibility
 # with all versions of Cargo and also rewrite `path` dependencies
-# to registry (e.g., crates.io) dependencies
+# to registry (e.g., crates.io) dependencies.
 #
-# If you believe there's an error in this file please file an
-# issue against the rust-lang/cargo repository. If you're
-# editing this file be aware that the upstream Cargo.toml
-# will likely look very different (and much more reasonable)
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
 
 [package]
-edition = "2018"
+edition = "2021"
+rust-version = "1.59"
 name = "rayon"
-version = "1.5.1"
-authors = ["Niko Matsakis <niko@alum.mit.edu>", "Josh Stone <cuviper@gmail.com>"]
-exclude = ["/ci/*", "/scripts/*", "/.github/*", "/bors.toml"]
+version = "1.7.0"
+authors = [
+    "Niko Matsakis <niko@alum.mit.edu>",
+    "Josh Stone <cuviper@gmail.com>",
+]
+exclude = [
+    "/ci/*",
+    "/scripts/*",
+    "/.github/*",
+    "/bors.toml",
+]
 description = "Simple work-stealing parallelism for Rust"
 documentation = "https://docs.rs/rayon/"
 readme = "README.md"
-keywords = ["parallel", "thread", "concurrency", "join", "performance"]
+keywords = [
+    "parallel",
+    "thread",
+    "concurrency",
+    "join",
+    "performance",
+]
 categories = ["concurrency"]
-license = "Apache-2.0/MIT"
+license = "MIT OR Apache-2.0"
 repository = "https://github.com/rayon-rs/rayon"
-[dependencies.crossbeam-deque]
-version = "0.8.0"
 
 [dependencies.either]
 version = "1.0"
 default-features = false
 
 [dependencies.rayon-core]
-version = "1.9.1"
-[dev-dependencies.docopt]
-version = "1"
-
-[dev-dependencies.lazy_static]
-version = "1"
+version = "1.11.0"
 
 [dev-dependencies.rand]
 version = "0.8"
 
 [dev-dependencies.rand_xorshift]
 version = "0.3"
-
-[dev-dependencies.serde]
-version = "1.0.85"
-features = ["derive"]
-[build-dependencies.autocfg]
-version = "1"
diff --git a/Cargo.toml.orig b/Cargo.toml.orig
index 6ab9dda..a6ccc97 100644
--- a/Cargo.toml.orig
+++ b/Cargo.toml.orig
@@ -1,12 +1,12 @@
 [package]
 name = "rayon"
-# Reminder to update html_rool_url in lib.rs when updating version
-version = "1.5.1"
+version = "1.7.0"
 authors = ["Niko Matsakis <niko@alum.mit.edu>",
            "Josh Stone <cuviper@gmail.com>"]
 description = "Simple work-stealing parallelism for Rust"
-edition = "2018"
-license = "Apache-2.0/MIT"
+rust-version = "1.59"
+edition = "2021"
+license = "MIT OR Apache-2.0"
 repository = "https://github.com/rayon-rs/rayon"
 documentation = "https://docs.rs/rayon/"
 readme = "README.md"
@@ -19,8 +19,7 @@
 exclude = ["ci"]
 
 [dependencies]
-rayon-core = { version = "1.9.1", path = "rayon-core" }
-crossbeam-deque = "0.8.0"
+rayon-core = { version = "1.11.0", path = "rayon-core" }
 
 # This is a public dependency!
 [dependencies.either]
@@ -28,14 +27,5 @@
 default-features = false
 
 [dev-dependencies]
-docopt = "1"
-lazy_static = "1"
 rand = "0.8"
 rand_xorshift = "0.3"
-
-[dev-dependencies.serde]
-version = "1.0.85"
-features = ["derive"]
-
-[build-dependencies]
-autocfg = "1"
diff --git a/METADATA b/METADATA
index ddf9f80..94b529f 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,7 @@
+# This project was upgraded with external_updater.
+# Usage: tools/external_updater/updater.sh update rust/crates/rayon
+# For more info, check https://cs.android.com/android/platform/superproject/+/master:tools/external_updater/README.md
+
 name: "rayon"
 description: "Simple work-stealing parallelism for Rust"
 third_party {
@@ -7,13 +11,13 @@
   }
   url {
     type: ARCHIVE
-    value: "https://static.crates.io/crates/rayon/rayon-1.5.1.crate"
+    value: "https://static.crates.io/crates/rayon/rayon-1.7.0.crate"
   }
-  version: "1.5.1"
+  version: "1.7.0"
   license_type: NOTICE
   last_upgrade_date {
-    year: 2021
-    month: 5
-    day: 19
+    year: 2023
+    month: 4
+    day: 3
   }
 }
diff --git a/README.md b/README.md
index 4c4dff6..7f925bc 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 [![Rayon crate](https://img.shields.io/crates/v/rayon.svg)](https://crates.io/crates/rayon)
 [![Rayon documentation](https://docs.rs/rayon/badge.svg)](https://docs.rs/rayon)
-![minimum rustc 1.36](https://img.shields.io/badge/rustc-1.36+-red.svg)
+![minimum rustc 1.59](https://img.shields.io/badge/rustc-1.59+-red.svg)
 [![build status](https://github.com/rayon-rs/rayon/workflows/master/badge.svg)](https://github.com/rayon-rs/rayon/actions)
 [![Join the chat at https://gitter.im/rayon-rs/Lobby](https://badges.gitter.im/rayon-rs/Lobby.svg)](https://gitter.im/rayon-rs/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
@@ -13,7 +13,7 @@
 and details about how it works, or [this video][video], from the Rust
 Belt Rust conference.) Rayon is
 [available on crates.io](https://crates.io/crates/rayon), and
-[API Documentation is available on docs.rs](https://docs.rs/rayon/).
+[API documentation is available on docs.rs](https://docs.rs/rayon).
 
 [blog]: https://smallcultfollowing.com/babysteps/blog/2015/12/18/rayon-data-parallelism-in-rust/
 [video]: https://www.youtube.com/watch?v=gof_OEv71Aw
@@ -71,12 +71,12 @@
 
 ```toml
 [dependencies]
-rayon = "1.5"
+rayon = "1.7"
 ```
 
-To use the Parallel Iterator APIs, a number of traits have to be in
+To use the parallel iterator APIs, a number of traits have to be in
 scope. The easiest way to bring those things into scope is to use the
-[Rayon prelude](https://docs.rs/rayon/*/rayon/prelude/index.html).  In
+[Rayon prelude](https://docs.rs/rayon/*/rayon/prelude/index.html). In
 each module where you would like to use the parallel iterator APIs,
 just add:
 
@@ -84,17 +84,37 @@
 use rayon::prelude::*;
 ```
 
-Rayon currently requires `rustc 1.36.0` or greater.
+Rayon currently requires `rustc 1.59.0` or greater.
+
+### Usage with WebAssembly
+
+Rayon can work on the Web via WebAssembly, but requires an adapter and
+some project configuration to account for differences between
+WebAssembly threads and threads on the other platforms.
+
+Check out the
+[wasm-bindgen-rayon](https://github.com/GoogleChromeLabs/wasm-bindgen-rayon)
+docs for more details.
 
 ## Contribution
 
-Rayon is an open source project! If you'd like to contribute to Rayon, check out [the list of "help wanted" issues](https://github.com/rayon-rs/rayon/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22). These are all (or should be) issues that are suitable for getting started, and they generally include a detailed set of instructions for what to do. Please ask questions if anything is unclear! Also, check out the [Guide to Development](https://github.com/rayon-rs/rayon/wiki/Guide-to-Development) page on the wiki. Note that all code submitted in PRs to Rayon is assumed to [be licensed under Rayon's dual MIT/Apache2 licensing](https://github.com/rayon-rs/rayon/blob/master/README.md#license).
+Rayon is an open source project! If you'd like to contribute to Rayon,
+check out
+[the list of "help wanted" issues](https://github.com/rayon-rs/rayon/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22).
+These are all (or should be) issues that are suitable for getting
+started, and they generally include a detailed set of instructions for
+what to do. Please ask questions if anything is unclear! Also, check
+out the
+[Guide to Development](https://github.com/rayon-rs/rayon/wiki/Guide-to-Development)
+page on the wiki. Note that all code submitted in PRs to Rayon is
+assumed to
+[be licensed under Rayon's dual MIT/Apache 2.0 licensing](https://github.com/rayon-rs/rayon/blob/master/README.md#license).
 
 ## Quick demo
 
 To see Rayon in action, check out the `rayon-demo` directory, which
 includes a number of demos of code using Rayon. For example, run this
-command to get a visualization of an nbody simulation. To see the
+command to get a visualization of an N-body simulation. To see the
 effect of using Rayon, press `s` to run sequentially and `p` to run in
 parallel.
 
@@ -120,5 +140,5 @@
 
 Rayon is distributed under the terms of both the MIT license and the
 Apache License (Version 2.0). See [LICENSE-APACHE](LICENSE-APACHE) and
-[LICENSE-MIT](LICENSE-MIT) for details. Opening a pull requests is
+[LICENSE-MIT](LICENSE-MIT) for details. Opening a pull request is
 assumed to signal agreement with these licensing terms.
diff --git a/RELEASES.md b/RELEASES.md
index 0299436..28b476d 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -1,3 +1,106 @@
+# Release rayon 1.7.0 / rayon-core 1.11.0 (2023-03-03)
+
+- The minimum supported `rustc` is now 1.59.
+- Added a fallback when threading is unsupported.
+- The new `ParallelIterator::take_any` and `skip_any` methods work like
+  unordered `IndexedParallelIterator::take` and `skip`, counting items in
+  whatever order they are visited in parallel.
+- The new `ParallelIterator::take_any_while` and `skip_any_while` methods work
+  like unordered `Iterator::take_while` and `skip_while`, which previously had
+  no parallel equivalent. The "while" condition may be satisfied from anywhere
+  in the parallel iterator, affecting all future items regardless of position.
+- The new `yield_now` and `yield_local` functions will cooperatively yield
+  execution to Rayon, either trying to execute pending work from the entire
+  pool or from just the local deques of the current thread, respectively.
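+
+An illustrative sketch (not part of the upstream release notes) of the new
+unordered `take_any`/`take_any_while` methods named above:
+
+```rust
+use rayon::prelude::*;
+
+fn main() {
+    // Grab up to 100 even numbers in whatever order the threads find them.
+    let some_evens: Vec<i32> = (0..10_000)
+        .into_par_iter()
+        .filter(|x| x % 2 == 0)
+        .take_any(100)
+        .collect();
+    assert_eq!(some_evens.len(), 100);
+
+    // Once the predicate returns false anywhere, no further items are taken;
+    // every yielded item did satisfy the predicate, but positions are unordered.
+    let bounded: Vec<i32> = (0..10_000)
+        .into_par_iter()
+        .take_any_while(|&x| x < 5_000)
+        .collect();
+    assert!(bounded.iter().all(|&x| x < 5_000));
+}
+```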
+
+# Release rayon-core 1.10.2 (2023-01-22)
+
+- Fixed miri-reported UB for SharedReadOnly tags protected by a call.
+
+# Release rayon 1.6.1 (2022-12-09)
+
+- Simplified `par_bridge` to only pull one item at a time from the iterator,
+  without batching. Threads that are waiting for iterator items will now block
+  appropriately rather than spinning CPU. (Thanks @njaard!)
+- Added protection against recursion in `par_bridge`, so iterators that also
+  invoke rayon will not cause mutex recursion deadlocks.
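+
+A minimal sketch (not from the upstream notes) of `par_bridge`, the adapter
+whose pulling behavior changed in this release:
+
+```rust
+use rayon::iter::ParallelBridge;
+use rayon::prelude::*;
+
+fn main() {
+    // Any Send sequential iterator can be bridged into a parallel one;
+    // items are now pulled one at a time rather than in batches.
+    let total: u64 = (1..=1_000u64).into_iter().par_bridge().sum();
+    assert_eq!(total, 500_500);
+}
+```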
+
+# Release rayon-core 1.10.1 (2022-11-18)
+
+- Fixed a race condition with threads going to sleep while a broadcast starts.
+
+# Release rayon 1.6.0 / rayon-core 1.10.0 (2022-11-18)
+
+- The minimum supported `rustc` is now 1.56.
+- The new `IndexedParallelIterator::fold_chunks` and `fold_chunks_with` methods
+  work like `ParallelIterator::fold` and `fold_with` with fixed-size chunks of
+  items. This may be useful for predictable batching performance, without the
+  allocation overhead of `IndexedParallelIterator::chunks`.
+- New "broadcast" methods run a given function on all threads in the pool.
+  These run at a sort of reduced priority after each thread has exhausted its
+  local work queue, but before it attempts work-stealing from other threads.
+  - The global `broadcast` function and `ThreadPool::broadcast` method will
+    block until completion, returning a `Vec` of all return values.
+  - The global `spawn_broadcast` function and methods on `ThreadPool`, `Scope`,
+    and `ScopeFifo` will run detached, without blocking the current thread.
+- Panicking methods now use `#[track_caller]` to report the caller's location.
+- Fixed a truncated length in `vec::Drain` when given an empty range.
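+
+An illustrative sketch (not from the upstream notes) of `fold_chunks` and the
+blocking `broadcast` entry point described above:
+
+```rust
+use rayon::prelude::*;
+
+fn main() {
+    // Sum fixed-size chunks of an indexed iterator without per-chunk Vec allocation.
+    let nums: Vec<u32> = (1..=10).collect();
+    let chunk_sums: Vec<u32> = nums
+        .into_par_iter()
+        .fold_chunks(3, || 0, |acc, n| acc + n)
+        .collect();
+    assert_eq!(chunk_sums, vec![6, 15, 24, 10]);
+
+    // Run a closure once on every thread in the global pool and gather the results.
+    let indices: Vec<usize> = rayon::broadcast(|ctx| ctx.index());
+    assert_eq!(indices.len(), rayon::current_num_threads());
+}
+```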
+
+## Contributors
+
+Thanks to all of the contributors for this release!
+
+- @cuviper
+- @idanmuze
+- @JoeyBF
+- @JustForFun88
+- @kianmeng
+- @kornelski
+- @ritchie46
+- @ryanrussell
+- @steffahn
+- @TheIronBorn
+- @willcrozi
+
+# Release rayon 1.5.3 (2022-05-13)
+
+- The new `ParallelSliceMut::par_sort_by_cached_key` is a stable sort that caches
+  the keys for each item -- a parallel version of `slice::sort_by_cached_key`.
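+
+A small sketch (not from the upstream notes) of the new cached-key sort:
+
+```rust
+use rayon::prelude::*;
+
+fn main() {
+    // The key function runs once per element; keys are cached rather than
+    // recomputed on every comparison.
+    let mut words = vec!["greetings", "hi", "hello"];
+    words.par_sort_by_cached_key(|s| s.len());
+    assert_eq!(words, vec!["hi", "hello", "greetings"]);
+}
+```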
+
+# Release rayon-core 1.9.3 (2022-05-13)
+
+- Fixed a use-after-free race in job notification.
+
+# Release rayon 1.5.2 / rayon-core 1.9.2 (2022-04-13)
+
+- The new `ParallelSlice::par_rchunks()` and `par_rchunks_exact()` iterate
+  slice chunks in reverse, aligned against the end of the slice if the
+  length is not a perfect multiple of the chunk size. The new
+  `ParallelSliceMut::par_rchunks_mut()` and `par_rchunks_exact_mut()` are the
+  same for mutable slices.
+- The `ParallelIterator::try_*` methods now support `std::ops::ControlFlow` and
+  `std::task::Poll` items, mirroring the unstable `Try` implementations in the
+  standard library.
+- The `ParallelString` pattern-based methods now support `&[char]` patterns,
+  which match when any character in that slice is found in the string.
+- A soft limit is now enforced on the number of threads allowed in a single
+  thread pool, respecting internal bit limits that already existed. The current
+  maximum is publicly available from the new function `max_num_threads()`.
+- Fixed several Stacked Borrow and provenance issues found by `cargo miri`.
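+
+An illustrative sketch (not from the upstream notes) of the reverse chunk
+iterators added in this release:
+
+```rust
+use rayon::prelude::*;
+
+fn main() {
+    // Chunks are aligned to the end of the slice, so any remainder is at the front.
+    let data: Vec<i32> = (1..=5).collect();
+    let rchunks: Vec<&[i32]> = data.par_rchunks(2).collect();
+    assert_eq!(rchunks, vec![&[4, 5][..], &[2, 3], &[1]]);
+
+    // The *_exact variant drops the short remainder entirely.
+    let exact: Vec<&[i32]> = data.par_rchunks_exact(2).collect();
+    assert_eq!(exact, vec![&[4, 5][..], &[2, 3]]);
+}
+```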
+
+## Contributors
+
+Thanks to all of the contributors for this release!
+
+- @atouchet
+- @bluss
+- @cuviper
+- @fzyzcjy
+- @nyanzebra
+- @paolobarbolini
+- @RReverser
+- @saethlin
+
 # Release rayon 1.5.1 / rayon-core 1.9.1 (2021-05-18)
 
 - The new `in_place_scope` and `in_place_scope_fifo` are variations of `scope`
diff --git a/TEST_MAPPING b/TEST_MAPPING
index 3cbd48d..55384f0 100644
--- a/TEST_MAPPING
+++ b/TEST_MAPPING
@@ -5,6 +5,9 @@
       "path": "external/rust/crates/base64"
     },
     {
+      "path": "external/rust/crates/hashbrown"
+    },
+    {
       "path": "external/rust/crates/tinytemplate"
     },
     {
diff --git a/build.rs b/build.rs
deleted file mode 100644
index e52e326..0000000
--- a/build.rs
+++ /dev/null
@@ -1,9 +0,0 @@
-fn main() {
-    let ac = autocfg::new();
-    if ac.probe_expression("(0..10).step_by(2).rev()") {
-        autocfg::emit("step_by");
-    }
-    if ac.probe_expression("{ fn foo<const N: usize>() {} }") {
-        autocfg::emit("min_const_generics");
-    }
-}
diff --git a/examples/README.md b/examples/README.md
deleted file mode 100644
index ec08a4d..0000000
--- a/examples/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-We use this directory for interactive tests that can't be run in an
-automatic fashion. For examples of how to use Rayon, or benchmarks,
-see `rayon-demo`.
diff --git a/examples/cpu_monitor.rs b/examples/cpu_monitor.rs
deleted file mode 100644
index bbe1316..0000000
--- a/examples/cpu_monitor.rs
+++ /dev/null
@@ -1,81 +0,0 @@
-use docopt::Docopt;
-use std::io;
-use std::process;
-
-const USAGE: &str = "
-Usage: cpu_monitor [options] <scenario>
-       cpu_monitor --help
-
-A test for monitoring how much CPU usage Rayon consumes under various
-scenarios. This test is intended to be executed interactively, like so:
-
-    cargo run --example cpu_monitor -- tasks_ended
-
-The list of scenarios you can try are as follows:
-
-- tasks_ended: after all tasks have finished, go to sleep
-- task_stall_root: a root task stalls for a very long time
-- task_stall_scope: a task in a scope stalls for a very long time
-
-Options:
-    -h, --help                   Show this message.
-    -d N, --depth N              Control how hard the dummy task works [default: 27]
-";
-
-#[derive(serde::Deserialize)]
-pub struct Args {
-    arg_scenario: String,
-    flag_depth: usize,
-}
-
-fn main() {
-    let args: &Args = &Docopt::new(USAGE)
-        .and_then(|d| d.deserialize())
-        .unwrap_or_else(|e| e.exit());
-
-    match &args.arg_scenario[..] {
-        "tasks_ended" => tasks_ended(args),
-        "task_stall_root" => task_stall_root(args),
-        "task_stall_scope" => task_stall_scope(args),
-        _ => {
-            println!("unknown scenario: `{}`", args.arg_scenario);
-            println!("try --help");
-            process::exit(1);
-        }
-    }
-}
-
-fn wait_for_user() {
-    let mut input = String::new();
-    io::stdin().read_line(&mut input).unwrap();
-}
-
-fn task(args: &Args) {
-    fn join_recursively(n: usize) {
-        if n == 0 {
-            return;
-        }
-        rayon::join(|| join_recursively(n - 1), || join_recursively(n - 1));
-    }
-
-    println!("Starting heavy work at depth {}...wait.", args.flag_depth);
-    join_recursively(args.flag_depth);
-    println!("Heavy work done; check top. You should see CPU usage drop to zero soon.");
-    println!("Press <enter> to quit...");
-}
-
-fn tasks_ended(args: &Args) {
-    task(args);
-    wait_for_user();
-}
-
-fn task_stall_root(args: &Args) {
-    rayon::join(|| task(args), wait_for_user);
-}
-
-fn task_stall_scope(args: &Args) {
-    rayon::scope(|scope| {
-        scope.spawn(move |_| task(args));
-        scope.spawn(move |_| wait_for_user());
-    });
-}
diff --git a/src/array.rs b/src/array.rs
index 7922cd6..32a5fdd 100644
--- a/src/array.rs
+++ b/src/array.rs
@@ -1,11 +1,8 @@
-#![cfg(min_const_generics)]
 //! Parallel iterator types for [arrays] (`[T; N]`)
 //!
 //! You will rarely need to interact with this module directly unless you need
 //! to name one of the iterator types.
 //!
-//! Everything in this module requires const generics, stabilized in Rust 1.51.
-//!
 //! [arrays]: https://doc.rust-lang.org/std/primitive.array.html
 
 use crate::iter::plumbing::*;
@@ -14,7 +11,6 @@
 use crate::vec::DrainProducer;
 use std::mem::ManuallyDrop;
 
-/// This implementation requires const generics, stabilized in Rust 1.51.
 impl<'data, T: Sync + 'data, const N: usize> IntoParallelIterator for &'data [T; N] {
     type Item = &'data T;
     type Iter = Iter<'data, T>;
@@ -24,7 +20,6 @@
     }
 }
 
-/// This implementation requires const generics, stabilized in Rust 1.51.
 impl<'data, T: Send + 'data, const N: usize> IntoParallelIterator for &'data mut [T; N] {
     type Item = &'data mut T;
     type Iter = IterMut<'data, T>;
@@ -34,7 +29,6 @@
     }
 }
 
-/// This implementation requires const generics, stabilized in Rust 1.51.
 impl<T: Send, const N: usize> IntoParallelIterator for [T; N] {
     type Item = T;
     type Iter = IntoIter<T, N>;
@@ -84,7 +78,8 @@
         unsafe {
             // Drain every item, and then the local array can just fall out of scope.
             let mut array = ManuallyDrop::new(self.array);
-            callback.callback(DrainProducer::new(&mut *array))
+            let producer = DrainProducer::new(array.as_mut_slice());
+            callback.callback(producer)
         }
     }
 }
diff --git a/src/collections/mod.rs b/src/collections/mod.rs
index d9b7988..3c99f32 100644
--- a/src/collections/mod.rs
+++ b/src/collections/mod.rs
@@ -56,7 +56,7 @@
         pub(super) fn new(collection: &'a mut C) -> Self {
             Self {
                 // Temporarily steal the inner `Vec` so we can drain in place.
-                vec: Vec::from(mem::replace(collection, C::default())),
+                vec: Vec::from(mem::take(collection)),
                 collection,
             }
         }
@@ -65,7 +65,7 @@
     impl<'a, T, C: From<Vec<T>>> Drop for DrainGuard<'a, T, C> {
         fn drop(&mut self) {
             // Restore the collection from the `Vec` with its original capacity.
-            *self.collection = C::from(mem::replace(&mut self.vec, Vec::new()));
+            *self.collection = C::from(mem::take(&mut self.vec));
         }
     }
 
diff --git a/src/compile_fail/cannot_collect_filtermap_data.rs b/src/compile_fail/cannot_collect_filtermap_data.rs
index 65ff875..bb62ce0 100644
--- a/src/compile_fail/cannot_collect_filtermap_data.rs
+++ b/src/compile_fail/cannot_collect_filtermap_data.rs
@@ -5,12 +5,10 @@
 // zip requires data of exact size, but filter yields only bounded
 // size, so check that we cannot apply it.
 
-fn main() {
-    let a: Vec<usize> = (0..1024).collect();
-    let mut v = vec![];
-    a.par_iter()
-     .filter_map(|&x| Some(x as f32))
-     .collect_into_vec(&mut v); //~ ERROR no method
-}
+let a: Vec<usize> = (0..1024).collect();
+let mut v = vec![];
+a.par_iter()
+    .filter_map(|&x| Some(x as f32))
+    .collect_into_vec(&mut v); //~ ERROR no method
 
 ``` */
diff --git a/src/compile_fail/cannot_zip_filtered_data.rs b/src/compile_fail/cannot_zip_filtered_data.rs
index de43fca..54fd50d 100644
--- a/src/compile_fail/cannot_zip_filtered_data.rs
+++ b/src/compile_fail/cannot_zip_filtered_data.rs
@@ -5,12 +5,10 @@
 // zip requires data of exact size, but filter yields only bounded
 // size, so check that we cannot apply it.
 
-fn main() {
-    let mut a: Vec<usize> = (0..1024).rev().collect();
-    let b: Vec<usize> = (0..1024).collect();
+let mut a: Vec<usize> = (0..1024).rev().collect();
+let b: Vec<usize> = (0..1024).collect();
 
-    a.par_iter()
-     .zip(b.par_iter().filter(|&&x| x > 3)); //~ ERROR
-}
+a.par_iter()
+    .zip(b.par_iter().filter(|&&x| x > 3)); //~ ERROR
 
 ``` */
diff --git a/src/compile_fail/cell_par_iter.rs b/src/compile_fail/cell_par_iter.rs
index 4af04b1..98a1cf9 100644
--- a/src/compile_fail/cell_par_iter.rs
+++ b/src/compile_fail/cell_par_iter.rs
@@ -5,11 +5,9 @@
 use rayon::prelude::*;
 use std::cell::Cell;
 
-fn main() {
-    let c = Cell::new(42_i32);
-    (0_i32..1024).into_par_iter()
-             .map(|_| c.get()) //~ ERROR E0277
-             .min();
-}
+let c = Cell::new(42_i32);
+(0_i32..1024).into_par_iter()
+    .map(|_| c.get()) //~ ERROR E0277
+    .min();
 
 ``` */
diff --git a/src/compile_fail/must_use.rs b/src/compile_fail/must_use.rs
index ac50a62..b4b3d77 100644
--- a/src/compile_fail/must_use.rs
+++ b/src/compile_fail/must_use.rs
@@ -33,6 +33,8 @@
     step_by             /** v.par_iter().step_by(2); */
     chain               /** v.par_iter().chain(&v); */
     chunks              /** v.par_iter().chunks(2); */
+    fold_chunks         /** v.par_iter().fold_chunks(2, || 0, |x, _| x); */
+    fold_chunks_with    /** v.par_iter().fold_chunks_with(2, 0, |x, _| x); */
     cloned              /** v.par_iter().cloned(); */
     copied              /** v.par_iter().copied(); */
     enumerate           /** v.par_iter().enumerate(); */
diff --git a/src/compile_fail/no_send_par_iter.rs b/src/compile_fail/no_send_par_iter.rs
index 1362c98..7573c03 100644
--- a/src/compile_fail/no_send_par_iter.rs
+++ b/src/compile_fail/no_send_par_iter.rs
@@ -10,13 +10,11 @@
 
 unsafe impl Sync for NoSend {}
 
-fn main() {
-    let x = Some(NoSend(null()));
+let x = Some(NoSend(null()));
 
-    x.par_iter()
-        .map(|&x| x) //~ ERROR
-        .count(); //~ ERROR
-}
+x.par_iter()
+    .map(|&x| x) //~ ERROR
+    .count(); //~ ERROR
 
 ``` */
 mod map {}
@@ -31,13 +29,11 @@
 
 unsafe impl Sync for NoSend {}
 
-fn main() {
-    let x = Some(NoSend(null()));
+let x = Some(NoSend(null()));
 
-    x.par_iter()
-        .filter_map(|&x| Some(x)) //~ ERROR
-        .count(); //~ ERROR
-}
+x.par_iter()
+    .filter_map(|&x| Some(x)) //~ ERROR
+    .count(); //~ ERROR
 
 ``` */
 mod filter_map {}
@@ -52,13 +48,11 @@
 
 unsafe impl Sync for NoSend {}
 
-fn main() {
-    let x = Some(NoSend(null()));
+let x = Some(NoSend(null()));
 
-    x.par_iter()
-        .cloned() //~ ERROR
-        .count(); //~ ERROR
-}
+x.par_iter()
+    .cloned() //~ ERROR
+    .count(); //~ ERROR
 
 ``` */
 mod cloned {}
diff --git a/src/compile_fail/rc_par_iter.rs b/src/compile_fail/rc_par_iter.rs
index feaedb3..aca63e7 100644
--- a/src/compile_fail/rc_par_iter.rs
+++ b/src/compile_fail/rc_par_iter.rs
@@ -6,12 +6,10 @@
 use rayon::prelude::*;
 use std::rc::Rc;
 
-fn main() {
-    let x = vec![Rc::new(22), Rc::new(23)];
-    let mut y = vec![];
-    x.into_par_iter() //~ ERROR no method named `into_par_iter`
-     .map(|rc| *rc)
-     .collect_into_vec(&mut y);
-}
+let x = vec![Rc::new(22), Rc::new(23)];
+let mut y = vec![];
+x.into_par_iter() //~ ERROR no method named `into_par_iter`
+    .map(|rc| *rc)
+    .collect_into_vec(&mut y);
 
 ``` */
diff --git a/src/delegate.rs b/src/delegate.rs
index a537489..2a6e331 100644
--- a/src/delegate.rs
+++ b/src/delegate.rs
@@ -8,15 +8,6 @@
 /// declared with an `inner` field.
 ///
 /// The implementation of `IntoParallelIterator` should be added separately.
-///
-/// # Example
-///
-/// ```
-/// delegate_iterator!{
-///     MyIntoIter<T, U> => (T, U),
-///     impl<T: Ord + Send, U: Send>
-/// }
-/// ```
 macro_rules! delegate_iterator {
     ($iter:ty => $item:ty ,
      impl $( $args:tt )*
@@ -68,3 +59,51 @@
         }
     }
 }
+
+#[test]
+fn unindexed_example() {
+    use crate::collections::btree_map::IntoIter;
+    use crate::iter::plumbing::*;
+    use crate::prelude::*;
+
+    use std::collections::BTreeMap;
+
+    struct MyIntoIter<T: Ord + Send, U: Send> {
+        inner: IntoIter<T, U>,
+    }
+
+    delegate_iterator! {
+        MyIntoIter<T, U> => (T, U),
+        impl<T: Ord + Send, U: Send>
+    }
+
+    let map = BTreeMap::from([(1, 'a'), (2, 'b'), (3, 'c')]);
+    let iter = MyIntoIter {
+        inner: map.into_par_iter(),
+    };
+    let vec: Vec<_> = iter.map(|(k, _)| k).collect();
+    assert_eq!(vec, &[1, 2, 3]);
+}
+
+#[test]
+fn indexed_example() {
+    use crate::iter::plumbing::*;
+    use crate::prelude::*;
+    use crate::vec::IntoIter;
+
+    struct MyIntoIter<T: Send> {
+        inner: IntoIter<T>,
+    }
+
+    delegate_indexed_iterator! {
+        MyIntoIter<T> => T,
+        impl<T: Send>
+    }
+
+    let iter = MyIntoIter {
+        inner: vec![1, 2, 3].into_par_iter(),
+    };
+    let mut vec = vec![];
+    iter.collect_into_vec(&mut vec);
+    assert_eq!(vec, &[1, 2, 3]);
+}
diff --git a/src/iter/chunks.rs b/src/iter/chunks.rs
index be5f84c..ec48278 100644
--- a/src/iter/chunks.rs
+++ b/src/iter/chunks.rs
@@ -90,38 +90,46 @@
             where
                 P: Producer<Item = T>,
             {
-                self.callback.callback(ChunkProducer {
-                    chunk_size: self.size,
-                    len: self.len,
-                    base,
-                })
+                let producer = ChunkProducer::new(self.size, self.len, base, Vec::from_iter);
+                self.callback.callback(producer)
             }
         }
     }
 }
 
-struct ChunkProducer<P>
-where
-    P: Producer,
-{
+pub(super) struct ChunkProducer<P, F> {
     chunk_size: usize,
     len: usize,
     base: P,
+    map: F,
 }
 
-impl<P> Producer for ChunkProducer<P>
+impl<P, F> ChunkProducer<P, F> {
+    pub(super) fn new(chunk_size: usize, len: usize, base: P, map: F) -> Self {
+        Self {
+            chunk_size,
+            len,
+            base,
+            map,
+        }
+    }
+}
+
+impl<P, F, T> Producer for ChunkProducer<P, F>
 where
     P: Producer,
+    F: Fn(P::IntoIter) -> T + Send + Clone,
 {
-    type Item = Vec<P::Item>;
-    type IntoIter = ChunkSeq<P>;
+    type Item = T;
+    type IntoIter = std::iter::Map<ChunkSeq<P>, F>;
 
     fn into_iter(self) -> Self::IntoIter {
-        ChunkSeq {
+        let chunks = ChunkSeq {
             chunk_size: self.chunk_size,
             len: self.len,
             inner: if self.len > 0 { Some(self.base) } else { None },
-        }
+        };
+        chunks.map(self.map)
     }
 
     fn split_at(self, index: usize) -> (Self, Self) {
@@ -132,11 +140,13 @@
                 chunk_size: self.chunk_size,
                 len: elem_index,
                 base: left,
+                map: self.map.clone(),
             },
             ChunkProducer {
                 chunk_size: self.chunk_size,
                 len: self.len - elem_index,
                 base: right,
+                map: self.map,
             },
         )
     }
@@ -150,7 +160,7 @@
     }
 }
 
-struct ChunkSeq<P> {
+pub(super) struct ChunkSeq<P> {
     chunk_size: usize,
     len: usize,
     inner: Option<P>,
@@ -160,7 +170,7 @@
 where
     P: Producer,
 {
-    type Item = Vec<P::Item>;
+    type Item = P::IntoIter;
 
     fn next(&mut self) -> Option<Self::Item> {
         let producer = self.inner.take()?;
@@ -168,11 +178,11 @@
             let (left, right) = producer.split_at(self.chunk_size);
             self.inner = Some(right);
             self.len -= self.chunk_size;
-            Some(left.into_iter().collect())
+            Some(left.into_iter())
         } else {
             debug_assert!(self.len > 0);
             self.len = 0;
-            Some(producer.into_iter().collect())
+            Some(producer.into_iter())
         }
     }
 
@@ -206,11 +216,11 @@
             let (left, right) = producer.split_at(self.len - size);
             self.inner = Some(left);
             self.len -= size;
-            Some(right.into_iter().collect())
+            Some(right.into_iter())
         } else {
             debug_assert!(self.len > 0);
             self.len = 0;
-            Some(producer.into_iter().collect())
+            Some(producer.into_iter())
         }
     }
 }
diff --git a/src/iter/collect/consumer.rs b/src/iter/collect/consumer.rs
index 3a8eea0..acd67df 100644
--- a/src/iter/collect/consumer.rs
+++ b/src/iter/collect/consumer.rs
@@ -1,19 +1,38 @@
 use super::super::plumbing::*;
+use crate::SendPtr;
 use std::marker::PhantomData;
-use std::mem::MaybeUninit;
 use std::ptr;
 use std::slice;
 
 pub(super) struct CollectConsumer<'c, T: Send> {
-    /// A slice covering the target memory, not yet initialized!
-    target: &'c mut [MaybeUninit<T>],
+    /// See `CollectResult` for explanation of why this is not a slice
+    start: SendPtr<T>,
+    len: usize,
+    marker: PhantomData<&'c mut T>,
+}
+
+impl<T: Send> CollectConsumer<'_, T> {
+    /// Create a collector for `len` items in the unused capacity of the vector.
+    pub(super) fn appender(vec: &mut Vec<T>, len: usize) -> CollectConsumer<'_, T> {
+        let start = vec.len();
+        assert!(vec.capacity() - start >= len);
+
+        // SAFETY: We already made sure to have the additional space allocated.
+        // The pointer is derived from `Vec` directly, not through a `Deref`,
+        // so it has provenance over the whole allocation.
+        unsafe { CollectConsumer::new(vec.as_mut_ptr().add(start), len) }
+    }
 }
 
 impl<'c, T: Send + 'c> CollectConsumer<'c, T> {
     /// The target memory is considered uninitialized, and will be
     /// overwritten without reading or dropping existing values.
-    pub(super) fn new(target: &'c mut [MaybeUninit<T>]) -> Self {
-        CollectConsumer { target }
+    unsafe fn new(start: *mut T, len: usize) -> Self {
+        CollectConsumer {
+            start: SendPtr(start),
+            len,
+            marker: PhantomData,
+        }
     }
 }
 
@@ -23,10 +42,13 @@
 /// the elements will be dropped, unless its ownership is released before then.
 #[must_use]
 pub(super) struct CollectResult<'c, T> {
-    /// A slice covering the target memory, initialized up to our separate `len`.
-    target: &'c mut [MaybeUninit<T>],
-    /// The current initialized length in `target`
-    len: usize,
+    /// This pointer and length have the same representation as a slice,
+    /// but retains the provenance of the entire array so that we can merge
+    /// these regions together in `CollectReducer`.
+    start: SendPtr<T>,
+    total_len: usize,
+    /// The current initialized length after `start`
+    initialized_len: usize,
     /// Lifetime invariance guarantees that the data flows from consumer to result,
     /// especially for the `scope_fn` callback in `Collect::with_consumer`.
     invariant_lifetime: PhantomData<&'c mut &'c mut [T]>,
@@ -37,25 +59,26 @@
 impl<'c, T> CollectResult<'c, T> {
     /// The current length of the collect result
     pub(super) fn len(&self) -> usize {
-        self.len
+        self.initialized_len
     }
 
     /// Release ownership of the slice of elements, and return the length
     pub(super) fn release_ownership(mut self) -> usize {
-        let ret = self.len;
-        self.len = 0;
+        let ret = self.initialized_len;
+        self.initialized_len = 0;
         ret
     }
 }
 
 impl<'c, T> Drop for CollectResult<'c, T> {
     fn drop(&mut self) {
-        // Drop the first `self.len` elements, which have been recorded
+        // Drop the first `self.initialized_len` elements, which have been recorded
         // to be initialized by the folder.
         unsafe {
-            // TODO: use `MaybeUninit::slice_as_mut_ptr`
-            let start = self.target.as_mut_ptr() as *mut T;
-            ptr::drop_in_place(slice::from_raw_parts_mut(start, self.len));
+            ptr::drop_in_place(slice::from_raw_parts_mut(
+                self.start.0,
+                self.initialized_len,
+            ));
         }
     }
 }
@@ -66,24 +89,27 @@
     type Result = CollectResult<'c, T>;
 
     fn split_at(self, index: usize) -> (Self, Self, CollectReducer) {
-        let CollectConsumer { target } = self;
+        let CollectConsumer { start, len, .. } = self;
 
-        // Produce new consumers. Normal slicing ensures that the
-        // memory range given to each consumer is disjoint.
-        let (left, right) = target.split_at_mut(index);
-        (
-            CollectConsumer::new(left),
-            CollectConsumer::new(right),
-            CollectReducer,
-        )
+        // Produce new consumers.
+        // SAFETY: This assert checks that `index` is a valid offset for `start`
+        unsafe {
+            assert!(index <= len);
+            (
+                CollectConsumer::new(start.0, index),
+                CollectConsumer::new(start.0.add(index), len - index),
+                CollectReducer,
+            )
+        }
     }
 
     fn into_folder(self) -> Self::Folder {
         // Create a result/folder that consumes values and writes them
-        // into target. The initial result has length 0.
+        // into the region after start. The initial result has length 0.
         CollectResult {
-            target: self.target,
-            len: 0,
+            start: self.start,
+            total_len: self.len,
+            initialized_len: 0,
             invariant_lifetime: PhantomData,
         }
     }
@@ -97,15 +123,17 @@
     type Result = Self;
 
     fn consume(mut self, item: T) -> Self {
-        let dest = self
-            .target
-            .get_mut(self.len)
-            .expect("too many values pushed to consumer");
+        assert!(
+            self.initialized_len < self.total_len,
+            "too many values pushed to consumer"
+        );
 
-        // Write item and increase the initialized length
+        // SAFETY: The assert above is a bounds check for this write, and we
+        // avoid assignment here so we do not drop an uninitialized T.
         unsafe {
-            dest.as_mut_ptr().write(item);
-            self.len += 1;
+            // Write item and increase the initialized length
+            self.start.0.add(self.initialized_len).write(item);
+            self.initialized_len += 1;
         }
 
         self
@@ -146,14 +174,13 @@
         // Merge if the CollectResults are adjacent and in left to right order
         // else: drop the right piece now and total length will end up short in the end,
         // when the correctness of the collected result is asserted.
-        let left_end = left.target[left.len..].as_ptr();
-        if left_end == right.target.as_ptr() {
-            let len = left.len + right.release_ownership();
-            unsafe {
-                left.target = slice::from_raw_parts_mut(left.target.as_mut_ptr(), len);
+        unsafe {
+            let left_end = left.start.0.add(left.initialized_len);
+            if left_end == right.start.0 {
+                left.total_len += right.total_len;
+                left.initialized_len += right.release_ownership();
             }
-            left.len = len;
+            left
         }
-        left
     }
 }
diff --git a/src/iter/collect/mod.rs b/src/iter/collect/mod.rs
index 7cbf215..4044a68 100644
--- a/src/iter/collect/mod.rs
+++ b/src/iter/collect/mod.rs
@@ -1,6 +1,4 @@
-use super::{IndexedParallelIterator, IntoParallelIterator, ParallelExtend, ParallelIterator};
-use std::mem::MaybeUninit;
-use std::slice;
+use super::{IndexedParallelIterator, ParallelIterator};
 
 mod consumer;
 use self::consumer::CollectConsumer;
@@ -19,7 +17,7 @@
 {
     v.truncate(0); // clear any old data
     let len = pi.len();
-    Collect::new(v, len).with_consumer(|consumer| pi.drive(consumer));
+    collect_with_consumer(v, len, |consumer| pi.drive(consumer));
 }
 
 /// Collects the results of the iterator into the specified vector.
@@ -33,12 +31,12 @@
 /// *any* `ParallelIterator` here, and `CollectConsumer` has to also implement
 /// `UnindexedConsumer`.  That implementation panics `unreachable!` in case
 /// there's a bug where we actually do try to use this unindexed.
-fn special_extend<I, T>(pi: I, len: usize, v: &mut Vec<T>)
+pub(super) fn special_extend<I, T>(pi: I, len: usize, v: &mut Vec<T>)
 where
     I: ParallelIterator<Item = T>,
     T: Send,
 {
-    Collect::new(v, len).with_consumer(|consumer| pi.drive_unindexed(consumer));
+    collect_with_consumer(v, len, |consumer| pi.drive_unindexed(consumer));
 }
 
 /// Unzips the results of the exact iterator into the specified vectors.
@@ -55,9 +53,9 @@
     right.truncate(0);
 
     let len = pi.len();
-    Collect::new(right, len).with_consumer(|right_consumer| {
+    collect_with_consumer(right, len, |right_consumer| {
         let mut right_result = None;
-        Collect::new(left, len).with_consumer(|left_consumer| {
+        collect_with_consumer(left, len, |left_consumer| {
             let (left_r, right_r) = unzip_indexed(pi, left_consumer, right_consumer);
             right_result = Some(right_r);
             left_r
@@ -66,108 +64,53 @@
     });
 }
 
-/// Manage the collection vector.
-struct Collect<'c, T: Send> {
-    vec: &'c mut Vec<T>,
-    len: usize,
-}
-
-impl<'c, T: Send + 'c> Collect<'c, T> {
-    fn new(vec: &'c mut Vec<T>, len: usize) -> Self {
-        Collect { vec, len }
-    }
-
-    /// Create a consumer on the slice of memory we are collecting into.
-    ///
-    /// The consumer needs to be used inside the scope function, and the
-    /// complete collect result passed back.
-    ///
-    /// This method will verify the collect result, and panic if the slice
-    /// was not fully written into. Otherwise, in the successful case,
-    /// the vector is complete with the collected result.
-    fn with_consumer<F>(mut self, scope_fn: F)
-    where
-        F: FnOnce(CollectConsumer<'_, T>) -> CollectResult<'_, T>,
-    {
-        let slice = Self::reserve_get_tail_slice(&mut self.vec, self.len);
-        let result = scope_fn(CollectConsumer::new(slice));
-
-        // The CollectResult represents a contiguous part of the
-        // slice, that has been written to.
-        // On unwind here, the CollectResult will be dropped.
-        // If some producers on the way did not produce enough elements,
-        // partial CollectResults may have been dropped without
-        // being reduced to the final result, and we will see
-        // that as the length coming up short.
-        //
-        // Here, we assert that `slice` is fully initialized. This is
-        // checked by the following assert, which verifies if a
-        // complete CollectResult was produced; if the length is
-        // correct, it is necessarily covering the target slice.
-        // Since we know that the consumer cannot have escaped from
-        // `drive` (by parametricity, essentially), we know that any
-        // stores that will happen, have happened. Unless some code is buggy,
-        // that means we should have seen `len` total writes.
-        let actual_writes = result.len();
-        assert!(
-            actual_writes == self.len,
-            "expected {} total writes, but got {}",
-            self.len,
-            actual_writes
-        );
-
-        // Release the result's mutable borrow and "proxy ownership"
-        // of the elements, before the vector takes it over.
-        result.release_ownership();
-
-        let new_len = self.vec.len() + self.len;
-
-        unsafe {
-            self.vec.set_len(new_len);
-        }
-    }
-
-    /// Reserve space for `len` more elements in the vector,
-    /// and return a slice to the uninitialized tail of the vector
-    fn reserve_get_tail_slice(vec: &mut Vec<T>, len: usize) -> &mut [MaybeUninit<T>] {
-        // Reserve the new space.
-        vec.reserve(len);
-
-        // TODO: use `Vec::spare_capacity_mut` instead
-        // SAFETY: `MaybeUninit<T>` is guaranteed to have the same layout
-        // as `T`, and we already made sure to have the additional space.
-        let start = vec.len();
-        let tail_ptr = vec[start..].as_mut_ptr() as *mut MaybeUninit<T>;
-        unsafe { slice::from_raw_parts_mut(tail_ptr, len) }
-    }
-}
-
-/// Extends a vector with items from a parallel iterator.
-impl<T> ParallelExtend<T> for Vec<T>
+/// Create a consumer on the slice of memory we are collecting into.
+///
+/// The consumer needs to be used inside the scope function, and the
+/// complete collect result passed back.
+///
+/// This method will verify the collect result, and panic if the slice
+/// was not fully written into. Otherwise, in the successful case,
+/// the vector is complete with the collected result.
+fn collect_with_consumer<T, F>(vec: &mut Vec<T>, len: usize, scope_fn: F)
 where
     T: Send,
+    F: FnOnce(CollectConsumer<'_, T>) -> CollectResult<'_, T>,
 {
-    fn par_extend<I>(&mut self, par_iter: I)
-    where
-        I: IntoParallelIterator<Item = T>,
-    {
-        // See the vec_collect benchmarks in rayon-demo for different strategies.
-        let par_iter = par_iter.into_par_iter();
-        match par_iter.opt_len() {
-            Some(len) => {
-                // When Rust gets specialization, we can get here for indexed iterators
-                // without relying on `opt_len`.  Until then, `special_extend()` fakes
-                // an unindexed mode on the promise that `opt_len()` is accurate.
-                special_extend(par_iter, len, self);
-            }
-            None => {
-                // This works like `extend`, but `Vec::append` is more efficient.
-                let list = super::extend::collect(par_iter);
-                self.reserve(super::extend::len(&list));
-                for mut vec in list {
-                    self.append(&mut vec);
-                }
-            }
-        }
+    // Reserve space for `len` more elements in the vector,
+    vec.reserve(len);
+
+    // Create the consumer and run the callback for collection.
+    let result = scope_fn(CollectConsumer::appender(vec, len));
+
+    // The `CollectResult` represents a contiguous part of the slice, that has
+    // been written to. On unwind here, the `CollectResult` will be dropped. If
+    // some producers on the way did not produce enough elements, partial
+    // `CollectResult`s may have been dropped without being reduced to the final
+    // result, and we will see that as the length coming up short.
+    //
+    // Here, we assert that added length is fully initialized. This is checked
+    // by the following assert, which verifies if a complete `CollectResult`
+    // was produced; if the length is correct, it is necessarily covering the
+    // target slice. Since we know that the consumer cannot have escaped from
+    // `drive` (by parametricity, essentially), we know that any stores that
+    // will happen, have happened. Unless some code is buggy, that means we
+    // should have seen `len` total writes.
+    let actual_writes = result.len();
+    assert!(
+        actual_writes == len,
+        "expected {} total writes, but got {}",
+        len,
+        actual_writes
+    );
+
+    // Release the result's mutable borrow and "proxy ownership"
+    // of the elements, before the vector takes it over.
+    result.release_ownership();
+
+    let new_len = vec.len() + len;
+
+    unsafe {
+        vec.set_len(new_len);
     }
 }
diff --git a/src/iter/collect/test.rs b/src/iter/collect/test.rs
index ddf7757..97bec3f 100644
--- a/src/iter/collect/test.rs
+++ b/src/iter/collect/test.rs
@@ -5,7 +5,7 @@
 // try to drive the "collect consumer" incorrectly. These should
 // result in panics.
 
-use super::Collect;
+use super::collect_with_consumer;
 use crate::iter::plumbing::*;
 use rayon_core::join;
 
@@ -20,7 +20,7 @@
 #[should_panic(expected = "too many values")]
 fn produce_too_many_items() {
     let mut v = vec![];
-    Collect::new(&mut v, 2).with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 2, |consumer| {
         let mut folder = consumer.into_folder();
         folder = folder.consume(22);
         folder = folder.consume(23);
@@ -35,8 +35,7 @@
 #[should_panic(expected = "expected 5 total writes, but got 2")]
 fn produce_fewer_items() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 5);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 5, |consumer| {
         let mut folder = consumer.into_folder();
         folder = folder.consume(22);
         folder = folder.consume(23);
@@ -49,8 +48,7 @@
 #[should_panic(expected = "expected 4 total writes, but got 2")]
 fn left_produces_items_with_no_complete() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
         let mut right_folder = right_consumer.into_folder();
@@ -66,8 +64,7 @@
 #[should_panic(expected = "expected 4 total writes, but got 2")]
 fn right_produces_items_with_no_complete() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
         let mut right_folder = right_consumer.into_folder();
@@ -79,12 +76,12 @@
 
 // Complete is not called by the consumer. Hence,the collection vector is not fully initialized.
 #[test]
+#[cfg_attr(not(panic = "unwind"), ignore)]
 fn produces_items_with_no_complete() {
     let counter = DropCounter::default();
     let mut v = vec![];
     let panic_result = panic::catch_unwind(panic::AssertUnwindSafe(|| {
-        let collect = Collect::new(&mut v, 2);
-        collect.with_consumer(|consumer| {
+        collect_with_consumer(&mut v, 2, |consumer| {
             let mut folder = consumer.into_folder();
             folder = folder.consume(counter.element());
             folder = folder.consume(counter.element());
@@ -102,8 +99,7 @@
 #[should_panic(expected = "too many values")]
 fn left_produces_too_many_items() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
         let mut right_folder = right_consumer.into_folder();
@@ -120,8 +116,7 @@
 #[should_panic(expected = "too many values")]
 fn right_produces_too_many_items() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
         let mut right_folder = right_consumer.into_folder();
@@ -138,8 +133,7 @@
 #[should_panic(expected = "expected 4 total writes, but got 1")]
 fn left_produces_fewer_items() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let reducer = consumer.to_reducer();
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
@@ -158,8 +152,7 @@
 #[should_panic(expected = "expected 4 total writes, but got 2")]
 fn only_left_result() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
         let mut right_folder = right_consumer.into_folder();
@@ -177,8 +170,7 @@
 #[should_panic(expected = "expected 4 total writes, but got 2")]
 fn only_right_result() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
         let mut right_folder = right_consumer.into_folder();
@@ -195,8 +187,7 @@
 #[should_panic(expected = "expected 4 total writes, but got 2")]
 fn reducer_does_not_preserve_order() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let reducer = consumer.to_reducer();
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
@@ -215,8 +206,7 @@
 #[should_panic(expected = "expected 4 total writes, but got 3")]
 fn right_produces_fewer_items() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let reducer = consumer.to_reducer();
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let mut left_folder = left_consumer.into_folder();
@@ -230,13 +220,12 @@
 }
 
 // The left consumer panics and the right stops short, like `panic_fuse()`.
-// We should get the left panic without finishing `Collect::with_consumer`.
+// We should get the left panic without finishing `collect_with_consumer`.
 #[test]
 #[should_panic(expected = "left consumer panic")]
 fn left_panics() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let reducer = consumer.to_reducer();
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let (left_result, right_result) = join(
@@ -257,13 +246,12 @@
 }
 
 // The right consumer panics and the left stops short, like `panic_fuse()`.
-// We should get the right panic without finishing `Collect::with_consumer`.
+// We should get the right panic without finishing `collect_with_consumer`.
 #[test]
 #[should_panic(expected = "right consumer panic")]
 fn right_panics() {
     let mut v = vec![];
-    let collect = Collect::new(&mut v, 4);
-    collect.with_consumer(|consumer| {
+    collect_with_consumer(&mut v, 4, |consumer| {
         let reducer = consumer.to_reducer();
         let (left_consumer, right_consumer, _) = consumer.split_at(2);
         let (left_result, right_result) = join(
@@ -286,12 +274,12 @@
 // The left consumer produces fewer items while the right
 // consumer produces correct number; check that created elements are dropped
 #[test]
+#[cfg_attr(not(panic = "unwind"), ignore)]
 fn left_produces_fewer_items_drops() {
     let counter = DropCounter::default();
     let mut v = vec![];
     let panic_result = panic::catch_unwind(panic::AssertUnwindSafe(|| {
-        let collect = Collect::new(&mut v, 4);
-        collect.with_consumer(|consumer| {
+        collect_with_consumer(&mut v, 4, |consumer| {
             let reducer = consumer.to_reducer();
             let (left_consumer, right_consumer, _) = consumer.split_at(2);
             let mut left_folder = left_consumer.into_folder();
diff --git a/src/iter/extend.rs b/src/iter/extend.rs
index fb89249..1769d47 100644
--- a/src/iter/extend.rs
+++ b/src/iter/extend.rs
@@ -1,4 +1,5 @@
 use super::noop::NoopConsumer;
+use super::plumbing::{Consumer, Folder, Reducer, UnindexedConsumer};
 use super::{IntoParallelIterator, ParallelExtend, ParallelIterator};
 
 use std::borrow::Cow;
@@ -9,55 +10,91 @@
 
 /// Performs a generic `par_extend` by collecting to a `LinkedList<Vec<_>>` in
 /// parallel, then extending the collection sequentially.
-fn extend<C, I, F>(collection: &mut C, par_iter: I, reserve: F)
-where
-    I: IntoParallelIterator,
-    F: FnOnce(&mut C, &LinkedList<Vec<I::Item>>),
-    C: Extend<I::Item>,
-{
-    let list = collect(par_iter);
-    reserve(collection, &list);
-    for vec in list {
-        collection.extend(vec);
-    }
-}
-
-pub(super) fn collect<I>(par_iter: I) -> LinkedList<Vec<I::Item>>
-where
-    I: IntoParallelIterator,
-{
-    par_iter
-        .into_par_iter()
-        .fold(Vec::new, vec_push)
-        .map(as_list)
-        .reduce(LinkedList::new, list_append)
-}
-
-fn vec_push<T>(mut vec: Vec<T>, elem: T) -> Vec<T> {
-    vec.push(elem);
-    vec
-}
-
-fn as_list<T>(item: T) -> LinkedList<T> {
-    let mut list = LinkedList::new();
-    list.push_back(item);
-    list
-}
-
-fn list_append<T>(mut list1: LinkedList<T>, mut list2: LinkedList<T>) -> LinkedList<T> {
-    list1.append(&mut list2);
-    list1
+macro_rules! extend {
+    ($self:ident, $par_iter:ident, $extend:ident) => {
+        $extend(
+            $self,
+            $par_iter.into_par_iter().drive_unindexed(ListVecConsumer),
+        );
+    };
 }
 
 /// Computes the total length of a `LinkedList<Vec<_>>`.
-pub(super) fn len<T>(list: &LinkedList<Vec<T>>) -> usize {
+fn len<T>(list: &LinkedList<Vec<T>>) -> usize {
     list.iter().map(Vec::len).sum()
 }
 
-fn no_reserve<C, T>(_: &mut C, _: &LinkedList<Vec<T>>) {}
+struct ListVecConsumer;
 
-fn heap_reserve<T: Ord, U>(heap: &mut BinaryHeap<T>, list: &LinkedList<Vec<U>>) {
-    heap.reserve(len(list));
+struct ListVecFolder<T> {
+    vec: Vec<T>,
+}
+
+impl<T: Send> Consumer<T> for ListVecConsumer {
+    type Folder = ListVecFolder<T>;
+    type Reducer = ListReducer;
+    type Result = LinkedList<Vec<T>>;
+
+    fn split_at(self, _index: usize) -> (Self, Self, Self::Reducer) {
+        (Self, Self, ListReducer)
+    }
+
+    fn into_folder(self) -> Self::Folder {
+        ListVecFolder { vec: Vec::new() }
+    }
+
+    fn full(&self) -> bool {
+        false
+    }
+}
+
+impl<T: Send> UnindexedConsumer<T> for ListVecConsumer {
+    fn split_off_left(&self) -> Self {
+        Self
+    }
+
+    fn to_reducer(&self) -> Self::Reducer {
+        ListReducer
+    }
+}
+
+impl<T> Folder<T> for ListVecFolder<T> {
+    type Result = LinkedList<Vec<T>>;
+
+    fn consume(mut self, item: T) -> Self {
+        self.vec.push(item);
+        self
+    }
+
+    fn consume_iter<I>(mut self, iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        self.vec.extend(iter);
+        self
+    }
+
+    fn complete(self) -> Self::Result {
+        let mut list = LinkedList::new();
+        if !self.vec.is_empty() {
+            list.push_back(self.vec);
+        }
+        list
+    }
+
+    fn full(&self) -> bool {
+        false
+    }
+}
+
+fn heap_extend<T, Item>(heap: &mut BinaryHeap<T>, list: LinkedList<Vec<Item>>)
+where
+    BinaryHeap<T>: Extend<Item>,
+{
+    heap.reserve(len(&list));
+    for vec in list {
+        heap.extend(vec);
+    }
 }
 
 /// Extends a binary heap with items from a parallel iterator.
@@ -69,7 +106,7 @@
     where
         I: IntoParallelIterator<Item = T>,
     {
-        extend(self, par_iter, heap_reserve);
+        extend!(self, par_iter, heap_extend);
     }
 }
 
@@ -82,7 +119,16 @@
     where
         I: IntoParallelIterator<Item = &'a T>,
     {
-        extend(self, par_iter, heap_reserve);
+        extend!(self, par_iter, heap_extend);
+    }
+}
+
+fn btree_map_extend<K, V, Item>(map: &mut BTreeMap<K, V>, list: LinkedList<Vec<Item>>)
+where
+    BTreeMap<K, V>: Extend<Item>,
+{
+    for vec in list {
+        map.extend(vec);
     }
 }
 
@@ -96,7 +142,7 @@
     where
         I: IntoParallelIterator<Item = (K, V)>,
     {
-        extend(self, par_iter, no_reserve);
+        extend!(self, par_iter, btree_map_extend);
     }
 }
 
@@ -110,7 +156,16 @@
     where
         I: IntoParallelIterator<Item = (&'a K, &'a V)>,
     {
-        extend(self, par_iter, no_reserve);
+        extend!(self, par_iter, btree_map_extend);
+    }
+}
+
+fn btree_set_extend<T, Item>(set: &mut BTreeSet<T>, list: LinkedList<Vec<Item>>)
+where
+    BTreeSet<T>: Extend<Item>,
+{
+    for vec in list {
+        set.extend(vec);
     }
 }
 
@@ -123,7 +178,7 @@
     where
         I: IntoParallelIterator<Item = T>,
     {
-        extend(self, par_iter, no_reserve);
+        extend!(self, par_iter, btree_set_extend);
     }
 }
 
@@ -136,16 +191,20 @@
     where
         I: IntoParallelIterator<Item = &'a T>,
     {
-        extend(self, par_iter, no_reserve);
+        extend!(self, par_iter, btree_set_extend);
     }
 }
 
-fn map_reserve<K, V, S, U>(map: &mut HashMap<K, V, S>, list: &LinkedList<Vec<U>>)
+fn hash_map_extend<K, V, S, Item>(map: &mut HashMap<K, V, S>, list: LinkedList<Vec<Item>>)
 where
+    HashMap<K, V, S>: Extend<Item>,
     K: Eq + Hash,
     S: BuildHasher,
 {
-    map.reserve(len(list));
+    map.reserve(len(&list));
+    for vec in list {
+        map.extend(vec);
+    }
 }
 
 /// Extends a hash map with items from a parallel iterator.
@@ -160,7 +219,7 @@
         I: IntoParallelIterator<Item = (K, V)>,
     {
         // See the map_collect benchmarks in rayon-demo for different strategies.
-        extend(self, par_iter, map_reserve);
+        extend!(self, par_iter, hash_map_extend);
     }
 }
 
@@ -175,16 +234,20 @@
     where
         I: IntoParallelIterator<Item = (&'a K, &'a V)>,
     {
-        extend(self, par_iter, map_reserve);
+        extend!(self, par_iter, hash_map_extend);
     }
 }
 
-fn set_reserve<T, S, U>(set: &mut HashSet<T, S>, list: &LinkedList<Vec<U>>)
+fn hash_set_extend<T, S, Item>(set: &mut HashSet<T, S>, list: LinkedList<Vec<Item>>)
 where
+    HashSet<T, S>: Extend<Item>,
     T: Eq + Hash,
     S: BuildHasher,
 {
-    set.reserve(len(list));
+    set.reserve(len(&list));
+    for vec in list {
+        set.extend(vec);
+    }
 }
 
 /// Extends a hash set with items from a parallel iterator.
@@ -197,7 +260,7 @@
     where
         I: IntoParallelIterator<Item = T>,
     {
-        extend(self, par_iter, set_reserve);
+        extend!(self, par_iter, hash_set_extend);
     }
 }
 
@@ -211,15 +274,10 @@
     where
         I: IntoParallelIterator<Item = &'a T>,
     {
-        extend(self, par_iter, set_reserve);
+        extend!(self, par_iter, hash_set_extend);
     }
 }
 
-fn list_push_back<T>(mut list: LinkedList<T>, elem: T) -> LinkedList<T> {
-    list.push_back(elem);
-    list
-}
-
 /// Extends a linked list with items from a parallel iterator.
 impl<T> ParallelExtend<T> for LinkedList<T>
 where
@@ -229,10 +287,7 @@
     where
         I: IntoParallelIterator<Item = T>,
     {
-        let mut list = par_iter
-            .into_par_iter()
-            .fold(LinkedList::new, list_push_back)
-            .reduce(LinkedList::new, list_append);
+        let mut list = par_iter.into_par_iter().drive_unindexed(ListConsumer);
         self.append(&mut list);
     }
 }
@@ -246,13 +301,83 @@
     where
         I: IntoParallelIterator<Item = &'a T>,
     {
-        self.par_extend(par_iter.into_par_iter().cloned())
+        self.par_extend(par_iter.into_par_iter().copied())
     }
 }
 
-fn string_push(mut string: String, ch: char) -> String {
-    string.push(ch);
-    string
+struct ListConsumer;
+
+struct ListFolder<T> {
+    list: LinkedList<T>,
+}
+
+struct ListReducer;
+
+impl<T: Send> Consumer<T> for ListConsumer {
+    type Folder = ListFolder<T>;
+    type Reducer = ListReducer;
+    type Result = LinkedList<T>;
+
+    fn split_at(self, _index: usize) -> (Self, Self, Self::Reducer) {
+        (Self, Self, ListReducer)
+    }
+
+    fn into_folder(self) -> Self::Folder {
+        ListFolder {
+            list: LinkedList::new(),
+        }
+    }
+
+    fn full(&self) -> bool {
+        false
+    }
+}
+
+impl<T: Send> UnindexedConsumer<T> for ListConsumer {
+    fn split_off_left(&self) -> Self {
+        Self
+    }
+
+    fn to_reducer(&self) -> Self::Reducer {
+        ListReducer
+    }
+}
+
+impl<T> Folder<T> for ListFolder<T> {
+    type Result = LinkedList<T>;
+
+    fn consume(mut self, item: T) -> Self {
+        self.list.push_back(item);
+        self
+    }
+
+    fn consume_iter<I>(mut self, iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        self.list.extend(iter);
+        self
+    }
+
+    fn complete(self) -> Self::Result {
+        self.list
+    }
+
+    fn full(&self) -> bool {
+        false
+    }
+}
+
+impl<T> Reducer<LinkedList<T>> for ListReducer {
+    fn reduce(self, mut left: LinkedList<T>, mut right: LinkedList<T>) -> LinkedList<T> {
+        left.append(&mut right);
+        left
+    }
+}
+
+fn flat_string_extend(string: &mut String, list: LinkedList<String>) {
+    string.reserve(list.iter().map(String::len).sum());
+    string.extend(list);
 }
 
 /// Extends a string with characters from a parallel iterator.
@@ -263,14 +388,8 @@
     {
         // This is like `extend`, but `Vec<char>` is less efficient to deal
         // with than `String`, so instead collect to `LinkedList<String>`.
-        let list: LinkedList<_> = par_iter
-            .into_par_iter()
-            .fold(String::new, string_push)
-            .map(as_list)
-            .reduce(LinkedList::new, list_append);
-
-        self.reserve(list.iter().map(String::len).sum());
-        self.extend(list)
+        let list = par_iter.into_par_iter().drive_unindexed(ListStringConsumer);
+        flat_string_extend(self, list);
     }
 }
 
@@ -280,13 +399,85 @@
     where
         I: IntoParallelIterator<Item = &'a char>,
     {
-        self.par_extend(par_iter.into_par_iter().cloned())
+        self.par_extend(par_iter.into_par_iter().copied())
     }
 }
 
-fn string_reserve<T: AsRef<str>>(string: &mut String, list: &LinkedList<Vec<T>>) {
-    let len = list.iter().flatten().map(T::as_ref).map(str::len).sum();
+struct ListStringConsumer;
+
+struct ListStringFolder {
+    string: String,
+}
+
+impl Consumer<char> for ListStringConsumer {
+    type Folder = ListStringFolder;
+    type Reducer = ListReducer;
+    type Result = LinkedList<String>;
+
+    fn split_at(self, _index: usize) -> (Self, Self, Self::Reducer) {
+        (Self, Self, ListReducer)
+    }
+
+    fn into_folder(self) -> Self::Folder {
+        ListStringFolder {
+            string: String::new(),
+        }
+    }
+
+    fn full(&self) -> bool {
+        false
+    }
+}
+
+impl UnindexedConsumer<char> for ListStringConsumer {
+    fn split_off_left(&self) -> Self {
+        Self
+    }
+
+    fn to_reducer(&self) -> Self::Reducer {
+        ListReducer
+    }
+}
+
+impl Folder<char> for ListStringFolder {
+    type Result = LinkedList<String>;
+
+    fn consume(mut self, item: char) -> Self {
+        self.string.push(item);
+        self
+    }
+
+    fn consume_iter<I>(mut self, iter: I) -> Self
+    where
+        I: IntoIterator<Item = char>,
+    {
+        self.string.extend(iter);
+        self
+    }
+
+    fn complete(self) -> Self::Result {
+        let mut list = LinkedList::new();
+        if !self.string.is_empty() {
+            list.push_back(self.string);
+        }
+        list
+    }
+
+    fn full(&self) -> bool {
+        false
+    }
+}
+
+fn string_extend<Item>(string: &mut String, list: LinkedList<Vec<Item>>)
+where
+    String: Extend<Item>,
+    Item: AsRef<str>,
+{
+    let len = list.iter().flatten().map(Item::as_ref).map(str::len).sum();
     string.reserve(len);
+    for vec in list {
+        string.extend(vec);
+    }
 }
 
 /// Extends a string with string slices from a parallel iterator.
@@ -295,7 +486,7 @@
     where
         I: IntoParallelIterator<Item = &'a str>,
     {
-        extend(self, par_iter, string_reserve);
+        extend!(self, par_iter, string_extend);
     }
 }
 
@@ -305,7 +496,7 @@
     where
         I: IntoParallelIterator<Item = String>,
     {
-        extend(self, par_iter, string_reserve);
+        extend!(self, par_iter, string_extend);
     }
 }
 
@@ -315,12 +506,18 @@
     where
         I: IntoParallelIterator<Item = Cow<'a, str>>,
     {
-        extend(self, par_iter, string_reserve);
+        extend!(self, par_iter, string_extend);
     }
 }
 
-fn deque_reserve<T, U>(deque: &mut VecDeque<T>, list: &LinkedList<Vec<U>>) {
-    deque.reserve(len(list));
+fn deque_extend<T, Item>(deque: &mut VecDeque<T>, list: LinkedList<Vec<Item>>)
+where
+    VecDeque<T>: Extend<Item>,
+{
+    deque.reserve(len(&list));
+    for vec in list {
+        deque.extend(vec);
+    }
 }
 
 /// Extends a deque with items from a parallel iterator.
@@ -332,7 +529,7 @@
     where
         I: IntoParallelIterator<Item = T>,
     {
-        extend(self, par_iter, deque_reserve);
+        extend!(self, par_iter, deque_extend);
     }
 }
 
@@ -345,12 +542,43 @@
     where
         I: IntoParallelIterator<Item = &'a T>,
     {
-        extend(self, par_iter, deque_reserve);
+        extend!(self, par_iter, deque_extend);
     }
 }
 
-// See the `collect` module for the `Vec<T>` implementation.
-// impl<T> ParallelExtend<T> for Vec<T>
+fn vec_append<T>(vec: &mut Vec<T>, list: LinkedList<Vec<T>>) {
+    vec.reserve(len(&list));
+    for mut other in list {
+        vec.append(&mut other);
+    }
+}
+
+/// Extends a vector with items from a parallel iterator.
+impl<T> ParallelExtend<T> for Vec<T>
+where
+    T: Send,
+{
+    fn par_extend<I>(&mut self, par_iter: I)
+    where
+        I: IntoParallelIterator<Item = T>,
+    {
+        // See the vec_collect benchmarks in rayon-demo for different strategies.
+        let par_iter = par_iter.into_par_iter();
+        match par_iter.opt_len() {
+            Some(len) => {
+                // When Rust gets specialization, we can get here for indexed iterators
+                // without relying on `opt_len`.  Until then, `special_extend()` fakes
+                // an unindexed mode on the promise that `opt_len()` is accurate.
+                super::collect::special_extend(par_iter, len, self);
+            }
+            None => {
+                // This works like `extend`, but `Vec::append` is more efficient.
+                let list = par_iter.drive_unindexed(ListVecConsumer);
+                vec_append(self, list);
+            }
+        }
+    }
+}
 
 /// Extends a vector with copied items from a parallel iterator.
 impl<'a, T> ParallelExtend<&'a T> for Vec<T>
@@ -361,7 +589,7 @@
     where
         I: IntoParallelIterator<Item = &'a T>,
     {
-        self.par_extend(par_iter.into_par_iter().cloned())
+        self.par_extend(par_iter.into_par_iter().copied())
     }
 }
 
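As a rough usage sketch of the reworked `ParallelExtend` path above (my own example, not part of the patch; assumes the rayon 1.7.0 API packaged here): each split fills its own `Vec`, the per-split `Vec`s are linked into a `LinkedList`, and the target reserves once via `len` before being extended sequentially.

```rust
use rayon::prelude::*;
use std::collections::HashMap;

fn main() {
    let mut map = HashMap::new();
    // Drives the unindexed ListVecConsumer under the hood: per-split Vec<(i32, i32)>s
    // are linked together, the map reserves the total length once, then extends.
    map.par_extend((0..1_000).into_par_iter().map(|i| (i, i * i)));
    assert_eq!(map.len(), 1_000);
    assert_eq!(map[&12], 144);
}
```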
diff --git a/src/iter/filter.rs b/src/iter/filter.rs
index 38627f7..e1b74ba 100644
--- a/src/iter/filter.rs
+++ b/src/iter/filter.rs
@@ -97,7 +97,7 @@
     P: Fn(&T) -> bool + Sync,
 {
     fn split_off_left(&self) -> Self {
-        FilterConsumer::new(self.base.split_off_left(), &self.filter_op)
+        FilterConsumer::new(self.base.split_off_left(), self.filter_op)
     }
 
     fn to_reducer(&self) -> Self::Reducer {
diff --git a/src/iter/filter_map.rs b/src/iter/filter_map.rs
index f19c385..db1c7e3 100644
--- a/src/iter/filter_map.rs
+++ b/src/iter/filter_map.rs
@@ -98,7 +98,7 @@
     P: Fn(T) -> Option<U> + Sync + 'p,
 {
     fn split_off_left(&self) -> Self {
-        FilterMapConsumer::new(self.base.split_off_left(), &self.filter_op)
+        FilterMapConsumer::new(self.base.split_off_left(), self.filter_op)
     }
 
     fn to_reducer(&self) -> Self::Reducer {
diff --git a/src/iter/find.rs b/src/iter/find.rs
index 971db2b..b16ee84 100644
--- a/src/iter/find.rs
+++ b/src/iter/find.rs
@@ -94,7 +94,7 @@
         self.item = iter
             .into_iter()
             // stop iterating if another thread has found something
-            .take_while(not_full(&self.found))
+            .take_while(not_full(self.found))
             .find(self.find_op);
         if self.item.is_some() {
             self.found.store(true, Ordering::Relaxed)
diff --git a/src/iter/fold_chunks.rs b/src/iter/fold_chunks.rs
new file mode 100644
index 0000000..185fb1a
--- /dev/null
+++ b/src/iter/fold_chunks.rs
@@ -0,0 +1,236 @@
+use std::fmt::{self, Debug};
+
+use super::chunks::ChunkProducer;
+use super::plumbing::*;
+use super::*;
+use crate::math::div_round_up;
+
+/// `FoldChunks` is an iterator that groups elements of an underlying iterator and applies a
+/// function over them, producing a single value for each group.
+///
+/// This struct is created by the [`fold_chunks()`] method on [`IndexedParallelIterator`]
+///
+/// [`fold_chunks()`]: trait.IndexedParallelIterator.html#method.fold_chunks
+/// [`IndexedParallelIterator`]: trait.IndexedParallelIterator.html
+#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
+#[derive(Clone)]
+pub struct FoldChunks<I, ID, F>
+where
+    I: IndexedParallelIterator,
+{
+    base: I,
+    chunk_size: usize,
+    fold_op: F,
+    identity: ID,
+}
+
+impl<I: IndexedParallelIterator + Debug, ID, F> Debug for FoldChunks<I, ID, F> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Fold")
+            .field("base", &self.base)
+            .field("chunk_size", &self.chunk_size)
+            .finish()
+    }
+}
+
+impl<I, ID, U, F> FoldChunks<I, ID, F>
+where
+    I: IndexedParallelIterator,
+    ID: Fn() -> U + Send + Sync,
+    F: Fn(U, I::Item) -> U + Send + Sync,
+    U: Send,
+{
+    /// Creates a new `FoldChunks` iterator
+    pub(super) fn new(base: I, chunk_size: usize, identity: ID, fold_op: F) -> Self {
+        FoldChunks {
+            base,
+            chunk_size,
+            identity,
+            fold_op,
+        }
+    }
+}
+
+impl<I, ID, U, F> ParallelIterator for FoldChunks<I, ID, F>
+where
+    I: IndexedParallelIterator,
+    ID: Fn() -> U + Send + Sync,
+    F: Fn(U, I::Item) -> U + Send + Sync,
+    U: Send,
+{
+    type Item = U;
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<U>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<I, ID, U, F> IndexedParallelIterator for FoldChunks<I, ID, F>
+where
+    I: IndexedParallelIterator,
+    ID: Fn() -> U + Send + Sync,
+    F: Fn(U, I::Item) -> U + Send + Sync,
+    U: Send,
+{
+    fn len(&self) -> usize {
+        div_round_up(self.base.len(), self.chunk_size)
+    }
+
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        let len = self.base.len();
+        return self.base.with_producer(Callback {
+            chunk_size: self.chunk_size,
+            len,
+            identity: self.identity,
+            fold_op: self.fold_op,
+            callback,
+        });
+
+        struct Callback<CB, ID, F> {
+            chunk_size: usize,
+            len: usize,
+            identity: ID,
+            fold_op: F,
+            callback: CB,
+        }
+
+        impl<T, CB, ID, U, F> ProducerCallback<T> for Callback<CB, ID, F>
+        where
+            CB: ProducerCallback<U>,
+            ID: Fn() -> U + Send + Sync,
+            F: Fn(U, T) -> U + Send + Sync,
+        {
+            type Output = CB::Output;
+
+            fn callback<P>(self, base: P) -> CB::Output
+            where
+                P: Producer<Item = T>,
+            {
+                let identity = &self.identity;
+                let fold_op = &self.fold_op;
+                let fold_iter = move |iter: P::IntoIter| iter.fold(identity(), fold_op);
+                let producer = ChunkProducer::new(self.chunk_size, self.len, base, fold_iter);
+                self.callback.callback(producer)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use std::ops::Add;
+
+    #[test]
+    fn check_fold_chunks() {
+        let words = "bishbashbosh!"
+            .chars()
+            .collect::<Vec<_>>()
+            .into_par_iter()
+            .fold_chunks(4, String::new, |mut s, c| {
+                s.push(c);
+                s
+            })
+            .collect::<Vec<_>>();
+
+        assert_eq!(words, vec!["bish", "bash", "bosh", "!"]);
+    }
+
+    // 'closure' values for tests below
+    fn id() -> i32 {
+        0
+    }
+    fn sum<T, U>(x: T, y: U) -> T
+    where
+        T: Add<U, Output = T>,
+    {
+        x + y
+    }
+
+    #[test]
+    #[should_panic(expected = "chunk_size must not be zero")]
+    fn check_fold_chunks_zero_size() {
+        let _: Vec<i32> = vec![1, 2, 3]
+            .into_par_iter()
+            .fold_chunks(0, id, sum)
+            .collect();
+    }
+
+    #[test]
+    fn check_fold_chunks_even_size() {
+        assert_eq!(
+            vec![1 + 2 + 3, 4 + 5 + 6, 7 + 8 + 9],
+            (1..10)
+                .into_par_iter()
+                .fold_chunks(3, id, sum)
+                .collect::<Vec<i32>>()
+        );
+    }
+
+    #[test]
+    fn check_fold_chunks_empty() {
+        let v: Vec<i32> = vec![];
+        let expected: Vec<i32> = vec![];
+        assert_eq!(
+            expected,
+            v.into_par_iter()
+                .fold_chunks(2, id, sum)
+                .collect::<Vec<i32>>()
+        );
+    }
+
+    #[test]
+    fn check_fold_chunks_len() {
+        assert_eq!(4, (0..8).into_par_iter().fold_chunks(2, id, sum).len());
+        assert_eq!(3, (0..9).into_par_iter().fold_chunks(3, id, sum).len());
+        assert_eq!(3, (0..8).into_par_iter().fold_chunks(3, id, sum).len());
+        assert_eq!(1, (&[1]).par_iter().fold_chunks(3, id, sum).len());
+        assert_eq!(0, (0..0).into_par_iter().fold_chunks(3, id, sum).len());
+    }
+
+    #[test]
+    fn check_fold_chunks_uneven() {
+        let cases: Vec<(Vec<u32>, usize, Vec<u32>)> = vec![
+            ((0..5).collect(), 3, vec![0 + 1 + 2, 3 + 4]),
+            (vec![1], 5, vec![1]),
+            ((0..4).collect(), 3, vec![0 + 1 + 2, 3]),
+        ];
+
+        for (i, (v, n, expected)) in cases.into_iter().enumerate() {
+            let mut res: Vec<u32> = vec![];
+            v.par_iter()
+                .fold_chunks(n, || 0, sum)
+                .collect_into_vec(&mut res);
+            assert_eq!(expected, res, "Case {} failed", i);
+
+            res.truncate(0);
+            v.into_par_iter()
+                .fold_chunks(n, || 0, sum)
+                .rev()
+                .collect_into_vec(&mut res);
+            assert_eq!(
+                expected.into_iter().rev().collect::<Vec<u32>>(),
+                res,
+                "Case {} reversed failed",
+                i
+            );
+        }
+    }
+}
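For orientation, a small sketch (my own, assuming the rayon 1.7.0 API in this patch) showing that `fold_chunks` matches the allocating `chunks()` + sequential `fold()` formulation it is documented to replace:

```rust
use rayon::prelude::*;

fn main() {
    let nums: Vec<u32> = (1..=10).collect();

    // fold_chunks folds each fixed-size chunk directly, with no per-chunk Vec...
    let folded: Vec<u32> = nums
        .par_iter()
        .copied()
        .fold_chunks(3, || 0, |acc, n| acc + n)
        .collect();

    // ...which should agree with the chunks() + Iterator::fold equivalent.
    let chunked: Vec<u32> = nums
        .par_iter()
        .copied()
        .chunks(3)
        .map(|chunk| chunk.into_iter().fold(0, |acc, n| acc + n))
        .collect();

    assert_eq!(folded, chunked);
    assert_eq!(folded, vec![6, 15, 24, 10]);
}
```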
diff --git a/src/iter/fold_chunks_with.rs b/src/iter/fold_chunks_with.rs
new file mode 100644
index 0000000..af120ae
--- /dev/null
+++ b/src/iter/fold_chunks_with.rs
@@ -0,0 +1,231 @@
+use std::fmt::{self, Debug};
+
+use super::chunks::ChunkProducer;
+use super::plumbing::*;
+use super::*;
+use crate::math::div_round_up;
+
+/// `FoldChunksWith` is an iterator that groups elements of an underlying iterator and applies a
+/// function over them, producing a single value for each group.
+///
+/// This struct is created by the [`fold_chunks_with()`] method on [`IndexedParallelIterator`]
+///
+/// [`fold_chunks_with()`]: trait.IndexedParallelIterator.html#method.fold_chunks_with
+/// [`IndexedParallelIterator`]: trait.IndexedParallelIterator.html
+#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
+#[derive(Clone)]
+pub struct FoldChunksWith<I, U, F>
+where
+    I: IndexedParallelIterator,
+{
+    base: I,
+    chunk_size: usize,
+    item: U,
+    fold_op: F,
+}
+
+impl<I: IndexedParallelIterator + Debug, U: Debug, F> Debug for FoldChunksWith<I, U, F> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Fold")
+            .field("base", &self.base)
+            .field("chunk_size", &self.chunk_size)
+            .field("item", &self.item)
+            .finish()
+    }
+}
+
+impl<I, U, F> FoldChunksWith<I, U, F>
+where
+    I: IndexedParallelIterator,
+    U: Send + Clone,
+    F: Fn(U, I::Item) -> U + Send + Sync,
+{
+    /// Creates a new `FoldChunksWith` iterator
+    pub(super) fn new(base: I, chunk_size: usize, item: U, fold_op: F) -> Self {
+        FoldChunksWith {
+            base,
+            chunk_size,
+            item,
+            fold_op,
+        }
+    }
+}
+
+impl<I, U, F> ParallelIterator for FoldChunksWith<I, U, F>
+where
+    I: IndexedParallelIterator,
+    U: Send + Clone,
+    F: Fn(U, I::Item) -> U + Send + Sync,
+{
+    type Item = U;
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<U>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<I, U, F> IndexedParallelIterator for FoldChunksWith<I, U, F>
+where
+    I: IndexedParallelIterator,
+    U: Send + Clone,
+    F: Fn(U, I::Item) -> U + Send + Sync,
+{
+    fn len(&self) -> usize {
+        div_round_up(self.base.len(), self.chunk_size)
+    }
+
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        let len = self.base.len();
+        return self.base.with_producer(Callback {
+            chunk_size: self.chunk_size,
+            len,
+            item: self.item,
+            fold_op: self.fold_op,
+            callback,
+        });
+
+        struct Callback<CB, T, F> {
+            chunk_size: usize,
+            len: usize,
+            item: T,
+            fold_op: F,
+            callback: CB,
+        }
+
+        impl<T, U, F, CB> ProducerCallback<T> for Callback<CB, U, F>
+        where
+            CB: ProducerCallback<U>,
+            U: Send + Clone,
+            F: Fn(U, T) -> U + Send + Sync,
+        {
+            type Output = CB::Output;
+
+            fn callback<P>(self, base: P) -> CB::Output
+            where
+                P: Producer<Item = T>,
+            {
+                let item = self.item;
+                let fold_op = &self.fold_op;
+                let fold_iter = move |iter: P::IntoIter| iter.fold(item.clone(), fold_op);
+                let producer = ChunkProducer::new(self.chunk_size, self.len, base, fold_iter);
+                self.callback.callback(producer)
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use std::ops::Add;
+
+    #[test]
+    fn check_fold_chunks_with() {
+        let words = "bishbashbosh!"
+            .chars()
+            .collect::<Vec<_>>()
+            .into_par_iter()
+            .fold_chunks_with(4, String::new(), |mut s, c| {
+                s.push(c);
+                s
+            })
+            .collect::<Vec<_>>();
+
+        assert_eq!(words, vec!["bish", "bash", "bosh", "!"]);
+    }
+
+    // 'closure' value for tests below
+    fn sum<T, U>(x: T, y: U) -> T
+    where
+        T: Add<U, Output = T>,
+    {
+        x + y
+    }
+
+    #[test]
+    #[should_panic(expected = "chunk_size must not be zero")]
+    fn check_fold_chunks_zero_size() {
+        let _: Vec<i32> = vec![1, 2, 3]
+            .into_par_iter()
+            .fold_chunks_with(0, 0, sum)
+            .collect();
+    }
+
+    #[test]
+    fn check_fold_chunks_even_size() {
+        assert_eq!(
+            vec![1 + 2 + 3, 4 + 5 + 6, 7 + 8 + 9],
+            (1..10)
+                .into_par_iter()
+                .fold_chunks_with(3, 0, sum)
+                .collect::<Vec<i32>>()
+        );
+    }
+
+    #[test]
+    fn check_fold_chunks_with_empty() {
+        let v: Vec<i32> = vec![];
+        let expected: Vec<i32> = vec![];
+        assert_eq!(
+            expected,
+            v.into_par_iter()
+                .fold_chunks_with(2, 0, sum)
+                .collect::<Vec<i32>>()
+        );
+    }
+
+    #[test]
+    fn check_fold_chunks_len() {
+        assert_eq!(4, (0..8).into_par_iter().fold_chunks_with(2, 0, sum).len());
+        assert_eq!(3, (0..9).into_par_iter().fold_chunks_with(3, 0, sum).len());
+        assert_eq!(3, (0..8).into_par_iter().fold_chunks_with(3, 0, sum).len());
+        assert_eq!(1, (&[1]).par_iter().fold_chunks_with(3, 0, sum).len());
+        assert_eq!(0, (0..0).into_par_iter().fold_chunks_with(3, 0, sum).len());
+    }
+
+    #[test]
+    fn check_fold_chunks_uneven() {
+        let cases: Vec<(Vec<u32>, usize, Vec<u32>)> = vec![
+            ((0..5).collect(), 3, vec![0 + 1 + 2, 3 + 4]),
+            (vec![1], 5, vec![1]),
+            ((0..4).collect(), 3, vec![0 + 1 + 2, 3]),
+        ];
+
+        for (i, (v, n, expected)) in cases.into_iter().enumerate() {
+            let mut res: Vec<u32> = vec![];
+            v.par_iter()
+                .fold_chunks_with(n, 0, sum)
+                .collect_into_vec(&mut res);
+            assert_eq!(expected, res, "Case {} failed", i);
+
+            res.truncate(0);
+            v.into_par_iter()
+                .fold_chunks_with(n, 0, sum)
+                .rev()
+                .collect_into_vec(&mut res);
+            assert_eq!(
+                expected.into_iter().rev().collect::<Vec<u32>>(),
+                res,
+                "Case {} reversed failed",
+                i
+            );
+        }
+    }
+}
diff --git a/src/iter/inspect.rs b/src/iter/inspect.rs
index 9b1cd09..c50ca02 100644
--- a/src/iter/inspect.rs
+++ b/src/iter/inspect.rs
@@ -209,7 +209,7 @@
     F: Fn(&T) + Sync,
 {
     fn split_off_left(&self) -> Self {
-        InspectConsumer::new(self.base.split_off_left(), &self.inspect_op)
+        InspectConsumer::new(self.base.split_off_left(), self.inspect_op)
     }
 
     fn to_reducer(&self) -> Self::Reducer {
diff --git a/src/iter/interleave.rs b/src/iter/interleave.rs
index b5d43d5..3cacc49 100644
--- a/src/iter/interleave.rs
+++ b/src/iter/interleave.rs
@@ -310,16 +310,16 @@
 {
     #[inline]
     fn next_back(&mut self) -> Option<I::Item> {
-        if self.i.len() == self.j.len() {
-            if self.i_next {
-                self.i.next_back()
-            } else {
-                self.j.next_back()
+        match self.i.len().cmp(&self.j.len()) {
+            Ordering::Less => self.j.next_back(),
+            Ordering::Equal => {
+                if self.i_next {
+                    self.i.next_back()
+                } else {
+                    self.j.next_back()
+                }
             }
-        } else if self.i.len() < self.j.len() {
-            self.j.next_back()
-        } else {
-            self.i.next_back()
+            Ordering::Greater => self.i.next_back(),
         }
     }
 }
diff --git a/src/iter/len.rs b/src/iter/len.rs
index e65b3c0..8ec7f33 100644
--- a/src/iter/len.rs
+++ b/src/iter/len.rs
@@ -3,9 +3,9 @@
 use std::cmp;
 
 /// `MinLen` is an iterator that imposes a minimum length on iterator splits.
-/// This struct is created by the [`min_len()`] method on [`IndexedParallelIterator`]
+/// This struct is created by the [`with_min_len()`] method on [`IndexedParallelIterator`]
 ///
-/// [`min_len()`]: trait.IndexedParallelIterator.html#method.min_len
+/// [`with_min_len()`]: trait.IndexedParallelIterator.html#method.with_min_len
 /// [`IndexedParallelIterator`]: trait.IndexedParallelIterator.html
 #[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
 #[derive(Debug, Clone)]
@@ -137,9 +137,9 @@
 }
 
 /// `MaxLen` is an iterator that imposes a maximum length on iterator splits.
-/// This struct is created by the [`max_len()`] method on [`IndexedParallelIterator`]
+/// This struct is created by the [`with_max_len()`] method on [`IndexedParallelIterator`]
 ///
-/// [`max_len()`]: trait.IndexedParallelIterator.html#method.max_len
+/// [`with_max_len()`]: trait.IndexedParallelIterator.html#method.with_max_len
 /// [`IndexedParallelIterator`]: trait.IndexedParallelIterator.html
 #[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
 #[derive(Debug, Clone)]
diff --git a/src/iter/map.rs b/src/iter/map.rs
index f2a35ff..da14d40 100644
--- a/src/iter/map.rs
+++ b/src/iter/map.rs
@@ -213,7 +213,7 @@
     R: Send,
 {
     fn split_off_left(&self) -> Self {
-        MapConsumer::new(self.base.split_off_left(), &self.map_op)
+        MapConsumer::new(self.base.split_off_left(), self.map_op)
     }
 
     fn to_reducer(&self) -> Self::Reducer {
diff --git a/src/iter/mod.rs b/src/iter/mod.rs
index edff1a6..e60ea16 100644
--- a/src/iter/mod.rs
+++ b/src/iter/mod.rs
@@ -119,6 +119,8 @@
 mod flatten;
 mod flatten_iter;
 mod fold;
+mod fold_chunks;
+mod fold_chunks_with;
 mod for_each;
 mod from_par_iter;
 mod inspect;
@@ -139,9 +141,14 @@
 mod repeat;
 mod rev;
 mod skip;
+mod skip_any;
+mod skip_any_while;
 mod splitter;
+mod step_by;
 mod sum;
 mod take;
+mod take_any;
+mod take_any_while;
 mod try_fold;
 mod try_reduce;
 mod try_reduce_with;
@@ -165,6 +172,8 @@
     flatten::Flatten,
     flatten_iter::FlattenIter,
     fold::{Fold, FoldWith},
+    fold_chunks::FoldChunks,
+    fold_chunks_with::FoldChunksWith,
     inspect::Inspect,
     interleave::Interleave,
     interleave_shortest::InterleaveShortest,
@@ -180,8 +189,13 @@
     repeat::{repeat, repeatn, Repeat, RepeatN},
     rev::Rev,
     skip::Skip,
+    skip_any::SkipAny,
+    skip_any_while::SkipAnyWhile,
     splitter::{split, Split},
+    step_by::StepBy,
     take::Take,
+    take_any::TakeAny,
+    take_any_while::TakeAnyWhile,
     try_fold::{TryFold, TryFoldWith},
     update::Update,
     while_some::WhileSome,
@@ -189,10 +203,6 @@
     zip_eq::ZipEq,
 };
 
-mod step_by;
-#[cfg(step_by)]
-pub use self::step_by::StepBy;
-
 /// `IntoParallelIterator` implements the conversion to a [`ParallelIterator`].
 ///
 /// By implementing `IntoParallelIterator` for a type, you define how it will
@@ -457,10 +467,10 @@
     fn try_for_each<OP, R>(self, op: OP) -> R
     where
         OP: Fn(Self::Item) -> R + Sync + Send,
-        R: Try<Ok = ()> + Send,
+        R: Try<Output = ()> + Send,
     {
-        fn ok<R: Try<Ok = ()>>(_: (), _: ()) -> R {
-            R::from_ok(())
+        fn ok<R: Try<Output = ()>>(_: (), _: ()) -> R {
+            R::from_output(())
         }
 
         self.map(op).try_reduce(<()>::default, ok)
@@ -497,10 +507,10 @@
     where
         OP: Fn(&mut T, Self::Item) -> R + Sync + Send,
         T: Send + Clone,
-        R: Try<Ok = ()> + Send,
+        R: Try<Output = ()> + Send,
     {
-        fn ok<R: Try<Ok = ()>>(_: (), _: ()) -> R {
-            R::from_ok(())
+        fn ok<R: Try<Output = ()>>(_: (), _: ()) -> R {
+            R::from_output(())
         }
 
         self.map_with(init, op).try_reduce(<()>::default, ok)
@@ -539,10 +549,10 @@
     where
         OP: Fn(&mut T, Self::Item) -> R + Sync + Send,
         INIT: Fn() -> T + Sync + Send,
-        R: Try<Ok = ()> + Send,
+        R: Try<Output = ()> + Send,
     {
-        fn ok<R: Try<Ok = ()>>(_: (), _: ()) -> R {
-            R::from_ok(())
+        fn ok<R: Try<Output = ()>>(_: (), _: ()) -> R {
+            R::from_output(())
         }
 
         self.map_init(init, op).try_reduce(<()>::default, ok)
@@ -921,7 +931,7 @@
 
     /// An adaptor that flattens serial-iterable `Item`s into one large iterator.
     ///
-    /// See also [`flatten`](#method.flatten) and the analagous comparison of
+    /// See also [`flatten`](#method.flatten) and the analogous comparison of
     /// [`flat_map_iter` versus `flat_map`](#flat_map_iter-versus-flat_map).
     ///
     /// # Examples
@@ -1065,7 +1075,7 @@
     where
         OP: Fn(T, T) -> Self::Item + Sync + Send,
         ID: Fn() -> T + Sync + Send,
-        Self::Item: Try<Ok = T>,
+        Self::Item: Try<Output = T>,
     {
         try_reduce::try_reduce(self, identity, op)
     }
@@ -1108,7 +1118,7 @@
     fn try_reduce_with<T, OP>(self, op: OP) -> Option<Self::Item>
     where
         OP: Fn(T, T) -> Self::Item + Sync + Send,
-        Self::Item: Try<Ok = T>,
+        Self::Item: Try<Output = T>,
     {
         try_reduce_with::try_reduce_with(self, op)
     }
@@ -1124,7 +1134,7 @@
     /// multiple sums. The number of results is nondeterministic, as
     /// is the point where the breaks occur.
     ///
-    /// So if did the same parallel fold (`fold(0, |a,b| a+b)`) on
+    /// So if we did the same parallel fold (`fold(0, |a,b| a+b)`) on
     /// our example list, we might wind up with a sequence of two numbers,
     /// like so:
     ///
@@ -1311,7 +1321,7 @@
     where
         F: Fn(T, Self::Item) -> R + Sync + Send,
         ID: Fn() -> T + Sync + Send,
-        R: Try<Ok = T> + Send,
+        R: Try<Output = T> + Send,
     {
         TryFold::new(self, identity, fold_op)
     }
@@ -1337,7 +1347,7 @@
     fn try_fold_with<F, T, R>(self, init: T, fold_op: F) -> TryFoldWith<Self, R, F>
     where
         F: Fn(T, Self::Item) -> R + Sync + Send,
-        R: Try<Ok = T> + Send,
+        R: Try<Output = T> + Send,
         T: Clone + Send,
     {
         TryFoldWith::new(self, init, fold_op)
@@ -2098,7 +2108,7 @@
     /// Note: unlike the standard `Iterator::partition`, this allows distinct
     /// collection types for the left and right items.  This is more flexible,
     /// but may require new type annotations when converting sequential code
-    /// that used type inferrence assuming the two were the same.
+    /// that used type inference assuming the two were the same.
     ///
     /// # Examples
     ///
@@ -2192,6 +2202,143 @@
         Intersperse::new(self, element)
     }
 
+    /// Creates an iterator that yields `n` elements from *anywhere* in the original iterator.
+    ///
+    /// This is similar to [`IndexedParallelIterator::take`] without being
+    /// constrained to the "first" `n` of the original iterator order. The
+    /// taken items will still maintain their relative order where that is
+    /// visible in `collect`, `reduce`, and similar outputs.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    ///
+    /// let result: Vec<_> = (0..100)
+    ///     .into_par_iter()
+    ///     .filter(|&x| x % 2 == 0)
+    ///     .take_any(5)
+    ///     .collect();
+    ///
+    /// assert_eq!(result.len(), 5);
+    /// assert!(result.windows(2).all(|w| w[0] < w[1]));
+    /// ```
+    fn take_any(self, n: usize) -> TakeAny<Self> {
+        TakeAny::new(self, n)
+    }
+
+    /// Creates an iterator that skips `n` elements from *anywhere* in the original iterator.
+    ///
+    /// This is similar to [`IndexedParallelIterator::skip`] without being
+    /// constrained to the "first" `n` of the original iterator order. The
+    /// remaining items will still maintain their relative order where that is
+    /// visible in `collect`, `reduce`, and similar outputs.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    ///
+    /// let result: Vec<_> = (0..100)
+    ///     .into_par_iter()
+    ///     .filter(|&x| x % 2 == 0)
+    ///     .skip_any(5)
+    ///     .collect();
+    ///
+    /// assert_eq!(result.len(), 45);
+    /// assert!(result.windows(2).all(|w| w[0] < w[1]));
+    /// ```
+    fn skip_any(self, n: usize) -> SkipAny<Self> {
+        SkipAny::new(self, n)
+    }
+
+    /// Creates an iterator that takes elements from *anywhere* in the original iterator
+    /// until the given `predicate` returns `false`.
+    ///
+    /// The `predicate` may be anything -- e.g. it could be checking a fact about the item, a
+    /// global condition unrelated to the item itself, or some combination thereof.
+    ///
+    /// If parallel calls to the `predicate` race and give different results, then the
+    /// `true` results will still take those particular items, while respecting the `false`
+    /// result from elsewhere to skip any further items.
+    ///
+    /// This is similar to [`Iterator::take_while`] without being constrained to the original
+    /// iterator order. The taken items will still maintain their relative order where that is
+    /// visible in `collect`, `reduce`, and similar outputs.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    ///
+    /// let result: Vec<_> = (0..100)
+    ///     .into_par_iter()
+    ///     .take_any_while(|x| *x < 50)
+    ///     .collect();
+    ///
+    /// assert!(result.len() <= 50);
+    /// assert!(result.windows(2).all(|w| w[0] < w[1]));
+    /// ```
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    /// use std::sync::atomic::AtomicUsize;
+    /// use std::sync::atomic::Ordering::Relaxed;
+    ///
+    /// // Collect any group of items that sum <= 1000
+    /// let quota = AtomicUsize::new(1000);
+    /// let result: Vec<_> = (0_usize..100)
+    ///     .into_par_iter()
+    ///     .take_any_while(|&x| {
+    ///         quota.fetch_update(Relaxed, Relaxed, |q| q.checked_sub(x))
+    ///             .is_ok()
+    ///     })
+    ///     .collect();
+    ///
+    /// let sum = result.iter().sum::<usize>();
+    /// assert!(matches!(sum, 902..=1000));
+    /// ```
+    fn take_any_while<P>(self, predicate: P) -> TakeAnyWhile<Self, P>
+    where
+        P: Fn(&Self::Item) -> bool + Sync + Send,
+    {
+        TakeAnyWhile::new(self, predicate)
+    }
+
+    /// Creates an iterator that skips elements from *anywhere* in the original iterator
+    /// until the given `predicate` returns `false`.
+    ///
+    /// The `predicate` may be anything -- e.g. it could be checking a fact about the item, a
+    /// global condition unrelated to the item itself, or some combination thereof.
+    ///
+    /// If parallel calls to the `predicate` race and give different results, then the
+    /// `true` results will still skip those particular items, while respecting the `false`
+    /// result from elsewhere to keep any further items.
+    ///
+    /// This is similar to [`Iterator::skip_while`] without being constrained to the original
+    /// iterator order. The remaining items will still maintain their relative order where that is
+    /// visible in `collect`, `reduce`, and similar outputs.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    ///
+    /// let result: Vec<_> = (0..100)
+    ///     .into_par_iter()
+    ///     .skip_any_while(|x| *x < 50)
+    ///     .collect();
+    ///
+    /// assert!(result.len() >= 50);
+    /// assert!(result.windows(2).all(|w| w[0] < w[1]));
+    /// ```
+    fn skip_any_while<P>(self, predicate: P) -> SkipAnyWhile<Self, P>
+    where
+        P: Fn(&Self::Item) -> bool + Sync + Send,
+    {
+        SkipAnyWhile::new(self, predicate)
+    }
+
     /// Internal method used to define the behavior of this parallel
     /// iterator. You should not need to call this directly.
     ///
@@ -2241,6 +2388,8 @@
 /// those points.
 ///
 /// **Note:** Not implemented for `u64`, `i64`, `u128`, or `i128` ranges
+// Waiting for `ExactSizeIterator::is_empty` to be stabilized. See rust-lang/rust#35428
+#[allow(clippy::len_without_is_empty)]
 pub trait IndexedParallelIterator: ParallelIterator {
     /// Collects the results of the iterator into the specified
     /// vector. The vector is always truncated before execution
@@ -2339,13 +2488,18 @@
     /// // we should never get here
     /// assert_eq!(1, zipped.len());
     /// ```
+    #[track_caller]
     fn zip_eq<Z>(self, zip_op: Z) -> ZipEq<Self, Z::Iter>
     where
         Z: IntoParallelIterator,
         Z::Iter: IndexedParallelIterator,
     {
         let zip_op_iter = zip_op.into_par_iter();
-        assert_eq!(self.len(), zip_op_iter.len());
+        assert_eq!(
+            self.len(),
+            zip_op_iter.len(),
+            "iterators must have the same length"
+        );
         ZipEq::new(self, zip_op_iter)
     }
 
@@ -2410,11 +2564,95 @@
     /// let r: Vec<Vec<i32>> = a.into_par_iter().chunks(3).collect();
     /// assert_eq!(r, vec![vec![1,2,3], vec![4,5,6], vec![7,8,9], vec![10]]);
     /// ```
+    #[track_caller]
     fn chunks(self, chunk_size: usize) -> Chunks<Self> {
         assert!(chunk_size != 0, "chunk_size must not be zero");
         Chunks::new(self, chunk_size)
     }
 
+    /// Splits an iterator into fixed-size chunks, performing a sequential [`fold()`] on
+    /// each chunk.
+    ///
+    /// Returns an iterator that produces a folded result for each chunk of items
+    /// produced by this iterator.
+    ///
+    /// This works essentially like:
+    ///
+    /// ```text
+    /// iter.chunks(chunk_size)
+    ///     .map(|chunk|
+    ///         chunk.into_iter()
+    ///             .fold(identity, fold_op)
+    ///     )
+    /// ```
+    ///
+    /// except there is no per-chunk allocation overhead.
+    ///
+    /// [`fold()`]: std::iter::Iterator#method.fold
+    ///
+    /// **Panics** if `chunk_size` is 0.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    /// let nums = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+    /// let chunk_sums = nums.into_par_iter().fold_chunks(2, || 0, |a, n| a + n).collect::<Vec<_>>();
+    /// assert_eq!(chunk_sums, vec![3, 7, 11, 15, 19]);
+    /// ```
+    #[track_caller]
+    fn fold_chunks<T, ID, F>(
+        self,
+        chunk_size: usize,
+        identity: ID,
+        fold_op: F,
+    ) -> FoldChunks<Self, ID, F>
+    where
+        ID: Fn() -> T + Send + Sync,
+        F: Fn(T, Self::Item) -> T + Send + Sync,
+        T: Send,
+    {
+        assert!(chunk_size != 0, "chunk_size must not be zero");
+        FoldChunks::new(self, chunk_size, identity, fold_op)
+    }
+
+    /// Splits an iterator into fixed-size chunks, performing a sequential [`fold()`] on
+    /// each chunk.
+    ///
+    /// Returns an iterator that produces a folded result for each chunk of items
+    /// produced by this iterator.
+    ///
+    /// This works essentially like `fold_chunks(chunk_size, || init.clone(), fold_op)`,
+    /// except it doesn't require the `init` type to be `Sync`, nor any other form of
+    /// added synchronization.
+    ///
+    /// [`fold()`]: std::iter::Iterator#method.fold
+    ///
+    /// **Panics** if `chunk_size` is 0.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    /// let nums = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
+    /// let chunk_sums = nums.into_par_iter().fold_chunks_with(2, 0, |a, n| a + n).collect::<Vec<_>>();
+    /// assert_eq!(chunk_sums, vec![3, 7, 11, 15, 19]);
+    /// ```
+    #[track_caller]
+    fn fold_chunks_with<T, F>(
+        self,
+        chunk_size: usize,
+        init: T,
+        fold_op: F,
+    ) -> FoldChunksWith<Self, T, F>
+    where
+        T: Send + Clone,
+        F: Fn(T, Self::Item) -> T + Send + Sync,
+    {
+        assert!(chunk_size != 0, "chunk_size must not be zero");
+        FoldChunksWith::new(self, chunk_size, init, fold_op)
+    }
+
     /// Lexicographically compares the elements of this `ParallelIterator` with those of
     /// another.
     ///
@@ -2601,11 +2839,6 @@
     ///
     /// assert_eq!(result, [3, 6, 9])
     /// ```
-    ///
-    /// # Compatibility
-    ///
-    /// This method is only available on Rust 1.38 or greater.
-    #[cfg(step_by)]
     fn step_by(self, step: usize) -> StepBy<Self> {
         StepBy::new(self, step)
     }
@@ -2808,7 +3041,7 @@
     }
 
     /// Sets the minimum length of iterators desired to process in each
-    /// thread.  Rayon will not split any smaller than this length, but
+    /// rayon job.  Rayon will not split any smaller than this length, but
     /// of course an iterator could already be smaller to begin with.
     ///
     /// Producers like `zip` and `interleave` will use greater of the two
@@ -2834,7 +3067,7 @@
     }
 
     /// Sets the maximum length of iterators desired to process in each
-    /// thread.  Rayon will try to split at least below this length,
+    /// rayon job.  Rayon will try to split at least below this length,
     /// unless that would put it below the length from `with_min_len()`.
     /// For example, given min=10 and max=15, a length of 16 will not be
     /// split any further.
@@ -3145,50 +3378,154 @@
 /// We hide the `Try` trait in a private module, as it's only meant to be a
 /// stable clone of the standard library's `Try` trait, as yet unstable.
 mod private {
+    use std::convert::Infallible;
+    use std::ops::ControlFlow::{self, Break, Continue};
+    use std::task::Poll;
+
     /// Clone of `std::ops::Try`.
     ///
     /// Implementing this trait is not permitted outside of `rayon`.
     pub trait Try {
         private_decl! {}
 
-        type Ok;
-        type Error;
-        fn into_result(self) -> Result<Self::Ok, Self::Error>;
-        fn from_ok(v: Self::Ok) -> Self;
-        fn from_error(v: Self::Error) -> Self;
+        type Output;
+        type Residual;
+
+        fn from_output(output: Self::Output) -> Self;
+
+        fn from_residual(residual: Self::Residual) -> Self;
+
+        fn branch(self) -> ControlFlow<Self::Residual, Self::Output>;
+    }
+
+    impl<B, C> Try for ControlFlow<B, C> {
+        private_impl! {}
+
+        type Output = C;
+        type Residual = ControlFlow<B, Infallible>;
+
+        fn from_output(output: Self::Output) -> Self {
+            Continue(output)
+        }
+
+        fn from_residual(residual: Self::Residual) -> Self {
+            match residual {
+                Break(b) => Break(b),
+                Continue(_) => unreachable!(),
+            }
+        }
+
+        fn branch(self) -> ControlFlow<Self::Residual, Self::Output> {
+            match self {
+                Continue(c) => Continue(c),
+                Break(b) => Break(Break(b)),
+            }
+        }
     }
 
     impl<T> Try for Option<T> {
         private_impl! {}
 
-        type Ok = T;
-        type Error = ();
+        type Output = T;
+        type Residual = Option<Infallible>;
 
-        fn into_result(self) -> Result<T, ()> {
-            self.ok_or(())
+        fn from_output(output: Self::Output) -> Self {
+            Some(output)
         }
-        fn from_ok(v: T) -> Self {
-            Some(v)
+
+        fn from_residual(residual: Self::Residual) -> Self {
+            match residual {
+                None => None,
+                Some(_) => unreachable!(),
+            }
         }
-        fn from_error(_: ()) -> Self {
-            None
+
+        fn branch(self) -> ControlFlow<Self::Residual, Self::Output> {
+            match self {
+                Some(c) => Continue(c),
+                None => Break(None),
+            }
         }
     }
 
     impl<T, E> Try for Result<T, E> {
         private_impl! {}
 
-        type Ok = T;
-        type Error = E;
+        type Output = T;
+        type Residual = Result<Infallible, E>;
 
-        fn into_result(self) -> Result<T, E> {
-            self
+        fn from_output(output: Self::Output) -> Self {
+            Ok(output)
         }
-        fn from_ok(v: T) -> Self {
-            Ok(v)
+
+        fn from_residual(residual: Self::Residual) -> Self {
+            match residual {
+                Err(e) => Err(e),
+                Ok(_) => unreachable!(),
+            }
         }
-        fn from_error(v: E) -> Self {
-            Err(v)
+
+        fn branch(self) -> ControlFlow<Self::Residual, Self::Output> {
+            match self {
+                Ok(c) => Continue(c),
+                Err(e) => Break(Err(e)),
+            }
+        }
+    }
+
+    impl<T, E> Try for Poll<Result<T, E>> {
+        private_impl! {}
+
+        type Output = Poll<T>;
+        type Residual = Result<Infallible, E>;
+
+        fn from_output(output: Self::Output) -> Self {
+            output.map(Ok)
+        }
+
+        fn from_residual(residual: Self::Residual) -> Self {
+            match residual {
+                Err(e) => Poll::Ready(Err(e)),
+                Ok(_) => unreachable!(),
+            }
+        }
+
+        fn branch(self) -> ControlFlow<Self::Residual, Self::Output> {
+            match self {
+                Poll::Pending => Continue(Poll::Pending),
+                Poll::Ready(Ok(c)) => Continue(Poll::Ready(c)),
+                Poll::Ready(Err(e)) => Break(Err(e)),
+            }
+        }
+    }
+
+    impl<T, E> Try for Poll<Option<Result<T, E>>> {
+        private_impl! {}
+
+        type Output = Poll<Option<T>>;
+        type Residual = Result<Infallible, E>;
+
+        fn from_output(output: Self::Output) -> Self {
+            match output {
+                Poll::Ready(o) => Poll::Ready(o.map(Ok)),
+                Poll::Pending => Poll::Pending,
+            }
+        }
+
+        fn from_residual(residual: Self::Residual) -> Self {
+            match residual {
+                Err(e) => Poll::Ready(Some(Err(e))),
+                Ok(_) => unreachable!(),
+            }
+        }
+
+        fn branch(self) -> ControlFlow<Self::Residual, Self::Output> {
+            match self {
+                Poll::Pending => Continue(Poll::Pending),
+                Poll::Ready(None) => Continue(Poll::Ready(None)),
+                Poll::Ready(Some(Ok(c))) => Continue(Poll::Ready(Some(c))),
+                Poll::Ready(Some(Err(e))) => Break(Err(e)),
+            }
         }
     }
 }
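The private `Try` rework above only changes plumbing; at the API surface it is exercised through the `try_*` methods. A minimal sketch (my own example, assuming rayon 1.7.0) of the short-circuiting behavior:

```rust
use rayon::prelude::*;

fn main() {
    // try_for_each short-circuits on the first Err it observes (Try::branch -> Break).
    let res: Result<(), String> = (1..=100)
        .into_par_iter()
        .try_for_each(|x| if x < 50 { Ok(()) } else { Err(format!("hit {}", x)) });
    assert!(res.is_err());

    // try_reduce combines partial results, stopping early if any step yields None.
    let sum = (0u32..1_000)
        .into_par_iter()
        .map(Some)
        .try_reduce(|| 0, |a, b| a.checked_add(b));
    assert_eq!(sum, Some(499_500));
}
```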
diff --git a/src/iter/par_bridge.rs b/src/iter/par_bridge.rs
index 339ac1a..8398274 100644
--- a/src/iter/par_bridge.rs
+++ b/src/iter/par_bridge.rs
@@ -1,12 +1,9 @@
-use crossbeam_deque::{Steal, Stealer, Worker};
-
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
-use std::sync::{Mutex, TryLockError};
-use std::thread::yield_now;
+use std::sync::Mutex;
 
-use crate::current_num_threads;
 use crate::iter::plumbing::{bridge_unindexed, Folder, UnindexedConsumer, UnindexedProducer};
 use crate::iter::ParallelIterator;
+use crate::{current_num_threads, current_thread_index};
 
 /// Conversion trait to convert an `Iterator` to a `ParallelIterator`.
 ///
@@ -78,71 +75,46 @@
     where
         C: UnindexedConsumer<Self::Item>,
     {
-        let split_count = AtomicUsize::new(current_num_threads());
-        let worker = Worker::new_fifo();
-        let stealer = worker.stealer();
-        let done = AtomicBool::new(false);
-        let iter = Mutex::new((self.iter, worker));
+        let num_threads = current_num_threads();
+        let threads_started: Vec<_> = (0..num_threads).map(|_| AtomicBool::new(false)).collect();
 
         bridge_unindexed(
-            IterParallelProducer {
-                split_count: &split_count,
-                done: &done,
-                iter: &iter,
-                items: stealer,
+            &IterParallelProducer {
+                split_count: AtomicUsize::new(num_threads),
+                iter: Mutex::new(self.iter.fuse()),
+                threads_started: &threads_started,
             },
             consumer,
         )
     }
 }
 
-struct IterParallelProducer<'a, Iter: Iterator> {
-    split_count: &'a AtomicUsize,
-    done: &'a AtomicBool,
-    iter: &'a Mutex<(Iter, Worker<Iter::Item>)>,
-    items: Stealer<Iter::Item>,
+struct IterParallelProducer<'a, Iter> {
+    split_count: AtomicUsize,
+    iter: Mutex<std::iter::Fuse<Iter>>,
+    threads_started: &'a [AtomicBool],
 }
 
-// manual clone because T doesn't need to be Clone, but the derive assumes it should be
-impl<'a, Iter: Iterator + 'a> Clone for IterParallelProducer<'a, Iter> {
-    fn clone(&self) -> Self {
-        IterParallelProducer {
-            split_count: self.split_count,
-            done: self.done,
-            iter: self.iter,
-            items: self.items.clone(),
-        }
-    }
-}
-
-impl<'a, Iter: Iterator + Send + 'a> UnindexedProducer for IterParallelProducer<'a, Iter>
-where
-    Iter::Item: Send,
-{
+impl<Iter: Iterator + Send> UnindexedProducer for &IterParallelProducer<'_, Iter> {
     type Item = Iter::Item;
 
     fn split(self) -> (Self, Option<Self>) {
         let mut count = self.split_count.load(Ordering::SeqCst);
 
         loop {
-            // Check if the iterator is exhausted *and* we've consumed every item from it.
-            let done = self.done.load(Ordering::SeqCst) && self.items.is_empty();
-
-            match count.checked_sub(1) {
-                Some(new_count) if !done => {
-                    match self.split_count.compare_exchange_weak(
-                        count,
-                        new_count,
-                        Ordering::SeqCst,
-                        Ordering::SeqCst,
-                    ) {
-                        Ok(_) => return (self.clone(), Some(self)),
-                        Err(last_count) => count = last_count,
-                    }
+            // Check whether we still have splits left in our budget
+            if let Some(new_count) = count.checked_sub(1) {
+                match self.split_count.compare_exchange_weak(
+                    count,
+                    new_count,
+                    Ordering::SeqCst,
+                    Ordering::SeqCst,
+                ) {
+                    Ok(_) => return (self, Some(self)),
+                    Err(last_count) => count = last_count,
                 }
-                _ => {
-                    return (self, None);
-                }
+            } else {
+                return (self, None);
             }
         }
     }
@@ -151,66 +123,39 @@
     where
         F: Folder<Self::Item>,
     {
+        // Guard against work-stealing-induced recursion, in case `Iter::next()`
+        // calls rayon internally, so we don't deadlock our mutex. We might also
+        // be recursing via `folder` methods, which doesn't present a mutex hazard,
+        // but it's lower overhead for us to just check this once, rather than
+        // updating additional shared state on every mutex lock/unlock.
+        // (If this isn't a rayon thread, then there's no work-stealing anyway...)
+        if let Some(i) = current_thread_index() {
+            // Note: If the number of threads in the pool ever grows dynamically, then
+            // we'll end up sharing flags and may falsely detect recursion -- that's
+            // still fine for overall correctness, just not optimal for parallelism.
+            let thread_started = &self.threads_started[i % self.threads_started.len()];
+            if thread_started.swap(true, Ordering::Relaxed) {
+                // We can't make progress with a nested mutex, so just return and let
+                // the outermost loop continue with the rest of the iterator items.
+                return folder;
+            }
+        }
+
         loop {
-            match self.items.steal() {
-                Steal::Success(it) => {
+            if let Ok(mut iter) = self.iter.lock() {
+                if let Some(it) = iter.next() {
+                    drop(iter);
                     folder = folder.consume(it);
                     if folder.full() {
                         return folder;
                     }
+                } else {
+                    return folder;
                 }
-                Steal::Empty => {
-                    // Don't storm the mutex if we're already done.
-                    if self.done.load(Ordering::SeqCst) {
-                        // Someone might have pushed more between our `steal()` and `done.load()`
-                        if self.items.is_empty() {
-                            // The iterator is out of items, no use in continuing
-                            return folder;
-                        }
-                    } else {
-                        // our cache is out of items, time to load more from the iterator
-                        match self.iter.try_lock() {
-                            Ok(mut guard) => {
-                                // Check `done` again in case we raced with the previous lock
-                                // holder on its way out.
-                                if self.done.load(Ordering::SeqCst) {
-                                    if self.items.is_empty() {
-                                        return folder;
-                                    }
-                                    continue;
-                                }
-
-                                let count = current_num_threads();
-                                let count = (count * count) * 2;
-
-                                let (ref mut iter, ref worker) = *guard;
-
-                                // while worker.len() < count {
-                                // FIXME the new deque doesn't let us count items.  We can just
-                                // push a number of items, but that doesn't consider active
-                                // stealers elsewhere.
-                                for _ in 0..count {
-                                    if let Some(it) = iter.next() {
-                                        worker.push(it);
-                                    } else {
-                                        self.done.store(true, Ordering::SeqCst);
-                                        break;
-                                    }
-                                }
-                            }
-                            Err(TryLockError::WouldBlock) => {
-                                // someone else has the mutex, just sit tight until it's ready
-                                yield_now(); //TODO: use a thread-pool-aware yield? (#548)
-                            }
-                            Err(TryLockError::Poisoned(_)) => {
-                                // any panics from other threads will have been caught by the pool,
-                                // and will be re-thrown when joined - just exit
-                                return folder;
-                            }
-                        }
-                    }
-                }
-                Steal::Retry => (),
+            } else {
+                // any panics from other threads will have been caught by the pool,
+                // and will be re-thrown when joined - just exit
+                return folder;
             }
         }
     }
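
Note: the rewritten `fold_with` above is the core of the new mutex-based `par_bridge()`,
which replaces the crossbeam-deque staging used in 1.5.1. A minimal usage sketch, not taken
from this patch (the range and closures are illustrative only):

    use rayon::prelude::*;

    fn main() {
        // A plain serial iterator is bridged into rayon; each worker thread
        // pulls the next item under the mutex shown in the hunk above.
        let sum: u64 = (1..=10_000u64)
            .filter(|n| n % 3 == 0)   // ordinary std iterator adaptor
            .par_bridge()             // hand off to the thread pool
            .map(|n| n * n)
            .sum();
        println!("{sum}");
    }
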
diff --git a/src/iter/plumbing/README.md b/src/iter/plumbing/README.md
index cd94eae..42d22ef 100644
--- a/src/iter/plumbing/README.md
+++ b/src/iter/plumbing/README.md
@@ -35,8 +35,8 @@
   more like a `for_each` call: each time a new item is produced, the
   `consume` method is called with that item. (The traits themselves are
   a bit more complex, as they support state that can be threaded
-  through and ultimately reduced.) Unlike producers, there are two
-  variants of consumers. The difference is how the split is performed:
+  through and ultimately reduced.) Like producers, there are two
+  variants of consumers which differ in how the split is performed:
   - in the `Consumer` trait, splitting is done with `split_at`, which
     accepts an index where the split should be performed. All
     iterators can work in this mode. The resulting halves thus have an
@@ -124,7 +124,7 @@
 
 The `bridge` function will then connect the consumer, which is
 handling the `flat_map` and `for_each`, with the producer, which is
-handling the `zip` and its preecessors. It will split down until the
+handling the `zip` and its predecessors. It will split down until the
 chunks seem reasonably small, then pull items from the producer and
 feed them to the consumer.
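
As a reading aid for the two splitting styles described above, here is a simplified,
self-contained sketch; the real traits live in `rayon::iter::plumbing` and carry more
associated types (`Folder`, `Result`), so the `Mini*` names below are stand-ins, not
rayon's actual definitions:

    #![allow(dead_code)]

    // Indexed splitting: the caller chooses the exact element index.
    trait MiniConsumer<T>: Sized {
        type Reducer;
        fn split_at(self, index: usize) -> (Self, Self, Self::Reducer);
    }

    // Unindexed splitting: the consumer hands back a "left half" without
    // committing to a particular index.
    trait MiniUnindexedConsumer<T>: MiniConsumer<T> {
        fn split_off_left(&self) -> Self;
        fn to_reducer(&self) -> Self::Reducer;
    }

    fn main() {}
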
 
diff --git a/src/iter/skip.rs b/src/iter/skip.rs
index 9983f16..2d0f947 100644
--- a/src/iter/skip.rs
+++ b/src/iter/skip.rs
@@ -79,9 +79,16 @@
             where
                 P: Producer<Item = T>,
             {
-                let (before_skip, after_skip) = base.split_at(self.n);
-                bridge_producer_consumer(self.n, before_skip, NoopConsumer);
-                self.callback.callback(after_skip)
+                crate::in_place_scope(|scope| {
+                    let Self { callback, n } = self;
+                    let (before_skip, after_skip) = base.split_at(n);
+
+                    // Run the skipped part separately for side effects.
+                    // We'll still get any panics propagated back by the scope.
+                    scope.spawn(move |_| bridge_producer_consumer(n, before_skip, NoopConsumer));
+
+                    callback.callback(after_skip)
+                })
             }
         }
     }
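
The `skip.rs` change keeps driving the skipped prefix (for its side effects, as the new
comment notes) but now spawns that work into an `in_place_scope` so it can overlap with
the rest of the iterator. A small sketch of the observable behavior, with an assumed
`inspect` counter standing in for a side effect:

    use rayon::prelude::*;
    use std::sync::atomic::{AtomicUsize, Ordering};

    fn main() {
        let seen = AtomicUsize::new(0);
        let kept: Vec<i32> = (0..100)
            .into_par_iter()
            .inspect(|_| { seen.fetch_add(1, Ordering::Relaxed); })
            .skip(10)
            .collect();
        // The first 10 items are dropped from the output, but they are still
        // produced, so the side effect fires for all 100 items.
        assert_eq!(kept.len(), 90);
        assert_eq!(seen.load(Ordering::Relaxed), 100);
    }
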
diff --git a/src/iter/skip_any.rs b/src/iter/skip_any.rs
new file mode 100644
index 0000000..0660a56
--- /dev/null
+++ b/src/iter/skip_any.rs
@@ -0,0 +1,144 @@
+use super::plumbing::*;
+use super::*;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+/// `SkipAny` is an iterator that skips over `n` elements from anywhere in `I`.
+/// This struct is created by the [`skip_any()`] method on [`ParallelIterator`].
+///
+/// [`skip_any()`]: trait.ParallelIterator.html#method.skip_any
+/// [`ParallelIterator`]: trait.ParallelIterator.html
+#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
+#[derive(Clone, Debug)]
+pub struct SkipAny<I: ParallelIterator> {
+    base: I,
+    count: usize,
+}
+
+impl<I> SkipAny<I>
+where
+    I: ParallelIterator,
+{
+    /// Creates a new `SkipAny` iterator.
+    pub(super) fn new(base: I, count: usize) -> Self {
+        SkipAny { base, count }
+    }
+}
+
+impl<I> ParallelIterator for SkipAny<I>
+where
+    I: ParallelIterator,
+{
+    type Item = I::Item;
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        let consumer1 = SkipAnyConsumer {
+            base: consumer,
+            count: &AtomicUsize::new(self.count),
+        };
+        self.base.drive_unindexed(consumer1)
+    }
+}
+
+/// ////////////////////////////////////////////////////////////////////////
+/// Consumer implementation
+
+struct SkipAnyConsumer<'f, C> {
+    base: C,
+    count: &'f AtomicUsize,
+}
+
+impl<'f, T, C> Consumer<T> for SkipAnyConsumer<'f, C>
+where
+    C: Consumer<T>,
+    T: Send,
+{
+    type Folder = SkipAnyFolder<'f, C::Folder>;
+    type Reducer = C::Reducer;
+    type Result = C::Result;
+
+    fn split_at(self, index: usize) -> (Self, Self, Self::Reducer) {
+        let (left, right, reducer) = self.base.split_at(index);
+        (
+            SkipAnyConsumer { base: left, ..self },
+            SkipAnyConsumer {
+                base: right,
+                ..self
+            },
+            reducer,
+        )
+    }
+
+    fn into_folder(self) -> Self::Folder {
+        SkipAnyFolder {
+            base: self.base.into_folder(),
+            count: self.count,
+        }
+    }
+
+    fn full(&self) -> bool {
+        self.base.full()
+    }
+}
+
+impl<'f, T, C> UnindexedConsumer<T> for SkipAnyConsumer<'f, C>
+where
+    C: UnindexedConsumer<T>,
+    T: Send,
+{
+    fn split_off_left(&self) -> Self {
+        SkipAnyConsumer {
+            base: self.base.split_off_left(),
+            ..*self
+        }
+    }
+
+    fn to_reducer(&self) -> Self::Reducer {
+        self.base.to_reducer()
+    }
+}
+
+struct SkipAnyFolder<'f, C> {
+    base: C,
+    count: &'f AtomicUsize,
+}
+
+fn checked_decrement(u: &AtomicUsize) -> bool {
+    u.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |u| u.checked_sub(1))
+        .is_ok()
+}
+
+impl<'f, T, C> Folder<T> for SkipAnyFolder<'f, C>
+where
+    C: Folder<T>,
+{
+    type Result = C::Result;
+
+    fn consume(mut self, item: T) -> Self {
+        if !checked_decrement(self.count) {
+            self.base = self.base.consume(item);
+        }
+        self
+    }
+
+    fn consume_iter<I>(mut self, iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        self.base = self.base.consume_iter(
+            iter.into_iter()
+                .skip_while(move |_| checked_decrement(self.count)),
+        );
+        self
+    }
+
+    fn complete(self) -> C::Result {
+        self.base.complete()
+    }
+
+    fn full(&self) -> bool {
+        self.base.full()
+    }
+}
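
`skip_any()` is one of the adaptors introduced in this release. Unlike `skip()`, it does
not promise to drop a prefix, only that up to `n` items are dropped from somewhere in the
iterator. A minimal sketch (the values are illustrative):

    use rayon::prelude::*;

    fn main() {
        let kept: Vec<i32> = (0..100).into_par_iter().skip_any(10).collect();
        // Exactly 10 items are skipped here (the source has more than 10),
        // but which 10 depends on thread timing; only the count is stable.
        assert_eq!(kept.len(), 90);
    }
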
diff --git a/src/iter/skip_any_while.rs b/src/iter/skip_any_while.rs
new file mode 100644
index 0000000..28b9e59
--- /dev/null
+++ b/src/iter/skip_any_while.rs
@@ -0,0 +1,166 @@
+use super::plumbing::*;
+use super::*;
+use std::fmt;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+/// `SkipAnyWhile` is an iterator that skips over elements from anywhere in `I`
+/// until the callback returns `false`.
+/// This struct is created by the [`skip_any_while()`] method on [`ParallelIterator`].
+///
+/// [`skip_any_while()`]: trait.ParallelIterator.html#method.skip_any_while
+/// [`ParallelIterator`]: trait.ParallelIterator.html
+#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
+#[derive(Clone)]
+pub struct SkipAnyWhile<I: ParallelIterator, P> {
+    base: I,
+    predicate: P,
+}
+
+impl<I: ParallelIterator + fmt::Debug, P> fmt::Debug for SkipAnyWhile<I, P> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("SkipAnyWhile")
+            .field("base", &self.base)
+            .finish()
+    }
+}
+
+impl<I, P> SkipAnyWhile<I, P>
+where
+    I: ParallelIterator,
+{
+    /// Creates a new `SkipAnyWhile` iterator.
+    pub(super) fn new(base: I, predicate: P) -> Self {
+        SkipAnyWhile { base, predicate }
+    }
+}
+
+impl<I, P> ParallelIterator for SkipAnyWhile<I, P>
+where
+    I: ParallelIterator,
+    P: Fn(&I::Item) -> bool + Sync + Send,
+{
+    type Item = I::Item;
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        let consumer1 = SkipAnyWhileConsumer {
+            base: consumer,
+            predicate: &self.predicate,
+            skipping: &AtomicBool::new(true),
+        };
+        self.base.drive_unindexed(consumer1)
+    }
+}
+
+/// ////////////////////////////////////////////////////////////////////////
+/// Consumer implementation
+
+struct SkipAnyWhileConsumer<'p, C, P> {
+    base: C,
+    predicate: &'p P,
+    skipping: &'p AtomicBool,
+}
+
+impl<'p, T, C, P> Consumer<T> for SkipAnyWhileConsumer<'p, C, P>
+where
+    C: Consumer<T>,
+    P: Fn(&T) -> bool + Sync,
+{
+    type Folder = SkipAnyWhileFolder<'p, C::Folder, P>;
+    type Reducer = C::Reducer;
+    type Result = C::Result;
+
+    fn split_at(self, index: usize) -> (Self, Self, Self::Reducer) {
+        let (left, right, reducer) = self.base.split_at(index);
+        (
+            SkipAnyWhileConsumer { base: left, ..self },
+            SkipAnyWhileConsumer {
+                base: right,
+                ..self
+            },
+            reducer,
+        )
+    }
+
+    fn into_folder(self) -> Self::Folder {
+        SkipAnyWhileFolder {
+            base: self.base.into_folder(),
+            predicate: self.predicate,
+            skipping: self.skipping,
+        }
+    }
+
+    fn full(&self) -> bool {
+        self.base.full()
+    }
+}
+
+impl<'p, T, C, P> UnindexedConsumer<T> for SkipAnyWhileConsumer<'p, C, P>
+where
+    C: UnindexedConsumer<T>,
+    P: Fn(&T) -> bool + Sync,
+{
+    fn split_off_left(&self) -> Self {
+        SkipAnyWhileConsumer {
+            base: self.base.split_off_left(),
+            ..*self
+        }
+    }
+
+    fn to_reducer(&self) -> Self::Reducer {
+        self.base.to_reducer()
+    }
+}
+
+struct SkipAnyWhileFolder<'p, C, P> {
+    base: C,
+    predicate: &'p P,
+    skipping: &'p AtomicBool,
+}
+
+fn skip<T>(item: &T, skipping: &AtomicBool, predicate: &impl Fn(&T) -> bool) -> bool {
+    if !skipping.load(Ordering::Relaxed) {
+        return false;
+    }
+    if predicate(item) {
+        return true;
+    }
+    skipping.store(false, Ordering::Relaxed);
+    false
+}
+
+impl<'p, T, C, P> Folder<T> for SkipAnyWhileFolder<'p, C, P>
+where
+    C: Folder<T>,
+    P: Fn(&T) -> bool + 'p,
+{
+    type Result = C::Result;
+
+    fn consume(mut self, item: T) -> Self {
+        if !skip(&item, self.skipping, self.predicate) {
+            self.base = self.base.consume(item);
+        }
+        self
+    }
+
+    fn consume_iter<I>(mut self, iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        self.base = self.base.consume_iter(
+            iter.into_iter()
+                .skip_while(move |x| skip(x, self.skipping, self.predicate)),
+        );
+        self
+    }
+
+    fn complete(self) -> C::Result {
+        self.base.complete()
+    }
+
+    fn full(&self) -> bool {
+        self.base.full()
+    }
+}
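
`skip_any_while()` uses the shared-flag design shown above: items are skipped only while
the `skipping` flag is still set, so the skipped region is not a deterministic prefix. A
hedged sketch of what is and is not guaranteed:

    use rayon::prelude::*;

    fn main() {
        let kept: Vec<i32> = (0..1000)
            .into_par_iter()
            .skip_any_while(|&x| x < 50)
            .collect();
        // Items failing the predicate (>= 50) are always kept; some items
        // below 50 may also be kept if they are seen after another thread
        // has already cleared the skipping flag.
        assert!(kept.len() >= 950);
    }
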
diff --git a/src/iter/step_by.rs b/src/iter/step_by.rs
index 2002f42..94b8334 100644
--- a/src/iter/step_by.rs
+++ b/src/iter/step_by.rs
@@ -1,4 +1,3 @@
-#![cfg(step_by)]
 use std::cmp::min;
 
 use super::plumbing::*;
diff --git a/src/iter/take_any.rs b/src/iter/take_any.rs
new file mode 100644
index 0000000..e3992b3
--- /dev/null
+++ b/src/iter/take_any.rs
@@ -0,0 +1,144 @@
+use super::plumbing::*;
+use super::*;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+/// `TakeAny` is an iterator that iterates over `n` elements from anywhere in `I`.
+/// This struct is created by the [`take_any()`] method on [`ParallelIterator`].
+///
+/// [`take_any()`]: trait.ParallelIterator.html#method.take_any
+/// [`ParallelIterator`]: trait.ParallelIterator.html
+#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
+#[derive(Clone, Debug)]
+pub struct TakeAny<I: ParallelIterator> {
+    base: I,
+    count: usize,
+}
+
+impl<I> TakeAny<I>
+where
+    I: ParallelIterator,
+{
+    /// Creates a new `TakeAny` iterator.
+    pub(super) fn new(base: I, count: usize) -> Self {
+        TakeAny { base, count }
+    }
+}
+
+impl<I> ParallelIterator for TakeAny<I>
+where
+    I: ParallelIterator,
+{
+    type Item = I::Item;
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        let consumer1 = TakeAnyConsumer {
+            base: consumer,
+            count: &AtomicUsize::new(self.count),
+        };
+        self.base.drive_unindexed(consumer1)
+    }
+}
+
+/// ////////////////////////////////////////////////////////////////////////
+/// Consumer implementation
+
+struct TakeAnyConsumer<'f, C> {
+    base: C,
+    count: &'f AtomicUsize,
+}
+
+impl<'f, T, C> Consumer<T> for TakeAnyConsumer<'f, C>
+where
+    C: Consumer<T>,
+    T: Send,
+{
+    type Folder = TakeAnyFolder<'f, C::Folder>;
+    type Reducer = C::Reducer;
+    type Result = C::Result;
+
+    fn split_at(self, index: usize) -> (Self, Self, Self::Reducer) {
+        let (left, right, reducer) = self.base.split_at(index);
+        (
+            TakeAnyConsumer { base: left, ..self },
+            TakeAnyConsumer {
+                base: right,
+                ..self
+            },
+            reducer,
+        )
+    }
+
+    fn into_folder(self) -> Self::Folder {
+        TakeAnyFolder {
+            base: self.base.into_folder(),
+            count: self.count,
+        }
+    }
+
+    fn full(&self) -> bool {
+        self.count.load(Ordering::Relaxed) == 0 || self.base.full()
+    }
+}
+
+impl<'f, T, C> UnindexedConsumer<T> for TakeAnyConsumer<'f, C>
+where
+    C: UnindexedConsumer<T>,
+    T: Send,
+{
+    fn split_off_left(&self) -> Self {
+        TakeAnyConsumer {
+            base: self.base.split_off_left(),
+            ..*self
+        }
+    }
+
+    fn to_reducer(&self) -> Self::Reducer {
+        self.base.to_reducer()
+    }
+}
+
+struct TakeAnyFolder<'f, C> {
+    base: C,
+    count: &'f AtomicUsize,
+}
+
+fn checked_decrement(u: &AtomicUsize) -> bool {
+    u.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |u| u.checked_sub(1))
+        .is_ok()
+}
+
+impl<'f, T, C> Folder<T> for TakeAnyFolder<'f, C>
+where
+    C: Folder<T>,
+{
+    type Result = C::Result;
+
+    fn consume(mut self, item: T) -> Self {
+        if checked_decrement(self.count) {
+            self.base = self.base.consume(item);
+        }
+        self
+    }
+
+    fn consume_iter<I>(mut self, iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        self.base = self.base.consume_iter(
+            iter.into_iter()
+                .take_while(move |_| checked_decrement(self.count)),
+        );
+        self
+    }
+
+    fn complete(self) -> C::Result {
+        self.base.complete()
+    }
+
+    fn full(&self) -> bool {
+        self.count.load(Ordering::Relaxed) == 0 || self.base.full()
+    }
+}
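
`take_any(n)` is the counterpart of `skip_any(n)`: the shared `AtomicUsize` above caps the
total number of consumed items across all threads. A minimal sketch (values illustrative):

    use rayon::prelude::*;

    fn main() {
        let some: Vec<i32> = (0..10_000).into_par_iter().take_any(5).collect();
        // Exactly 5 items come back (the source has at least 5), but which 5
        // is up to the scheduler.
        assert_eq!(some.len(), 5);
    }
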
diff --git a/src/iter/take_any_while.rs b/src/iter/take_any_while.rs
new file mode 100644
index 0000000..e6a91af
--- /dev/null
+++ b/src/iter/take_any_while.rs
@@ -0,0 +1,166 @@
+use super::plumbing::*;
+use super::*;
+use std::fmt;
+use std::sync::atomic::{AtomicBool, Ordering};
+
+/// `TakeAnyWhile` is an iterator that iterates over elements from anywhere in `I`
+/// until the callback returns `false`.
+/// This struct is created by the [`take_any_while()`] method on [`ParallelIterator`].
+///
+/// [`take_any_while()`]: trait.ParallelIterator.html#method.take_any_while
+/// [`ParallelIterator`]: trait.ParallelIterator.html
+#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
+#[derive(Clone)]
+pub struct TakeAnyWhile<I: ParallelIterator, P> {
+    base: I,
+    predicate: P,
+}
+
+impl<I: ParallelIterator + fmt::Debug, P> fmt::Debug for TakeAnyWhile<I, P> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("TakeAnyWhile")
+            .field("base", &self.base)
+            .finish()
+    }
+}
+
+impl<I, P> TakeAnyWhile<I, P>
+where
+    I: ParallelIterator,
+{
+    /// Creates a new `TakeAnyWhile` iterator.
+    pub(super) fn new(base: I, predicate: P) -> Self {
+        TakeAnyWhile { base, predicate }
+    }
+}
+
+impl<I, P> ParallelIterator for TakeAnyWhile<I, P>
+where
+    I: ParallelIterator,
+    P: Fn(&I::Item) -> bool + Sync + Send,
+{
+    type Item = I::Item;
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        let consumer1 = TakeAnyWhileConsumer {
+            base: consumer,
+            predicate: &self.predicate,
+            taking: &AtomicBool::new(true),
+        };
+        self.base.drive_unindexed(consumer1)
+    }
+}
+
+/// ////////////////////////////////////////////////////////////////////////
+/// Consumer implementation
+
+struct TakeAnyWhileConsumer<'p, C, P> {
+    base: C,
+    predicate: &'p P,
+    taking: &'p AtomicBool,
+}
+
+impl<'p, T, C, P> Consumer<T> for TakeAnyWhileConsumer<'p, C, P>
+where
+    C: Consumer<T>,
+    P: Fn(&T) -> bool + Sync,
+{
+    type Folder = TakeAnyWhileFolder<'p, C::Folder, P>;
+    type Reducer = C::Reducer;
+    type Result = C::Result;
+
+    fn split_at(self, index: usize) -> (Self, Self, Self::Reducer) {
+        let (left, right, reducer) = self.base.split_at(index);
+        (
+            TakeAnyWhileConsumer { base: left, ..self },
+            TakeAnyWhileConsumer {
+                base: right,
+                ..self
+            },
+            reducer,
+        )
+    }
+
+    fn into_folder(self) -> Self::Folder {
+        TakeAnyWhileFolder {
+            base: self.base.into_folder(),
+            predicate: self.predicate,
+            taking: self.taking,
+        }
+    }
+
+    fn full(&self) -> bool {
+        !self.taking.load(Ordering::Relaxed) || self.base.full()
+    }
+}
+
+impl<'p, T, C, P> UnindexedConsumer<T> for TakeAnyWhileConsumer<'p, C, P>
+where
+    C: UnindexedConsumer<T>,
+    P: Fn(&T) -> bool + Sync,
+{
+    fn split_off_left(&self) -> Self {
+        TakeAnyWhileConsumer {
+            base: self.base.split_off_left(),
+            ..*self
+        }
+    }
+
+    fn to_reducer(&self) -> Self::Reducer {
+        self.base.to_reducer()
+    }
+}
+
+struct TakeAnyWhileFolder<'p, C, P> {
+    base: C,
+    predicate: &'p P,
+    taking: &'p AtomicBool,
+}
+
+fn take<T>(item: &T, taking: &AtomicBool, predicate: &impl Fn(&T) -> bool) -> bool {
+    if !taking.load(Ordering::Relaxed) {
+        return false;
+    }
+    if predicate(item) {
+        return true;
+    }
+    taking.store(false, Ordering::Relaxed);
+    false
+}
+
+impl<'p, T, C, P> Folder<T> for TakeAnyWhileFolder<'p, C, P>
+where
+    C: Folder<T>,
+    P: Fn(&T) -> bool + 'p,
+{
+    type Result = C::Result;
+
+    fn consume(mut self, item: T) -> Self {
+        if take(&item, self.taking, self.predicate) {
+            self.base = self.base.consume(item);
+        }
+        self
+    }
+
+    fn consume_iter<I>(mut self, iter: I) -> Self
+    where
+        I: IntoIterator<Item = T>,
+    {
+        self.base = self.base.consume_iter(
+            iter.into_iter()
+                .take_while(move |x| take(x, self.taking, self.predicate)),
+        );
+        self
+    }
+
+    fn complete(self) -> C::Result {
+        self.base.complete()
+    }
+
+    fn full(&self) -> bool {
+        !self.taking.load(Ordering::Relaxed) || self.base.full()
+    }
+}
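
`take_any_while()` mirrors `skip_any_while()`: every yielded item satisfies the predicate,
but the iterator may stop early once any thread observes a failing item and clears the
shared `taking` flag. Sketch:

    use rayon::prelude::*;

    fn main() {
        let small: Vec<i32> = (0..1000)
            .into_par_iter()
            .take_any_while(|&x| x < 50)
            .collect();
        assert!(small.iter().all(|&x| x < 50)); // guaranteed
        assert!(small.len() <= 50);             // may be fewer than 50
    }
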
diff --git a/src/iter/test.rs b/src/iter/test.rs
index bc5106b..c72068d 100644
--- a/src/iter/test.rs
+++ b/src/iter/test.rs
@@ -117,13 +117,10 @@
     let r1 = (0_i32..32)
         .into_par_iter()
         .with_max_len(1)
-        .fold(
-            || vec![],
-            |mut v, e| {
-                v.push(e);
-                v
-            },
-        )
+        .fold(Vec::new, |mut v, e| {
+            v.push(e);
+            v
+        })
         .map(|v| vec![v])
         .reduce_with(|mut v_a, v_b| {
             v_a.extend(v_b);
@@ -394,7 +391,7 @@
 #[test]
 fn check_vec_indexed() {
     let a = vec![1, 2, 3];
-    is_indexed(a.clone().into_par_iter());
+    is_indexed(a.into_par_iter());
 }
 
 #[test]
@@ -471,6 +468,7 @@
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn check_cmp_short_circuit() {
     // We only use a single thread in order to make the short-circuit behavior deterministic.
     let pool = ThreadPoolBuilder::new().num_threads(1).build().unwrap();
@@ -500,6 +498,7 @@
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn check_partial_cmp_short_circuit() {
     // We only use a single thread to make the short-circuit behavior deterministic.
     let pool = ThreadPoolBuilder::new().num_threads(1).build().unwrap();
@@ -529,6 +528,7 @@
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn check_partial_cmp_nan_short_circuit() {
     // We only use a single thread to make the short-circuit behavior deterministic.
     let pool = ThreadPoolBuilder::new().num_threads(1).build().unwrap();
@@ -1371,10 +1371,10 @@
     let counter = AtomicUsize::new(0);
     let value: Option<i32> = (0_i32..2048).into_par_iter().find_any(|&p| {
         counter.fetch_add(1, Ordering::SeqCst);
-        p >= 1024 && p < 1096
+        (1024..1096).contains(&p)
     });
     let q = value.unwrap();
-    assert!(q >= 1024 && q < 1096);
+    assert!((1024..1096).contains(&q));
     assert!(counter.load(Ordering::SeqCst) < 2048); // should not have visited every single one
 }
 
@@ -1892,7 +1892,7 @@
 
     // try an indexed iterator
     let left: E = Either::Left(v.clone().into_par_iter());
-    assert!(left.enumerate().eq(v.clone().into_par_iter().enumerate()));
+    assert!(left.enumerate().eq(v.into_par_iter().enumerate()));
 }
 
 #[test]
@@ -2063,7 +2063,7 @@
     assert_eq!(4, (0..8).into_par_iter().chunks(2).len());
     assert_eq!(3, (0..9).into_par_iter().chunks(3).len());
     assert_eq!(3, (0..8).into_par_iter().chunks(3).len());
-    assert_eq!(1, (&[1]).par_iter().chunks(3).len());
+    assert_eq!(1, [1].par_iter().chunks(3).len());
     assert_eq!(0, (0..0).into_par_iter().chunks(3).len());
 }
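
The `fold(Vec::new, ...)` cleanup above works because any `Fn() -> T` serves as the fold
identity, so the function item `Vec::new` is interchangeable with the closure `|| vec![]`.
A standalone illustration (not taken from the test file):

    use rayon::prelude::*;

    fn main() {
        let folded: Vec<Vec<i32>> = (0..32)
            .into_par_iter()
            .fold(Vec::new, |mut v, e| {
                v.push(e);
                v
            })
            .collect();
        // Every input element lands in exactly one of the per-split vectors.
        assert_eq!(folded.iter().map(Vec::len).sum::<usize>(), 32);
    }
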
 
diff --git a/src/iter/try_fold.rs b/src/iter/try_fold.rs
index c1a57a4..6d1048d 100644
--- a/src/iter/try_fold.rs
+++ b/src/iter/try_fold.rs
@@ -1,15 +1,16 @@
 use super::plumbing::*;
-use super::*;
+use super::ParallelIterator;
+use super::Try;
 
-use super::private::Try;
 use std::fmt::{self, Debug};
 use std::marker::PhantomData;
+use std::ops::ControlFlow::{self, Break, Continue};
 
 impl<U, I, ID, F> TryFold<I, U, ID, F>
 where
     I: ParallelIterator,
-    F: Fn(U::Ok, I::Item) -> U + Sync + Send,
-    ID: Fn() -> U::Ok + Sync + Send,
+    F: Fn(U::Output, I::Item) -> U + Sync + Send,
+    ID: Fn() -> U::Output + Sync + Send,
     U: Try + Send,
 {
     pub(super) fn new(base: I, identity: ID, fold_op: F) -> Self {
@@ -45,8 +46,8 @@
 impl<U, I, ID, F> ParallelIterator for TryFold<I, U, ID, F>
 where
     I: ParallelIterator,
-    F: Fn(U::Ok, I::Item) -> U + Sync + Send,
-    ID: Fn() -> U::Ok + Sync + Send,
+    F: Fn(U::Output, I::Item) -> U + Sync + Send,
+    ID: Fn() -> U::Output + Sync + Send,
     U: Try + Send,
 {
     type Item = U;
@@ -75,8 +76,8 @@
 impl<'r, U, T, C, ID, F> Consumer<T> for TryFoldConsumer<'r, U, C, ID, F>
 where
     C: Consumer<U>,
-    F: Fn(U::Ok, T) -> U + Sync,
-    ID: Fn() -> U::Ok + Sync,
+    F: Fn(U::Output, T) -> U + Sync,
+    ID: Fn() -> U::Output + Sync,
     U: Try + Send,
 {
     type Folder = TryFoldFolder<'r, C::Folder, U, F>;
@@ -98,7 +99,7 @@
     fn into_folder(self) -> Self::Folder {
         TryFoldFolder {
             base: self.base.into_folder(),
-            result: Ok((self.identity)()),
+            control: Continue((self.identity)()),
             fold_op: self.fold_op,
         }
     }
@@ -111,8 +112,8 @@
 impl<'r, U, T, C, ID, F> UnindexedConsumer<T> for TryFoldConsumer<'r, U, C, ID, F>
 where
     C: UnindexedConsumer<U>,
-    F: Fn(U::Ok, T) -> U + Sync,
-    ID: Fn() -> U::Ok + Sync,
+    F: Fn(U::Output, T) -> U + Sync,
+    ID: Fn() -> U::Output + Sync,
     U: Try + Send,
 {
     fn split_off_left(&self) -> Self {
@@ -130,35 +131,38 @@
 struct TryFoldFolder<'r, C, U: Try, F> {
     base: C,
     fold_op: &'r F,
-    result: Result<U::Ok, U::Error>,
+    control: ControlFlow<U::Residual, U::Output>,
 }
 
 impl<'r, C, U, F, T> Folder<T> for TryFoldFolder<'r, C, U, F>
 where
     C: Folder<U>,
-    F: Fn(U::Ok, T) -> U + Sync,
+    F: Fn(U::Output, T) -> U + Sync,
     U: Try,
 {
     type Result = C::Result;
 
     fn consume(mut self, item: T) -> Self {
         let fold_op = self.fold_op;
-        if let Ok(acc) = self.result {
-            self.result = fold_op(acc, item).into_result();
+        if let Continue(acc) = self.control {
+            self.control = fold_op(acc, item).branch();
         }
         self
     }
 
     fn complete(self) -> C::Result {
-        let item = match self.result {
-            Ok(ok) => U::from_ok(ok),
-            Err(error) => U::from_error(error),
+        let item = match self.control {
+            Continue(c) => U::from_output(c),
+            Break(r) => U::from_residual(r),
         };
         self.base.consume(item).complete()
     }
 
     fn full(&self) -> bool {
-        self.result.is_err() || self.base.full()
+        match self.control {
+            Break(_) => true,
+            _ => self.base.full(),
+        }
     }
 }
 
@@ -167,11 +171,11 @@
 impl<U, I, F> TryFoldWith<I, U, F>
 where
     I: ParallelIterator,
-    F: Fn(U::Ok, I::Item) -> U + Sync,
+    F: Fn(U::Output, I::Item) -> U + Sync,
     U: Try + Send,
-    U::Ok: Clone + Send,
+    U::Output: Clone + Send,
 {
-    pub(super) fn new(base: I, item: U::Ok, fold_op: F) -> Self {
+    pub(super) fn new(base: I, item: U::Output, fold_op: F) -> Self {
         TryFoldWith {
             base,
             item,
@@ -189,13 +193,13 @@
 #[derive(Clone)]
 pub struct TryFoldWith<I, U: Try, F> {
     base: I,
-    item: U::Ok,
+    item: U::Output,
     fold_op: F,
 }
 
 impl<I: ParallelIterator + Debug, U: Try, F> Debug for TryFoldWith<I, U, F>
 where
-    U::Ok: Debug,
+    U::Output: Debug,
 {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("TryFoldWith")
@@ -208,9 +212,9 @@
 impl<U, I, F> ParallelIterator for TryFoldWith<I, U, F>
 where
     I: ParallelIterator,
-    F: Fn(U::Ok, I::Item) -> U + Sync + Send,
+    F: Fn(U::Output, I::Item) -> U + Sync + Send,
     U: Try + Send,
-    U::Ok: Clone + Send,
+    U::Output: Clone + Send,
 {
     type Item = U;
 
@@ -229,16 +233,16 @@
 
 struct TryFoldWithConsumer<'c, C, U: Try, F> {
     base: C,
-    item: U::Ok,
+    item: U::Output,
     fold_op: &'c F,
 }
 
 impl<'r, U, T, C, F> Consumer<T> for TryFoldWithConsumer<'r, C, U, F>
 where
     C: Consumer<U>,
-    F: Fn(U::Ok, T) -> U + Sync,
+    F: Fn(U::Output, T) -> U + Sync,
     U: Try + Send,
-    U::Ok: Clone + Send,
+    U::Output: Clone + Send,
 {
     type Folder = TryFoldFolder<'r, C::Folder, U, F>;
     type Reducer = C::Reducer;
@@ -263,7 +267,7 @@
     fn into_folder(self) -> Self::Folder {
         TryFoldFolder {
             base: self.base.into_folder(),
-            result: Ok(self.item),
+            control: Continue(self.item),
             fold_op: self.fold_op,
         }
     }
@@ -276,9 +280,9 @@
 impl<'r, U, T, C, F> UnindexedConsumer<T> for TryFoldWithConsumer<'r, C, U, F>
 where
     C: UnindexedConsumer<U>,
-    F: Fn(U::Ok, T) -> U + Sync,
+    F: Fn(U::Output, T) -> U + Sync,
     U: Try + Send,
-    U::Ok: Clone + Send,
+    U::Output: Clone + Send,
 {
     fn split_off_left(&self) -> Self {
         TryFoldWithConsumer {
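
The `try_fold` machinery now tracks its state as `ControlFlow<U::Residual, U::Output>`
instead of `Result<U::Ok, U::Error>`; the user-visible semantics are unchanged. A sketch of
that behavior (the closures and error string are illustrative):

    use rayon::prelude::*;

    fn main() {
        // All partial folds succeed, so the reductions combine normally.
        let ok: Result<i32, &str> = (1..=10)
            .into_par_iter()
            .try_fold(|| 0, |acc, x| Ok(acc + x))
            .try_reduce(|| 0, |a, b| Ok(a + b));
        assert_eq!(ok, Ok(55));

        // An Err switches the folder to Break, which also makes full() report
        // true so remaining work is cut short.
        let err: Result<i32, &str> = (1..=10)
            .into_par_iter()
            .try_fold(|| 0, |acc, x| if x == 7 { Err("seven") } else { Ok(acc + x) })
            .try_reduce(|| 0, |a, b| Ok(a + b));
        assert_eq!(err, Err("seven"));
    }
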
diff --git a/src/iter/try_reduce.rs b/src/iter/try_reduce.rs
index 76b3850..35a724c 100644
--- a/src/iter/try_reduce.rs
+++ b/src/iter/try_reduce.rs
@@ -1,14 +1,15 @@
 use super::plumbing::*;
 use super::ParallelIterator;
+use super::Try;
 
-use super::private::Try;
+use std::ops::ControlFlow::{self, Break, Continue};
 use std::sync::atomic::{AtomicBool, Ordering};
 
 pub(super) fn try_reduce<PI, R, ID, T>(pi: PI, identity: ID, reduce_op: R) -> T
 where
     PI: ParallelIterator<Item = T>,
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
-    ID: Fn() -> T::Ok + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
+    ID: Fn() -> T::Output + Sync,
     T: Try + Send,
 {
     let full = AtomicBool::new(false);
@@ -36,8 +37,8 @@
 
 impl<'r, R, ID, T> Consumer<T> for TryReduceConsumer<'r, R, ID>
 where
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
-    ID: Fn() -> T::Ok + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
+    ID: Fn() -> T::Output + Sync,
     T: Try + Send,
 {
     type Folder = TryReduceFolder<'r, R, T>;
@@ -51,7 +52,7 @@
     fn into_folder(self) -> Self::Folder {
         TryReduceFolder {
             reduce_op: self.reduce_op,
-            result: Ok((self.identity)()),
+            control: Continue((self.identity)()),
             full: self.full,
         }
     }
@@ -63,8 +64,8 @@
 
 impl<'r, R, ID, T> UnindexedConsumer<T> for TryReduceConsumer<'r, R, ID>
 where
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
-    ID: Fn() -> T::Ok + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
+    ID: Fn() -> T::Output + Sync,
     T: Try + Send,
 {
     fn split_off_left(&self) -> Self {
@@ -78,52 +79,53 @@
 
 impl<'r, R, ID, T> Reducer<T> for TryReduceConsumer<'r, R, ID>
 where
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
     T: Try,
 {
     fn reduce(self, left: T, right: T) -> T {
-        match (left.into_result(), right.into_result()) {
-            (Ok(left), Ok(right)) => (self.reduce_op)(left, right),
-            (Err(e), _) | (_, Err(e)) => T::from_error(e),
+        match (left.branch(), right.branch()) {
+            (Continue(left), Continue(right)) => (self.reduce_op)(left, right),
+            (Break(r), _) | (_, Break(r)) => T::from_residual(r),
         }
     }
 }
 
 struct TryReduceFolder<'r, R, T: Try> {
     reduce_op: &'r R,
-    result: Result<T::Ok, T::Error>,
+    control: ControlFlow<T::Residual, T::Output>,
     full: &'r AtomicBool,
 }
 
 impl<'r, R, T> Folder<T> for TryReduceFolder<'r, R, T>
 where
-    R: Fn(T::Ok, T::Ok) -> T,
+    R: Fn(T::Output, T::Output) -> T,
     T: Try,
 {
     type Result = T;
 
     fn consume(mut self, item: T) -> Self {
         let reduce_op = self.reduce_op;
-        if let Ok(left) = self.result {
-            self.result = match item.into_result() {
-                Ok(right) => reduce_op(left, right).into_result(),
-                Err(error) => Err(error),
-            };
-        }
-        if self.result.is_err() {
-            self.full.store(true, Ordering::Relaxed)
+        self.control = match (self.control, item.branch()) {
+            (Continue(left), Continue(right)) => reduce_op(left, right).branch(),
+            (control @ Break(_), _) | (_, control @ Break(_)) => control,
+        };
+        if let Break(_) = self.control {
+            self.full.store(true, Ordering::Relaxed);
         }
         self
     }
 
     fn complete(self) -> T {
-        match self.result {
-            Ok(ok) => T::from_ok(ok),
-            Err(error) => T::from_error(error),
+        match self.control {
+            Continue(c) => T::from_output(c),
+            Break(r) => T::from_residual(r),
         }
     }
 
     fn full(&self) -> bool {
-        self.full.load(Ordering::Relaxed)
+        match self.control {
+            Break(_) => true,
+            _ => self.full.load(Ordering::Relaxed),
+        }
     }
 }
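
`try_reduce` receives the same `ControlFlow` treatment; any `Break` value (a `None` or an
`Err`) sets the shared `full` flag and wins the reduction. A sketch using `Option` with
`checked_add` (the values are chosen only for illustration):

    use rayon::prelude::*;

    fn main() {
        let total: Option<u8> = [10u8, 20, 30]
            .into_par_iter()
            .map(Some)
            .try_reduce(|| 0, |a, b| a.checked_add(b));
        assert_eq!(total, Some(60));

        // Overflow yields None somewhere, which short-circuits the reduction.
        let overflow: Option<u8> = [200u8, 100, 30]
            .into_par_iter()
            .map(Some)
            .try_reduce(|| 0, |a, b| a.checked_add(b));
        assert_eq!(overflow, None);
    }
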
diff --git a/src/iter/try_reduce_with.rs b/src/iter/try_reduce_with.rs
index 6be3100..cd7c83e 100644
--- a/src/iter/try_reduce_with.rs
+++ b/src/iter/try_reduce_with.rs
@@ -1,13 +1,14 @@
 use super::plumbing::*;
 use super::ParallelIterator;
+use super::Try;
 
-use super::private::Try;
+use std::ops::ControlFlow::{self, Break, Continue};
 use std::sync::atomic::{AtomicBool, Ordering};
 
 pub(super) fn try_reduce_with<PI, R, T>(pi: PI, reduce_op: R) -> Option<T>
 where
     PI: ParallelIterator<Item = T>,
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
     T: Try + Send,
 {
     let full = AtomicBool::new(false);
@@ -33,7 +34,7 @@
 
 impl<'r, R, T> Consumer<T> for TryReduceWithConsumer<'r, R>
 where
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
     T: Try + Send,
 {
     type Folder = TryReduceWithFolder<'r, R, T>;
@@ -47,7 +48,7 @@
     fn into_folder(self) -> Self::Folder {
         TryReduceWithFolder {
             reduce_op: self.reduce_op,
-            opt_result: None,
+            opt_control: None,
             full: self.full,
         }
     }
@@ -59,7 +60,7 @@
 
 impl<'r, R, T> UnindexedConsumer<T> for TryReduceWithConsumer<'r, R>
 where
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
     T: Try + Send,
 {
     fn split_off_left(&self) -> Self {
@@ -73,62 +74,59 @@
 
 impl<'r, R, T> Reducer<Option<T>> for TryReduceWithConsumer<'r, R>
 where
-    R: Fn(T::Ok, T::Ok) -> T + Sync,
+    R: Fn(T::Output, T::Output) -> T + Sync,
     T: Try,
 {
     fn reduce(self, left: Option<T>, right: Option<T>) -> Option<T> {
         let reduce_op = self.reduce_op;
         match (left, right) {
-            (None, x) | (x, None) => x,
-            (Some(a), Some(b)) => match (a.into_result(), b.into_result()) {
-                (Ok(a), Ok(b)) => Some(reduce_op(a, b)),
-                (Err(e), _) | (_, Err(e)) => Some(T::from_error(e)),
+            (Some(left), Some(right)) => match (left.branch(), right.branch()) {
+                (Continue(left), Continue(right)) => Some(reduce_op(left, right)),
+                (Break(r), _) | (_, Break(r)) => Some(T::from_residual(r)),
             },
+            (None, x) | (x, None) => x,
         }
     }
 }
 
 struct TryReduceWithFolder<'r, R, T: Try> {
     reduce_op: &'r R,
-    opt_result: Option<Result<T::Ok, T::Error>>,
+    opt_control: Option<ControlFlow<T::Residual, T::Output>>,
     full: &'r AtomicBool,
 }
 
 impl<'r, R, T> Folder<T> for TryReduceWithFolder<'r, R, T>
 where
-    R: Fn(T::Ok, T::Ok) -> T,
+    R: Fn(T::Output, T::Output) -> T,
     T: Try,
 {
     type Result = Option<T>;
 
-    fn consume(self, item: T) -> Self {
+    fn consume(mut self, item: T) -> Self {
         let reduce_op = self.reduce_op;
-        let result = match self.opt_result {
-            None => item.into_result(),
-            Some(Ok(a)) => match item.into_result() {
-                Ok(b) => reduce_op(a, b).into_result(),
-                Err(e) => Err(e),
-            },
-            Some(Err(e)) => Err(e),
+        let control = match (self.opt_control, item.branch()) {
+            (Some(Continue(left)), Continue(right)) => reduce_op(left, right).branch(),
+            (Some(control @ Break(_)), _) | (_, control) => control,
         };
-        if result.is_err() {
+        if let Break(_) = control {
             self.full.store(true, Ordering::Relaxed)
         }
-        TryReduceWithFolder {
-            opt_result: Some(result),
-            ..self
-        }
+        self.opt_control = Some(control);
+        self
     }
 
     fn complete(self) -> Option<T> {
-        let result = self.opt_result?;
-        Some(match result {
-            Ok(ok) => T::from_ok(ok),
-            Err(error) => T::from_error(error),
-        })
+        match self.opt_control {
+            Some(Continue(c)) => Some(T::from_output(c)),
+            Some(Break(r)) => Some(T::from_residual(r)),
+            None => None,
+        }
     }
 
     fn full(&self) -> bool {
-        self.full.load(Ordering::Relaxed)
+        match self.opt_control {
+            Some(Break(_)) => true,
+            _ => self.full.load(Ordering::Relaxed),
+        }
     }
 }
diff --git a/src/iter/update.rs b/src/iter/update.rs
index 373a4d7..c693ac8 100644
--- a/src/iter/update.rs
+++ b/src/iter/update.rs
@@ -210,7 +210,7 @@
     F: Fn(&mut T) + Send + Sync,
 {
     fn split_off_left(&self) -> Self {
-        UpdateConsumer::new(self.base.split_off_left(), &self.update_op)
+        UpdateConsumer::new(self.base.split_off_left(), self.update_op)
     }
 
     fn to_reducer(&self) -> Self::Reducer {
diff --git a/src/lib.rs b/src/lib.rs
index b1e6fab..86f997b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,3 @@
-#![doc(html_root_url = "https://docs.rs/rayon/1.5")]
 #![deny(missing_debug_implementations)]
 #![deny(missing_docs)]
 #![deny(unreachable_pub)]
@@ -77,6 +76,11 @@
 //! [the `collections` from `std`]: https://doc.rust-lang.org/std/collections/index.html
 //! [`std`]: https://doc.rust-lang.org/std/
 //!
+//! # Targets without threading
+//!
+//! Rayon has limited support for targets without `std` threading implementations.
+//! See the [`rayon_core`] documentation for more information about its global fallback.
+//!
 //! # Other questions?
 //!
 //! See [the Rayon FAQ][faq].
@@ -114,8 +118,43 @@
 pub use rayon_core::ThreadPool;
 pub use rayon_core::ThreadPoolBuildError;
 pub use rayon_core::ThreadPoolBuilder;
-pub use rayon_core::{current_num_threads, current_thread_index};
+pub use rayon_core::{broadcast, spawn_broadcast, BroadcastContext};
+pub use rayon_core::{current_num_threads, current_thread_index, max_num_threads};
 pub use rayon_core::{in_place_scope, scope, Scope};
 pub use rayon_core::{in_place_scope_fifo, scope_fifo, ScopeFifo};
 pub use rayon_core::{join, join_context};
 pub use rayon_core::{spawn, spawn_fifo};
+pub use rayon_core::{yield_local, yield_now, Yield};
+
+/// We need to transmit raw pointers across threads. It is possible to do this
+/// without any unsafe code by converting pointers to usize or to AtomicPtr<T>
+/// then back to a raw pointer for use. We prefer this wrapper type because code
+/// that uses it is more explicit.
+///
+/// Unsafe code is still required to dereference the pointer, so this type is
+/// not unsound on its own, although it does partly lift the unconditional
+/// !Send and !Sync on raw pointers. As always, dereference with care.
+struct SendPtr<T>(*mut T);
+
+// SAFETY: !Send for raw pointers is not for safety, just as a lint
+unsafe impl<T: Send> Send for SendPtr<T> {}
+
+// SAFETY: !Sync for raw pointers is not for safety, just as a lint
+unsafe impl<T: Send> Sync for SendPtr<T> {}
+
+impl<T> SendPtr<T> {
+    // Helper to avoid disjoint captures of `send_ptr.0`
+    fn get(self) -> *mut T {
+        self.0
+    }
+}
+
+// Implement Clone without the T: Clone bound from the derive
+impl<T> Clone for SendPtr<T> {
+    fn clone(&self) -> Self {
+        Self(self.0)
+    }
+}
+
+// Implement Copy without the T: Copy bound from the derive
+impl<T> Copy for SendPtr<T> {}
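
`SendPtr` is crate-private, so the sketch below re-creates an equivalent local wrapper to
show the intended usage pattern with `join`; only the shape of the pattern comes from this
patch, while the buffer and indices are made up:

    // Local stand-in for rayon's private SendPtr, for illustration only.
    #[derive(Copy, Clone)]
    struct SendPtr<T>(*mut T);
    unsafe impl<T: Send> Send for SendPtr<T> {}
    unsafe impl<T: Send> Sync for SendPtr<T> {}
    impl<T> SendPtr<T> {
        fn get(self) -> *mut T {
            self.0
        }
    }

    fn main() {
        let mut buf = vec![0u32; 8];
        let (mid, len) = (buf.len() / 2, buf.len());
        let ptr = SendPtr(buf.as_mut_ptr());
        rayon::join(
            // SAFETY: the closures write disjoint halves of `buf`, and `buf`
            // outlives the join, so the raw-pointer writes cannot alias.
            move || unsafe {
                for i in 0..mid {
                    *ptr.get().add(i) = 1;
                }
            },
            move || unsafe {
                for i in mid..len {
                    *ptr.get().add(i) = 2;
                }
            },
        );
        assert_eq!(buf, [1, 1, 1, 1, 2, 2, 2, 2]);
    }
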
diff --git a/src/math.rs b/src/math.rs
index 9de5889..3a630be 100644
--- a/src/math.rs
+++ b/src/math.rs
@@ -1,7 +1,7 @@
 use std::ops::{Bound, Range, RangeBounds};
 
 /// Divide `n` by `divisor`, and round up to the nearest integer
-/// if not evenly divisable.
+/// if not evenly divisible.
 #[inline]
 pub(super) fn div_round_up(n: usize, divisor: usize) -> usize {
     debug_assert!(divisor != 0, "Division by zero!");
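
For reference, the corrected comment describes ceiling division; the closure below is the
standard formulation of that operation and not necessarily the exact body of rayon's
`div_round_up`:

    fn main() {
        // Round the quotient up whenever the division leaves a remainder.
        let div_round_up = |n: usize, d: usize| if n == 0 { 0 } else { (n - 1) / d + 1 };
        assert_eq!(div_round_up(10, 3), 4);
        assert_eq!(div_round_up(9, 3), 3);
        assert_eq!(div_round_up(0, 3), 0);
    }
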
diff --git a/src/slice/chunks.rs b/src/slice/chunks.rs
new file mode 100644
index 0000000..a3cc709
--- /dev/null
+++ b/src/slice/chunks.rs
@@ -0,0 +1,389 @@
+use crate::iter::plumbing::*;
+use crate::iter::*;
+use crate::math::div_round_up;
+use std::cmp;
+
+/// Parallel iterator over immutable non-overlapping chunks of a slice
+#[derive(Debug)]
+pub struct Chunks<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+}
+
+impl<'data, T: Sync> Chunks<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data [T]) -> Self {
+        Self { chunk_size, slice }
+    }
+}
+
+impl<'data, T: Sync> Clone for Chunks<'data, T> {
+    fn clone(&self) -> Self {
+        Chunks { ..*self }
+    }
+}
+
+impl<'data, T: Sync + 'data> ParallelIterator for Chunks<'data, T> {
+    type Item = &'data [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Sync + 'data> IndexedParallelIterator for Chunks<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        div_round_up(self.slice.len(), self.chunk_size)
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(ChunksProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct ChunksProducer<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+}
+
+impl<'data, T: 'data + Sync> Producer for ChunksProducer<'data, T> {
+    type Item = &'data [T];
+    type IntoIter = ::std::slice::Chunks<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.chunks(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = cmp::min(index * self.chunk_size, self.slice.len());
+        let (left, right) = self.slice.split_at(elem_index);
+        (
+            ChunksProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+            ChunksProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+        )
+    }
+}
+
+/// Parallel iterator over immutable non-overlapping chunks of a slice, where every chunk has exactly `chunk_size` elements
+#[derive(Debug)]
+pub struct ChunksExact<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+    rem: &'data [T],
+}
+
+impl<'data, T: Sync> ChunksExact<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data [T]) -> Self {
+        let rem_len = slice.len() % chunk_size;
+        let len = slice.len() - rem_len;
+        let (slice, rem) = slice.split_at(len);
+        Self {
+            chunk_size,
+            slice,
+            rem,
+        }
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements.
+    pub fn remainder(&self) -> &'data [T] {
+        self.rem
+    }
+}
+
+impl<'data, T: Sync> Clone for ChunksExact<'data, T> {
+    fn clone(&self) -> Self {
+        ChunksExact { ..*self }
+    }
+}
+
+impl<'data, T: Sync + 'data> ParallelIterator for ChunksExact<'data, T> {
+    type Item = &'data [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Sync + 'data> IndexedParallelIterator for ChunksExact<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        self.slice.len() / self.chunk_size
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(ChunksExactProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct ChunksExactProducer<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+}
+
+impl<'data, T: 'data + Sync> Producer for ChunksExactProducer<'data, T> {
+    type Item = &'data [T];
+    type IntoIter = ::std::slice::ChunksExact<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.chunks_exact(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = index * self.chunk_size;
+        let (left, right) = self.slice.split_at(elem_index);
+        (
+            ChunksExactProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+            ChunksExactProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+        )
+    }
+}
+
+/// Parallel iterator over mutable non-overlapping chunks of a slice
+#[derive(Debug)]
+pub struct ChunksMut<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+}
+
+impl<'data, T: Send> ChunksMut<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data mut [T]) -> Self {
+        Self { chunk_size, slice }
+    }
+}
+
+impl<'data, T: Send + 'data> ParallelIterator for ChunksMut<'data, T> {
+    type Item = &'data mut [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Send + 'data> IndexedParallelIterator for ChunksMut<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        div_round_up(self.slice.len(), self.chunk_size)
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(ChunksMutProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct ChunksMutProducer<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+}
+
+impl<'data, T: 'data + Send> Producer for ChunksMutProducer<'data, T> {
+    type Item = &'data mut [T];
+    type IntoIter = ::std::slice::ChunksMut<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.chunks_mut(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = cmp::min(index * self.chunk_size, self.slice.len());
+        let (left, right) = self.slice.split_at_mut(elem_index);
+        (
+            ChunksMutProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+            ChunksMutProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+        )
+    }
+}
+
+/// Parallel iterator over mutable non-overlapping chunks of a slice, where every chunk has exactly `chunk_size` elements
+#[derive(Debug)]
+pub struct ChunksExactMut<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+    rem: &'data mut [T],
+}
+
+impl<'data, T: Send> ChunksExactMut<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data mut [T]) -> Self {
+        let rem_len = slice.len() % chunk_size;
+        let len = slice.len() - rem_len;
+        let (slice, rem) = slice.split_at_mut(len);
+        Self {
+            chunk_size,
+            slice,
+            rem,
+        }
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements.
+    ///
+    /// Note that this has to consume `self` to return the original lifetime of
+    /// the data, which prevents this from actually being used as a parallel
+    /// iterator since that also consumes. This method is provided for parity
+    /// with `std::slice::ChunksExactMut`, but consider calling `remainder()` or
+    /// `take_remainder()` as alternatives.
+    pub fn into_remainder(self) -> &'data mut [T] {
+        self.rem
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements.
+    ///
+    /// Consider `take_remainder()` if you need access to the data with its
+    /// original lifetime, rather than borrowing through `&mut self` here.
+    pub fn remainder(&mut self) -> &mut [T] {
+        self.rem
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements. Subsequent calls will return an empty slice.
+    pub fn take_remainder(&mut self) -> &'data mut [T] {
+        std::mem::take(&mut self.rem)
+    }
+}
+
+impl<'data, T: Send + 'data> ParallelIterator for ChunksExactMut<'data, T> {
+    type Item = &'data mut [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Send + 'data> IndexedParallelIterator for ChunksExactMut<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        self.slice.len() / self.chunk_size
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(ChunksExactMutProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct ChunksExactMutProducer<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+}
+
+impl<'data, T: 'data + Send> Producer for ChunksExactMutProducer<'data, T> {
+    type Item = &'data mut [T];
+    type IntoIter = ::std::slice::ChunksExactMut<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.chunks_exact_mut(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = index * self.chunk_size;
+        let (left, right) = self.slice.split_at_mut(elem_index);
+        (
+            ChunksExactMutProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+            ChunksExactMutProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+        )
+    }
+}
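
These producer types back `par_chunks`, `par_chunks_exact`, and their `_mut` variants on
slices. A short usage sketch, mirroring the doc examples further down in this patch:

    use rayon::prelude::*;

    fn main() {
        let data = [1, 2, 3, 4, 5];

        // par_chunks yields a short trailing chunk...
        let chunks: Vec<&[i32]> = data.par_chunks(2).collect();
        assert_eq!(chunks, vec![&[1, 2][..], &[3, 4], &[5]]);

        // ...while par_chunks_exact drops it and exposes it via remainder().
        let exact = data.par_chunks_exact(2);
        assert_eq!(exact.remainder(), &[5][..]);
        assert_eq!(exact.collect::<Vec<_>>(), vec![&[1, 2][..], &[3, 4]]);
    }
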
diff --git a/src/slice/mergesort.rs b/src/slice/mergesort.rs
index e9a5d43..fec309d 100644
--- a/src/slice/mergesort.rs
+++ b/src/slice/mergesort.rs
@@ -6,6 +6,7 @@
 
 use crate::iter::*;
 use crate::slice::ParallelSliceMut;
+use crate::SendPtr;
 use std::mem;
 use std::mem::size_of;
 use std::ptr;
@@ -24,7 +25,7 @@
 
 /// When dropped, copies from `src` into `dest` a sequence of length `len`.
 struct CopyOnDrop<T> {
-    src: *mut T,
+    src: *const T,
     dest: *mut T,
     len: usize,
 }
@@ -63,9 +64,7 @@
             //    performance than with the 2nd method.
             //
             // All methods were benchmarked, and the 3rd showed best results. So we chose that one.
-            let mut tmp = NoDrop {
-                value: Some(ptr::read(&v[0])),
-            };
+            let tmp = mem::ManuallyDrop::new(ptr::read(&v[0]));
 
             // Intermediate state of the insertion process is always tracked by `hole`, which
             // serves two purposes:
@@ -78,13 +77,13 @@
             // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
             // initially held exactly once.
             let mut hole = InsertionHole {
-                src: tmp.value.as_mut().unwrap(),
+                src: &*tmp,
                 dest: &mut v[1],
             };
             ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
 
             for i in 2..v.len() {
-                if !is_less(&v[i], tmp.value.as_ref().unwrap()) {
+                if !is_less(&v[i], &*tmp) {
                     break;
                 }
                 ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
@@ -94,20 +93,9 @@
         }
     }
 
-    // Holds a value, but never drops it.
-    struct NoDrop<T> {
-        value: Option<T>,
-    }
-
-    impl<T> Drop for NoDrop<T> {
-        fn drop(&mut self) {
-            mem::forget(self.value.take());
-        }
-    }
-
     // When dropped, copies from `src` into `dest`.
     struct InsertionHole<T> {
-        src: *mut T,
+        src: *const T,
         dest: *mut T,
     }
 
@@ -217,8 +205,8 @@
     impl<T> Drop for MergeHole<T> {
         fn drop(&mut self) {
             // `T` is not a zero-sized type, so it's okay to divide by its size.
-            let len = (self.end as usize - self.start as usize) / size_of::<T>();
             unsafe {
+                let len = self.end.offset_from(self.start) as usize;
                 ptr::copy_nonoverlapping(self.start, self.dest, len);
             }
         }
@@ -284,7 +272,7 @@
 /// Otherwise, it sorts the slice into non-descending order.
 ///
 /// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail
-/// [here](https://svn.python.org/projects/python/trunk/Objects/listsort.txt).
+/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt).
 ///
 /// The algorithm identifies strictly descending and non-descending subsequences, which are called
 /// natural runs. There is a stack of pending runs yet to be merged. Each newly found run is pushed
@@ -294,7 +282,7 @@
 /// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len`
 /// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
 ///
-/// The invariants ensure that the total running time is `O(n log n)` worst-case.
+/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case.
 ///
 /// # Safety
 ///
@@ -497,12 +485,13 @@
         // get copied into `dest_left` and `dest_right``.
         mem::forget(s);
 
-        // Convert the pointers to `usize` because `*mut T` is not `Send`.
-        let dest_l = dest as usize;
-        let dest_r = dest.add(left_l.len() + right_l.len()) as usize;
+        // Wrap pointers in SendPtr so that they can be sent to another thread
+        // See the documentation of SendPtr for a full explanation
+        let dest_l = SendPtr(dest);
+        let dest_r = SendPtr(dest.add(left_l.len() + right_l.len()));
         rayon_core::join(
-            || par_merge(left_l, right_l, dest_l as *mut T, is_less),
-            || par_merge(left_r, right_r, dest_r as *mut T, is_less),
+            move || par_merge(left_l, right_l, dest_l.get(), is_less),
+            move || par_merge(left_r, right_r, dest_r.get(), is_less),
         );
     }
     // Finally, `s` gets dropped if we used sequential merge, thus copying the remaining elements
@@ -580,7 +569,7 @@
 
     // After recursive calls finish we'll have to merge chunks `(start, mid)` and `(mid, end)` from
     // `src` into `dest`. If the current invocation has to store the result into `buf`, we'll
-    // merge chunks from `v` into `buf`, and viceversa.
+    // merge chunks from `v` into `buf`, and vice versa.
     //
     // Recursive calls flip `into_buf` at each level of recursion. More concretely, `par_merge`
     // merges chunks from `buf` into `v` at the first level, from `v` into `buf` at the second
@@ -598,12 +587,13 @@
         len: end - start,
     };
 
-    // Convert the pointers to `usize` because `*mut T` is not `Send`.
-    let v = v as usize;
-    let buf = buf as usize;
+    // Wrap pointers in SendPtr so that they can be sent to another thread
+    // See the documentation of SendPtr for a full explanation
+    let v = SendPtr(v);
+    let buf = SendPtr(buf);
     rayon_core::join(
-        || recurse(v as *mut T, buf as *mut T, left, !into_buf, is_less),
-        || recurse(v as *mut T, buf as *mut T, right, !into_buf, is_less),
+        move || recurse(v.get(), buf.get(), left, !into_buf, is_less),
+        move || recurse(v.get(), buf.get(), right, !into_buf, is_less),
     );
 
     // Everything went all right - recursive calls didn't panic.
@@ -667,18 +657,20 @@
     // Split the slice into chunks and merge sort them in parallel.
     // However, descending chunks will not be sorted - they will be simply left intact.
     let mut iter = {
-        // Convert the pointer to `usize` because `*mut T` is not `Send`.
-        let buf = buf as usize;
+        // Wrap pointer in SendPtr so that it can be sent to another thread
+        // See the documentation of SendPtr for a full explanation
+        let buf = SendPtr(buf);
+        let is_less = &is_less;
 
         v.par_chunks_mut(CHUNK_LENGTH)
             .with_max_len(1)
             .enumerate()
-            .map(|(i, chunk)| {
+            .map(move |(i, chunk)| {
                 let l = CHUNK_LENGTH * i;
                 let r = l + chunk.len();
                 unsafe {
-                    let buf = (buf as *mut T).add(l);
-                    (l, r, mergesort(chunk, buf, &is_less))
+                    let buf = buf.get().add(l);
+                    (l, r, mergesort(chunk, buf, is_less))
                 }
             })
             .collect::<Vec<_>>()
@@ -739,7 +731,7 @@
         check(&[1, 2, 2, 2, 2, 3], &[]);
         check(&[], &[1, 2, 2, 2, 2, 3]);
 
-        let ref mut rng = thread_rng();
+        let rng = &mut thread_rng();
 
         for _ in 0..100 {
             let limit: u32 = rng.gen_range(1..21);
diff --git a/src/slice/mod.rs b/src/slice/mod.rs
index d47b0d3..dab56de 100644
--- a/src/slice/mod.rs
+++ b/src/slice/mod.rs
@@ -5,8 +5,10 @@
 //!
 //! [std::slice]: https://doc.rust-lang.org/stable/std/slice/
 
+mod chunks;
 mod mergesort;
 mod quicksort;
+mod rchunks;
 
 mod test;
 
@@ -18,8 +20,10 @@
 use std::cmp;
 use std::cmp::Ordering;
 use std::fmt::{self, Debug};
+use std::mem;
 
-use super::math::div_round_up;
+pub use self::chunks::{Chunks, ChunksExact, ChunksExactMut, ChunksMut};
+pub use self::rchunks::{RChunks, RChunksExact, RChunksExactMut, RChunksMut};
 
 /// Parallel extensions for slices.
 pub trait ParallelSlice<T: Sync> {
@@ -81,12 +85,10 @@
     /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_chunks(2).collect();
     /// assert_eq!(chunks, vec![&[1, 2][..], &[3, 4], &[5]]);
     /// ```
+    #[track_caller]
     fn par_chunks(&self, chunk_size: usize) -> Chunks<'_, T> {
         assert!(chunk_size != 0, "chunk_size must not be zero");
-        Chunks {
-            chunk_size,
-            slice: self.as_parallel_slice(),
-        }
+        Chunks::new(chunk_size, self.as_parallel_slice())
     }
 
     /// Returns a parallel iterator over `chunk_size` elements of
@@ -103,17 +105,50 @@
     /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_chunks_exact(2).collect();
     /// assert_eq!(chunks, vec![&[1, 2][..], &[3, 4]]);
     /// ```
+    #[track_caller]
     fn par_chunks_exact(&self, chunk_size: usize) -> ChunksExact<'_, T> {
         assert!(chunk_size != 0, "chunk_size must not be zero");
-        let slice = self.as_parallel_slice();
-        let rem = slice.len() % chunk_size;
-        let len = slice.len() - rem;
-        let (fst, snd) = slice.split_at(len);
-        ChunksExact {
-            chunk_size,
-            slice: fst,
-            rem: snd,
-        }
+        ChunksExact::new(chunk_size, self.as_parallel_slice())
+    }
+
+    /// Returns a parallel iterator over at most `chunk_size` elements of `self` at a time,
+    /// starting at the end. The chunks do not overlap.
+    ///
+    /// If the number of elements in the iterator is not divisible by
+    /// `chunk_size`, the last chunk may be shorter than `chunk_size`.  All
+    /// other chunks will have that exact length.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_rchunks(2).collect();
+    /// assert_eq!(chunks, vec![&[4, 5][..], &[2, 3], &[1]]);
+    /// ```
+    #[track_caller]
+    fn par_rchunks(&self, chunk_size: usize) -> RChunks<'_, T> {
+        assert!(chunk_size != 0, "chunk_size must not be zero");
+        RChunks::new(chunk_size, self.as_parallel_slice())
+    }
+
+    /// Returns a parallel iterator over `chunk_size` elements of `self` at a time,
+    /// starting at the end. The chunks do not overlap.
+    ///
+    /// If `chunk_size` does not divide the length of the slice, then the
+    /// last up to `chunk_size-1` elements will be omitted and can be
+    /// retrieved from the remainder function of the iterator.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    /// let chunks: Vec<_> = [1, 2, 3, 4, 5].par_rchunks_exact(2).collect();
+    /// assert_eq!(chunks, vec![&[4, 5][..], &[2, 3]]);
+    /// ```
+    #[track_caller]
+    fn par_rchunks_exact(&self, chunk_size: usize) -> RChunksExact<'_, T> {
+        assert!(chunk_size != 0, "chunk_size must not be zero");
+        RChunksExact::new(chunk_size, self.as_parallel_slice())
     }
 }
 
@@ -168,12 +203,10 @@
     ///      .for_each(|slice| slice.reverse());
     /// assert_eq!(array, [2, 1, 4, 3, 5]);
     /// ```
+    #[track_caller]
     fn par_chunks_mut(&mut self, chunk_size: usize) -> ChunksMut<'_, T> {
         assert!(chunk_size != 0, "chunk_size must not be zero");
-        ChunksMut {
-            chunk_size,
-            slice: self.as_parallel_slice_mut(),
-        }
+        ChunksMut::new(chunk_size, self.as_parallel_slice_mut())
     }
 
     /// Returns a parallel iterator over `chunk_size` elements of
@@ -192,22 +225,59 @@
     ///      .for_each(|slice| slice.reverse());
     /// assert_eq!(array, [3, 2, 1, 4, 5]);
     /// ```
+    #[track_caller]
     fn par_chunks_exact_mut(&mut self, chunk_size: usize) -> ChunksExactMut<'_, T> {
         assert!(chunk_size != 0, "chunk_size must not be zero");
-        let slice = self.as_parallel_slice_mut();
-        let rem = slice.len() % chunk_size;
-        let len = slice.len() - rem;
-        let (fst, snd) = slice.split_at_mut(len);
-        ChunksExactMut {
-            chunk_size,
-            slice: fst,
-            rem: snd,
-        }
+        ChunksExactMut::new(chunk_size, self.as_parallel_slice_mut())
+    }
+
+    /// Returns a parallel iterator over at most `chunk_size` elements of `self` at a time,
+    /// starting at the end. The chunks are mutable and do not overlap.
+    ///
+    /// If the number of elements in the iterator is not divisible by
+    /// `chunk_size`, the last chunk may be shorter than `chunk_size`.  All
+    /// other chunks will have that exact length.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    /// let mut array = [1, 2, 3, 4, 5];
+    /// array.par_rchunks_mut(2)
+    ///      .for_each(|slice| slice.reverse());
+    /// assert_eq!(array, [1, 3, 2, 5, 4]);
+    /// ```
+    #[track_caller]
+    fn par_rchunks_mut(&mut self, chunk_size: usize) -> RChunksMut<'_, T> {
+        assert!(chunk_size != 0, "chunk_size must not be zero");
+        RChunksMut::new(chunk_size, self.as_parallel_slice_mut())
+    }
+
+    /// Returns a parallel iterator over `chunk_size` elements of `self` at a time,
+    /// starting at the end. The chunks are mutable and do not overlap.
+    ///
+    /// If `chunk_size` does not divide the length of the slice, then the
+    /// last up to `chunk_size-1` elements will be omitted and can be
+    /// retrieved from the remainder function of the iterator.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    /// let mut array = [1, 2, 3, 4, 5];
+    /// array.par_rchunks_exact_mut(3)
+    ///      .for_each(|slice| slice.reverse());
+    /// assert_eq!(array, [1, 2, 5, 4, 3]);
+    /// ```
+    #[track_caller]
+    fn par_rchunks_exact_mut(&mut self, chunk_size: usize) -> RChunksExactMut<'_, T> {
+        assert!(chunk_size != 0, "chunk_size must not be zero");
+        RChunksExactMut::new(chunk_size, self.as_parallel_slice_mut())
     }
 
     /// Sorts the slice in parallel.
     ///
-    /// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
     ///
     /// When applicable, unstable sorting is preferred because it is generally faster than stable
     /// sorting and it doesn't allocate auxiliary memory.
@@ -247,7 +317,25 @@
 
     /// Sorts the slice in parallel with a comparator function.
     ///
-    /// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case.
+    ///
+    /// The comparator function must define a total ordering for the elements in the slice. If
+    /// the ordering is not total, the order of the elements is unspecified. An order is a
+    /// total order if it is (for all `a`, `b` and `c`):
+    ///
+    /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
+    /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
+    ///
+    /// For example, while [`f64`] doesn't implement [`Ord`] because `NaN != NaN`, we can use
+    /// `partial_cmp` as our sort function when we know the slice doesn't contain a `NaN`.
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    ///
+    /// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
+    /// floats.par_sort_by(|a, b| a.partial_cmp(b).unwrap());
+    /// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
+    /// ```
     ///
     /// When applicable, unstable sorting is preferred because it is generally faster than stable
     /// sorting and it doesn't allocate auxiliary memory.
@@ -292,7 +380,12 @@
 
     /// Sorts the slice in parallel with a key extraction function.
     ///
-    /// This sort is stable (i.e. does not reorder equal elements) and `O(n log n)` worst-case.
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*))
+    /// worst-case, where the key function is *O*(*m*).
+    ///
+    /// For expensive key functions (e.g., functions that are not simple property accesses or
+    /// basic operations), [`par_sort_by_cached_key`](#method.par_sort_by_cached_key) is likely to
+    /// be significantly faster, as it does not recompute element keys.
     ///
     /// When applicable, unstable sorting is preferred because it is generally faster than stable
     /// sorting and it doesn't allocate auxiliary memory.
@@ -323,27 +416,119 @@
     /// v.par_sort_by_key(|k| k.abs());
     /// assert_eq!(v, [1, 2, -3, 4, -5]);
     /// ```
-    fn par_sort_by_key<B, F>(&mut self, f: F)
+    fn par_sort_by_key<K, F>(&mut self, f: F)
     where
-        B: Ord,
-        F: Fn(&T) -> B + Sync,
+        K: Ord,
+        F: Fn(&T) -> K + Sync,
     {
         par_mergesort(self.as_parallel_slice_mut(), |a, b| f(a).lt(&f(b)));
     }
 
-    /// Sorts the slice in parallel, but may not preserve the order of equal elements.
+    /// Sorts the slice in parallel with a key extraction function.
     ///
-    /// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
-    /// and `O(n log n)` worst-case.
+    /// During sorting, the key function is called at most once per element, by using
+    /// temporary storage to remember the results of key evaluation.
+    /// The key function is called in parallel, so the order of calls is completely unspecified.
+    ///
+    /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*))
+    /// worst-case, where the key function is *O*(*m*).
+    ///
+    /// For simple key functions (e.g., functions that are property accesses or
+    /// basic operations), [`par_sort_by_key`](#method.par_sort_by_key) is likely to be
+    /// faster.
     ///
     /// # Current implementation
     ///
-    /// The current algorithm is based on Orson Peters' [pattern-defeating quicksort][pdqsort],
-    /// which is a quicksort variant designed to be very fast on certain kinds of patterns,
-    /// sometimes achieving linear time. It is randomized but deterministic, and falls back to
-    /// heapsort on degenerate inputs.
+    /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
+    /// which combines the fast average case of randomized quicksort with the fast worst case of
+    /// heapsort, while achieving linear time on slices with certain patterns. It uses some
+    /// randomization to avoid degenerate cases, but with a fixed seed to always provide
+    /// deterministic behavior.
     ///
-    /// It is generally faster than stable sorting, except in a few special cases, e.g. when the
+    /// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the
+    /// length of the slice.
+    ///
+    /// All quicksorts work in two stages: partitioning into two halves followed by recursive
+    /// calls. The partitioning phase is sequential, but the two recursive calls are performed in
+    /// parallel. Finally, after sorting the cached keys, the item positions are updated sequentially.
+    ///
+    /// [pdqsort]: https://github.com/orlp/pdqsort
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    ///
+    /// let mut v = [-5i32, 4, 32, -3, 2];
+    ///
+    /// v.par_sort_by_cached_key(|k| k.to_string());
+    /// assert!(v == [-3, -5, 2, 32, 4]);
+    /// ```
+    fn par_sort_by_cached_key<K, F>(&mut self, f: F)
+    where
+        F: Fn(&T) -> K + Sync,
+        K: Ord + Send,
+    {
+        let slice = self.as_parallel_slice_mut();
+        let len = slice.len();
+        if len < 2 {
+            return;
+        }
+
+        // Helper macro for indexing our vector by the smallest possible type, to reduce allocation.
+        macro_rules! sort_by_key {
+            ($t:ty) => {{
+                let mut indices: Vec<_> = slice
+                    .par_iter_mut()
+                    .enumerate()
+                    .map(|(i, x)| (f(&*x), i as $t))
+                    .collect();
+                // The elements of `indices` are unique, as they are indexed, so any sort will be
+                // stable with respect to the original slice. We use `sort_unstable` here because
+                // it requires less memory allocation.
+                indices.par_sort_unstable();
+                for i in 0..len {
+                    let mut index = indices[i].1;
+                    while (index as usize) < i {
+                        index = indices[index as usize].1;
+                    }
+                    indices[i].1 = index;
+                    slice.swap(i, index as usize);
+                }
+            }};
+        }
+
+        let sz_u8 = mem::size_of::<(K, u8)>();
+        let sz_u16 = mem::size_of::<(K, u16)>();
+        let sz_u32 = mem::size_of::<(K, u32)>();
+        let sz_usize = mem::size_of::<(K, usize)>();
+
+        if sz_u8 < sz_u16 && len <= (std::u8::MAX as usize) {
+            return sort_by_key!(u8);
+        }
+        if sz_u16 < sz_u32 && len <= (std::u16::MAX as usize) {
+            return sort_by_key!(u16);
+        }
+        if sz_u32 < sz_usize && len <= (std::u32::MAX as usize) {
+            return sort_by_key!(u32);
+        }
+        sort_by_key!(usize)
+    }
+
+    /// Sorts the slice in parallel, but might not preserve the order of equal elements.
+    ///
+    /// This sort is unstable (i.e., may reorder equal elements), in-place
+    /// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
+    ///
+    /// # Current implementation
+    ///
+    /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
+    /// which combines the fast average case of randomized quicksort with the fast worst case of
+    /// heapsort, while achieving linear time on slices with certain patterns. It uses some
+    /// randomization to avoid degenerate cases, but with a fixed seed to always provide
+    /// deterministic behavior.
+    ///
+    /// It is typically faster than stable sorting, except in a few special cases, e.g., when the
     /// slice consists of several concatenated sorted sequences.
     ///
     /// All quicksorts work in two stages: partitioning into two halves followed by recursive
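
To make the index-fixup loop of `par_sort_by_cached_key` (added above) easier to follow, here is the same cycle-following logic replayed sequentially on a tiny input. It is a sketch for illustration, not part of this patch:

fn main() {
    let mut v = ["bbb", "a", "cc"];
    // Pair each element's key with its original index, then sort the pairs.
    let mut indices: Vec<(usize, usize)> =
        v.iter().enumerate().map(|(i, s)| (s.len(), i)).collect();
    indices.sort_unstable(); // [(1, 1), (2, 2), (3, 0)]
    // Walk the permutation, following indices that were already moved, so each
    // element is swapped into its final slot exactly once.
    for i in 0..v.len() {
        let mut index = indices[i].1;
        while index < i {
            index = indices[index].1;
        }
        indices[i].1 = index;
        v.swap(i, index);
    }
    assert_eq!(v, ["a", "cc", "bbb"]);
}
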
@@ -369,20 +554,39 @@
         par_quicksort(self.as_parallel_slice_mut(), T::lt);
     }
 
-    /// Sorts the slice in parallel with a comparator function, but may not preserve the order of
+    /// Sorts the slice in parallel with a comparator function, but might not preserve the order of
     /// equal elements.
     ///
-    /// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
-    /// and `O(n log n)` worst-case.
+    /// This sort is unstable (i.e., may reorder equal elements), in-place
+    /// (i.e., does not allocate), and *O*(*n* \* log(*n*)) worst-case.
+    ///
+    /// The comparator function must define a total ordering for the elements in the slice. If
+    /// the ordering is not total, the order of the elements is unspecified. An order is a
+    /// total order if it is (for all `a`, `b` and `c`):
+    ///
+    /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and
+    /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`.
+    ///
+    /// For example, while [`f64`] doesn't implement [`Ord`] because `NaN != NaN`, we can use
+    /// `partial_cmp` as our sort function when we know the slice doesn't contain a `NaN`.
+    ///
+    /// ```
+    /// use rayon::prelude::*;
+    ///
+    /// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0];
+    /// floats.par_sort_unstable_by(|a, b| a.partial_cmp(b).unwrap());
+    /// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]);
+    /// ```
     ///
     /// # Current implementation
     ///
-    /// The current algorithm is based on Orson Peters' [pattern-defeating quicksort][pdqsort],
-    /// which is a quicksort variant designed to be very fast on certain kinds of patterns,
-    /// sometimes achieving linear time. It is randomized but deterministic, and falls back to
-    /// heapsort on degenerate inputs.
+    /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
+    /// which combines the fast average case of randomized quicksort with the fast worst case of
+    /// heapsort, while achieving linear time on slices with certain patterns. It uses some
+    /// randomization to avoid degenerate cases, but with a fixed seed to always provide
+    /// deterministic behavior.
     ///
-    /// It is generally faster than stable sorting, except in a few special cases, e.g. when the
+    /// It is typically faster than stable sorting, except in a few special cases, e.g., when the
     /// slice consists of several concatenated sorted sequences.
     ///
     /// All quicksorts work in two stages: partitioning into two halves followed by recursive
@@ -413,21 +617,24 @@
         });
     }
 
-    /// Sorts the slice in parallel with a key extraction function, but may not preserve the order
+    /// Sorts the slice in parallel with a key extraction function, but might not preserve the order
     /// of equal elements.
     ///
-    /// This sort is unstable (i.e. may reorder equal elements), in-place (i.e. does not allocate),
-    /// and `O(n log n)` worst-case.
+    /// This sort is unstable (i.e., may reorder equal elements), in-place
+    /// (i.e., does not allocate), and *O*(*m* \* *n* \* log(*n*)) worst-case,
+    /// where the key function is *O*(*m*).
     ///
     /// # Current implementation
     ///
-    /// The current algorithm is based on Orson Peters' [pattern-defeating quicksort][pdqsort],
-    /// which is a quicksort variant designed to be very fast on certain kinds of patterns,
-    /// sometimes achieving linear time. It is randomized but deterministic, and falls back to
-    /// heapsort on degenerate inputs.
+    /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters,
+    /// which combines the fast average case of randomized quicksort with the fast worst case of
+    /// heapsort, while achieving linear time on slices with certain patterns. It uses some
+    /// randomization to avoid degenerate cases, but with a fixed seed to always provide
+    /// deterministic behavior.
     ///
-    /// It is generally faster than stable sorting, except in a few special cases, e.g. when the
-    /// slice consists of several concatenated sorted sequences.
+    /// Due to its key calling strategy, `par_sort_unstable_by_key` is likely to be slower than
+    /// [`par_sort_by_cached_key`](#method.par_sort_by_cached_key) in cases where the key function
+    /// is expensive.
     ///
     /// All quicksorts work in two stages: partitioning into two halves followed by recursive
     /// calls. The partitioning phase is sequential, but the two recursive calls are performed in
@@ -445,10 +652,10 @@
     /// v.par_sort_unstable_by_key(|k| k.abs());
     /// assert_eq!(v, [1, 2, -3, 4, -5]);
     /// ```
-    fn par_sort_unstable_by_key<B, F>(&mut self, f: F)
+    fn par_sort_unstable_by_key<K, F>(&mut self, f: F)
     where
-        B: Ord,
-        F: Fn(&T) -> B + Sync,
+        K: Ord,
+        F: Fn(&T) -> K + Sync,
     {
         par_quicksort(self.as_parallel_slice_mut(), |a, b| f(a).lt(&f(b)));
     }
@@ -544,176 +751,6 @@
     }
 }
 
-/// Parallel iterator over immutable non-overlapping chunks of a slice
-#[derive(Debug)]
-pub struct Chunks<'data, T: Sync> {
-    chunk_size: usize,
-    slice: &'data [T],
-}
-
-impl<'data, T: Sync> Clone for Chunks<'data, T> {
-    fn clone(&self) -> Self {
-        Chunks { ..*self }
-    }
-}
-
-impl<'data, T: Sync + 'data> ParallelIterator for Chunks<'data, T> {
-    type Item = &'data [T];
-
-    fn drive_unindexed<C>(self, consumer: C) -> C::Result
-    where
-        C: UnindexedConsumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn opt_len(&self) -> Option<usize> {
-        Some(self.len())
-    }
-}
-
-impl<'data, T: Sync + 'data> IndexedParallelIterator for Chunks<'data, T> {
-    fn drive<C>(self, consumer: C) -> C::Result
-    where
-        C: Consumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn len(&self) -> usize {
-        div_round_up(self.slice.len(), self.chunk_size)
-    }
-
-    fn with_producer<CB>(self, callback: CB) -> CB::Output
-    where
-        CB: ProducerCallback<Self::Item>,
-    {
-        callback.callback(ChunksProducer {
-            chunk_size: self.chunk_size,
-            slice: self.slice,
-        })
-    }
-}
-
-struct ChunksProducer<'data, T: Sync> {
-    chunk_size: usize,
-    slice: &'data [T],
-}
-
-impl<'data, T: 'data + Sync> Producer for ChunksProducer<'data, T> {
-    type Item = &'data [T];
-    type IntoIter = ::std::slice::Chunks<'data, T>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.slice.chunks(self.chunk_size)
-    }
-
-    fn split_at(self, index: usize) -> (Self, Self) {
-        let elem_index = cmp::min(index * self.chunk_size, self.slice.len());
-        let (left, right) = self.slice.split_at(elem_index);
-        (
-            ChunksProducer {
-                chunk_size: self.chunk_size,
-                slice: left,
-            },
-            ChunksProducer {
-                chunk_size: self.chunk_size,
-                slice: right,
-            },
-        )
-    }
-}
-
-/// Parallel iterator over immutable non-overlapping chunks of a slice
-#[derive(Debug)]
-pub struct ChunksExact<'data, T: Sync> {
-    chunk_size: usize,
-    slice: &'data [T],
-    rem: &'data [T],
-}
-
-impl<'data, T: Sync> ChunksExact<'data, T> {
-    /// Return the remainder of the original slice that is not going to be
-    /// returned by the iterator. The returned slice has at most `chunk_size-1`
-    /// elements.
-    pub fn remainder(&self) -> &'data [T] {
-        self.rem
-    }
-}
-
-impl<'data, T: Sync> Clone for ChunksExact<'data, T> {
-    fn clone(&self) -> Self {
-        ChunksExact { ..*self }
-    }
-}
-
-impl<'data, T: Sync + 'data> ParallelIterator for ChunksExact<'data, T> {
-    type Item = &'data [T];
-
-    fn drive_unindexed<C>(self, consumer: C) -> C::Result
-    where
-        C: UnindexedConsumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn opt_len(&self) -> Option<usize> {
-        Some(self.len())
-    }
-}
-
-impl<'data, T: Sync + 'data> IndexedParallelIterator for ChunksExact<'data, T> {
-    fn drive<C>(self, consumer: C) -> C::Result
-    where
-        C: Consumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn len(&self) -> usize {
-        self.slice.len() / self.chunk_size
-    }
-
-    fn with_producer<CB>(self, callback: CB) -> CB::Output
-    where
-        CB: ProducerCallback<Self::Item>,
-    {
-        callback.callback(ChunksExactProducer {
-            chunk_size: self.chunk_size,
-            slice: self.slice,
-        })
-    }
-}
-
-struct ChunksExactProducer<'data, T: Sync> {
-    chunk_size: usize,
-    slice: &'data [T],
-}
-
-impl<'data, T: 'data + Sync> Producer for ChunksExactProducer<'data, T> {
-    type Item = &'data [T];
-    type IntoIter = ::std::slice::ChunksExact<'data, T>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.slice.chunks_exact(self.chunk_size)
-    }
-
-    fn split_at(self, index: usize) -> (Self, Self) {
-        let elem_index = index * self.chunk_size;
-        let (left, right) = self.slice.split_at(elem_index);
-        (
-            ChunksExactProducer {
-                chunk_size: self.chunk_size,
-                slice: left,
-            },
-            ChunksExactProducer {
-                chunk_size: self.chunk_size,
-                slice: right,
-            },
-        )
-    }
-}
-
 /// Parallel iterator over immutable overlapping windows of a slice
 #[derive(Debug)]
 pub struct Windows<'data, T: Sync> {
@@ -858,187 +895,6 @@
     }
 }
 
-/// Parallel iterator over mutable non-overlapping chunks of a slice
-#[derive(Debug)]
-pub struct ChunksMut<'data, T: Send> {
-    chunk_size: usize,
-    slice: &'data mut [T],
-}
-
-impl<'data, T: Send + 'data> ParallelIterator for ChunksMut<'data, T> {
-    type Item = &'data mut [T];
-
-    fn drive_unindexed<C>(self, consumer: C) -> C::Result
-    where
-        C: UnindexedConsumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn opt_len(&self) -> Option<usize> {
-        Some(self.len())
-    }
-}
-
-impl<'data, T: Send + 'data> IndexedParallelIterator for ChunksMut<'data, T> {
-    fn drive<C>(self, consumer: C) -> C::Result
-    where
-        C: Consumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn len(&self) -> usize {
-        div_round_up(self.slice.len(), self.chunk_size)
-    }
-
-    fn with_producer<CB>(self, callback: CB) -> CB::Output
-    where
-        CB: ProducerCallback<Self::Item>,
-    {
-        callback.callback(ChunksMutProducer {
-            chunk_size: self.chunk_size,
-            slice: self.slice,
-        })
-    }
-}
-
-struct ChunksMutProducer<'data, T: Send> {
-    chunk_size: usize,
-    slice: &'data mut [T],
-}
-
-impl<'data, T: 'data + Send> Producer for ChunksMutProducer<'data, T> {
-    type Item = &'data mut [T];
-    type IntoIter = ::std::slice::ChunksMut<'data, T>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.slice.chunks_mut(self.chunk_size)
-    }
-
-    fn split_at(self, index: usize) -> (Self, Self) {
-        let elem_index = cmp::min(index * self.chunk_size, self.slice.len());
-        let (left, right) = self.slice.split_at_mut(elem_index);
-        (
-            ChunksMutProducer {
-                chunk_size: self.chunk_size,
-                slice: left,
-            },
-            ChunksMutProducer {
-                chunk_size: self.chunk_size,
-                slice: right,
-            },
-        )
-    }
-}
-
-/// Parallel iterator over mutable non-overlapping chunks of a slice
-#[derive(Debug)]
-pub struct ChunksExactMut<'data, T: Send> {
-    chunk_size: usize,
-    slice: &'data mut [T],
-    rem: &'data mut [T],
-}
-
-impl<'data, T: Send> ChunksExactMut<'data, T> {
-    /// Return the remainder of the original slice that is not going to be
-    /// returned by the iterator. The returned slice has at most `chunk_size-1`
-    /// elements.
-    ///
-    /// Note that this has to consume `self` to return the original lifetime of
-    /// the data, which prevents this from actually being used as a parallel
-    /// iterator since that also consumes. This method is provided for parity
-    /// with `std::iter::ChunksExactMut`, but consider calling `remainder()` or
-    /// `take_remainder()` as alternatives.
-    pub fn into_remainder(self) -> &'data mut [T] {
-        self.rem
-    }
-
-    /// Return the remainder of the original slice that is not going to be
-    /// returned by the iterator. The returned slice has at most `chunk_size-1`
-    /// elements.
-    ///
-    /// Consider `take_remainder()` if you need access to the data with its
-    /// original lifetime, rather than borrowing through `&mut self` here.
-    pub fn remainder(&mut self) -> &mut [T] {
-        self.rem
-    }
-
-    /// Return the remainder of the original slice that is not going to be
-    /// returned by the iterator. The returned slice has at most `chunk_size-1`
-    /// elements. Subsequent calls will return an empty slice.
-    pub fn take_remainder(&mut self) -> &'data mut [T] {
-        std::mem::replace(&mut self.rem, &mut [])
-    }
-}
-
-impl<'data, T: Send + 'data> ParallelIterator for ChunksExactMut<'data, T> {
-    type Item = &'data mut [T];
-
-    fn drive_unindexed<C>(self, consumer: C) -> C::Result
-    where
-        C: UnindexedConsumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn opt_len(&self) -> Option<usize> {
-        Some(self.len())
-    }
-}
-
-impl<'data, T: Send + 'data> IndexedParallelIterator for ChunksExactMut<'data, T> {
-    fn drive<C>(self, consumer: C) -> C::Result
-    where
-        C: Consumer<Self::Item>,
-    {
-        bridge(self, consumer)
-    }
-
-    fn len(&self) -> usize {
-        self.slice.len() / self.chunk_size
-    }
-
-    fn with_producer<CB>(self, callback: CB) -> CB::Output
-    where
-        CB: ProducerCallback<Self::Item>,
-    {
-        callback.callback(ChunksExactMutProducer {
-            chunk_size: self.chunk_size,
-            slice: self.slice,
-        })
-    }
-}
-
-struct ChunksExactMutProducer<'data, T: Send> {
-    chunk_size: usize,
-    slice: &'data mut [T],
-}
-
-impl<'data, T: 'data + Send> Producer for ChunksExactMutProducer<'data, T> {
-    type Item = &'data mut [T];
-    type IntoIter = ::std::slice::ChunksExactMut<'data, T>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.slice.chunks_exact_mut(self.chunk_size)
-    }
-
-    fn split_at(self, index: usize) -> (Self, Self) {
-        let elem_index = index * self.chunk_size;
-        let (left, right) = self.slice.split_at_mut(elem_index);
-        (
-            ChunksExactMutProducer {
-                chunk_size: self.chunk_size,
-                slice: left,
-            },
-            ChunksExactMutProducer {
-                chunk_size: self.chunk_size,
-                slice: right,
-            },
-        )
-    }
-}
-
 /// Parallel iterator over slices separated by a predicate
 pub struct Split<'data, T, P> {
     slice: &'data [T],
diff --git a/src/slice/quicksort.rs b/src/slice/quicksort.rs
index 17c6f33..d71079e 100644
--- a/src/slice/quicksort.rs
+++ b/src/slice/quicksort.rs
@@ -5,46 +5,20 @@
 //! `rayon_core::join`.
 
 use std::cmp;
-use std::mem;
+use std::mem::{self, MaybeUninit};
 use std::ptr;
 
-/// When dropped, takes the value out of `Option` and writes it into `dest`.
-///
-/// This allows us to safely read the pivot into a stack-allocated variable for efficiency, and
-/// write it back into the slice after partitioning. This way we ensure that the write happens
-/// even if `is_less` panics in the meantime.
-struct WriteOnDrop<T> {
-    value: Option<T>,
-    dest: *mut T,
-}
-
-impl<T> Drop for WriteOnDrop<T> {
-    fn drop(&mut self) {
-        unsafe {
-            ptr::write(self.dest, self.value.take().unwrap());
-        }
-    }
-}
-
-/// Holds a value, but never drops it.
-struct NoDrop<T> {
-    value: Option<T>,
-}
-
-impl<T> Drop for NoDrop<T> {
-    fn drop(&mut self) {
-        mem::forget(self.value.take());
-    }
-}
-
 /// When dropped, copies from `src` into `dest`.
 struct CopyOnDrop<T> {
-    src: *mut T,
+    src: *const T,
     dest: *mut T,
 }
 
 impl<T> Drop for CopyOnDrop<T> {
     fn drop(&mut self) {
+        // SAFETY: This is a helper struct; refer to its usage for correctness.
+        //         Namely, the caller must ensure that `src` and `dest` do not overlap,
+        //         as required by `ptr::copy_nonoverlapping`.
         unsafe {
             ptr::copy_nonoverlapping(self.src, self.dest, 1);
         }
@@ -57,29 +31,43 @@
     F: Fn(&T, &T) -> bool,
 {
     let len = v.len();
+    // SAFETY: The unsafe operations below involve indexing without a bounds check (by offsetting a
+    // pointer) and copying memory (`ptr::copy_nonoverlapping`).
+    //
+    // a. Indexing:
+    //  1. We checked that the size of the array is >= 2.
+    //  2. All the indexing that we do stays within the range `0 <= index < len`.
+    //
+    // b. Memory copying:
+    //  1. We obtain pointers from references, which are guaranteed to be valid.
+    //  2. They cannot overlap because we obtain pointers to different indices of the slice,
+    //     namely `i` and `i - 1`.
+    //  3. If the slice is properly aligned, the elements are properly aligned.
+    //     It is the caller's responsibility to make sure the slice is properly aligned.
+    //
+    // See the comments below for further detail.
     unsafe {
         // If the first two elements are out-of-order...
         if len >= 2 && is_less(v.get_unchecked(1), v.get_unchecked(0)) {
             // Read the first element into a stack-allocated variable. If a following comparison
             // operation panics, `hole` will get dropped and automatically write the element back
             // into the slice.
-            let mut tmp = NoDrop {
-                value: Some(ptr::read(v.get_unchecked(0))),
-            };
+            let tmp = mem::ManuallyDrop::new(ptr::read(v.get_unchecked(0)));
+            let v = v.as_mut_ptr();
             let mut hole = CopyOnDrop {
-                src: tmp.value.as_mut().unwrap(),
-                dest: v.get_unchecked_mut(1),
+                src: &*tmp,
+                dest: v.add(1),
             };
-            ptr::copy_nonoverlapping(v.get_unchecked(1), v.get_unchecked_mut(0), 1);
+            ptr::copy_nonoverlapping(v.add(1), v.add(0), 1);
 
             for i in 2..len {
-                if !is_less(v.get_unchecked(i), tmp.value.as_ref().unwrap()) {
+                if !is_less(&*v.add(i), &*tmp) {
                     break;
                 }
 
                 // Move `i`-th element one place to the left, thus shifting the hole to the right.
-                ptr::copy_nonoverlapping(v.get_unchecked(i), v.get_unchecked_mut(i - 1), 1);
-                hole.dest = v.get_unchecked_mut(i);
+                ptr::copy_nonoverlapping(v.add(i), v.add(i - 1), 1);
+                hole.dest = v.add(i);
             }
             // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
         }
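
The `ManuallyDrop` plus `CopyOnDrop` pattern introduced above can be seen in isolation in the following standalone sketch (not part of this patch). The guard fills the current hole when it drops, so the slice stays fully initialized even if a comparison were to panic mid-shift:

use std::mem::ManuallyDrop;
use std::ptr;

struct CopyOnDrop<T> {
    src: *const T,
    dest: *mut T,
}

impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        // Valid as long as `src` and `dest` are live and non-overlapping.
        unsafe { ptr::copy_nonoverlapping(self.src, self.dest, 1) }
    }
}

fn main() {
    let mut v = vec![String::from("b"), String::from("a")];
    unsafe {
        // Read v[0] into a stack temporary that is never dropped on its own.
        let tmp = ManuallyDrop::new(ptr::read(&v[0]));
        let p = v.as_mut_ptr();
        // If anything after this point panicked, the guard would copy `tmp`
        // into v[1], leaving no uninitialized hole behind.
        let hole = CopyOnDrop { src: &*tmp, dest: p.add(1) };
        // Shift v[1] into the hole at index 0, then let the guard fill the
        // new hole at index 1 with `tmp`.
        ptr::copy_nonoverlapping(p.add(1), p, 1);
        drop(hole);
    }
    assert_eq!(v, ["a", "b"]);
}
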
@@ -92,29 +80,43 @@
     F: Fn(&T, &T) -> bool,
 {
     let len = v.len();
+    // SAFETY: The unsafe operations below involve indexing without a bounds check (by offsetting a
+    // pointer) and copying memory (`ptr::copy_nonoverlapping`).
+    //
+    // a. Indexing:
+    //  1. We checked that the size of the array is >= 2.
+    //  2. All the indexing that we do stays within the range `0 <= index < len - 1`.
+    //
+    // b. Memory copying:
+    //  1. We obtain pointers from references, which are guaranteed to be valid.
+    //  2. They cannot overlap because we obtain pointers to different indices of the slice,
+    //     namely `i` and `i + 1`.
+    //  3. If the slice is properly aligned, the elements are properly aligned.
+    //     It is the caller's responsibility to make sure the slice is properly aligned.
+    //
+    // See the comments below for further detail.
     unsafe {
         // If the last two elements are out-of-order...
         if len >= 2 && is_less(v.get_unchecked(len - 1), v.get_unchecked(len - 2)) {
             // Read the last element into a stack-allocated variable. If a following comparison
             // operation panics, `hole` will get dropped and automatically write the element back
             // into the slice.
-            let mut tmp = NoDrop {
-                value: Some(ptr::read(v.get_unchecked(len - 1))),
-            };
+            let tmp = mem::ManuallyDrop::new(ptr::read(v.get_unchecked(len - 1)));
+            let v = v.as_mut_ptr();
             let mut hole = CopyOnDrop {
-                src: tmp.value.as_mut().unwrap(),
-                dest: v.get_unchecked_mut(len - 2),
+                src: &*tmp,
+                dest: v.add(len - 2),
             };
-            ptr::copy_nonoverlapping(v.get_unchecked(len - 2), v.get_unchecked_mut(len - 1), 1);
+            ptr::copy_nonoverlapping(v.add(len - 2), v.add(len - 1), 1);
 
             for i in (0..len - 2).rev() {
-                if !is_less(&tmp.value.as_ref().unwrap(), v.get_unchecked(i)) {
+                if !is_less(&*tmp, &*v.add(i)) {
                     break;
                 }
 
                 // Move `i`-th element one place to the right, thus shifting the hole to the left.
-                ptr::copy_nonoverlapping(v.get_unchecked(i), v.get_unchecked_mut(i + 1), 1);
-                hole.dest = v.get_unchecked_mut(i);
+                ptr::copy_nonoverlapping(v.add(i), v.add(i + 1), 1);
+                hole.dest = v.add(i);
             }
             // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`.
         }
@@ -123,7 +125,7 @@
 
 /// Partially sorts a slice by shifting several out-of-order elements around.
 ///
-/// Returns `true` if the slice is sorted at the end. This function is `O(n)` worst-case.
+/// Returns `true` if the slice is sorted at the end. This function is *O*(*n*) worst-case.
 #[cold]
 fn partial_insertion_sort<T, F>(v: &mut [T], is_less: &F) -> bool
 where
@@ -138,6 +140,8 @@
     let mut i = 1;
 
     for _ in 0..MAX_STEPS {
+        // SAFETY: We already explicitly did the bounds checking with `i < len`.
+        // All our subsequent indexing stays within the range `0 <= index < len`.
         unsafe {
             // Find the next pair of adjacent out-of-order elements.
             while i < len && !is_less(v.get_unchecked(i), v.get_unchecked(i - 1)) {
@@ -168,17 +172,17 @@
     false
 }
 
-/// Sorts a slice using insertion sort, which is `O(n^2)` worst-case.
+/// Sorts a slice using insertion sort, which is *O*(*n*^2) worst-case.
 fn insertion_sort<T, F>(v: &mut [T], is_less: &F)
 where
     F: Fn(&T, &T) -> bool,
 {
     for i in 1..v.len() {
-        shift_tail(&mut v[..=i], is_less);
+        shift_tail(&mut v[..i + 1], is_less);
     }
 }
 
-/// Sorts `v` using heapsort, which guarantees `O(n log n)` worst-case.
+/// Sorts `v` using heapsort, which guarantees *O*(*n* \* log(*n*)) worst-case.
 #[cold]
 fn heapsort<T, F>(v: &mut [T], is_less: &F)
 where
@@ -187,25 +191,25 @@
     // This binary heap respects the invariant `parent >= child`.
     let sift_down = |v: &mut [T], mut node| {
         loop {
-            // Children of `node`:
-            let left = 2 * node + 1;
-            let right = 2 * node + 2;
+            // Children of `node`.
+            let mut child = 2 * node + 1;
+            if child >= v.len() {
+                break;
+            }
 
             // Choose the greater child.
-            let greater = if right < v.len() && is_less(&v[left], &v[right]) {
-                right
-            } else {
-                left
-            };
+            if child + 1 < v.len() && is_less(&v[child], &v[child + 1]) {
+                child += 1;
+            }
 
             // Stop if the invariant holds at `node`.
-            if greater >= v.len() || !is_less(&v[node], &v[greater]) {
+            if !is_less(&v[node], &v[child]) {
                 break;
             }
 
             // Swap `node` with the greater child, move one step down, and continue sifting.
-            v.swap(node, greater);
-            node = greater;
+            v.swap(node, child);
+            node = child;
         }
     };
 
@@ -250,23 +254,30 @@
     // 3. `end` - End pointer into the `offsets` array.
     // 4. `offsets` - Indices of out-of-order elements within the block.
 
-    // The current block on the left side (from `l` to `l.offset(block_l)`).
+    // The current block on the left side (from `l` to `l.add(block_l)`).
     let mut l = v.as_mut_ptr();
     let mut block_l = BLOCK;
     let mut start_l = ptr::null_mut();
     let mut end_l = ptr::null_mut();
-    let mut offsets_l = [0u8; BLOCK];
+    let mut offsets_l = [MaybeUninit::<u8>::uninit(); BLOCK];
 
-    // The current block on the right side (from `r.offset(-block_r)` to `r`).
+    // The current block on the right side (from `r.sub(block_r)` to `r`).
+    // SAFETY: The documentation for `.add()` specifically mentions that `vec.as_ptr().add(vec.len())` is always safe.
     let mut r = unsafe { l.add(v.len()) };
     let mut block_r = BLOCK;
     let mut start_r = ptr::null_mut();
     let mut end_r = ptr::null_mut();
-    let mut offsets_r = [0u8; BLOCK];
+    let mut offsets_r = [MaybeUninit::<u8>::uninit(); BLOCK];
+
+    // FIXME: When we get VLAs, try creating one array of length `min(v.len(), 2 * BLOCK)` rather
+    // than two fixed-size arrays of length `BLOCK`. VLAs might be more cache-efficient.
 
     // Returns the number of elements between pointers `l` (inclusive) and `r` (exclusive).
     fn width<T>(l: *mut T, r: *mut T) -> usize {
         assert!(mem::size_of::<T>() > 0);
+        // FIXME: this should *likely* use `offset_from`, but more
+        // investigation is needed (including running tests in miri).
+        // TODO unstable: (r.addr() - l.addr()) / mem::size_of::<T>()
         (r as usize - l as usize) / mem::size_of::<T>()
     }
 
@@ -289,6 +300,9 @@
             } else if start_r < end_r {
                 block_l = rem;
             } else {
+                // There were the same number of elements to switch on both blocks during the last
+                // iteration, so there are no remaining elements on either block. Cover the remaining
+                // items with roughly equally-sized blocks.
                 block_l = rem / 2;
                 block_r = rem - block_l;
             }
@@ -298,11 +312,22 @@
 
         if start_l == end_l {
             // Trace `block_l` elements from the left side.
-            start_l = offsets_l.as_mut_ptr();
-            end_l = offsets_l.as_mut_ptr();
+            // TODO unstable: start_l = MaybeUninit::slice_as_mut_ptr(&mut offsets_l);
+            start_l = offsets_l.as_mut_ptr() as *mut u8;
+            end_l = start_l;
             let mut elem = l;
 
             for i in 0..block_l {
+                // SAFETY: The unsafe operations below involve the use of `offset`.
+                //         According to the conditions required by that function, we satisfy them because:
+                //         1. `offsets_l` is stack-allocated, and thus considered a separate allocated object.
+                //         2. The function `is_less` returns a `bool`.
+                //            Casting a `bool` will never overflow `isize`.
+                //         3. We have guaranteed that `block_l` will be `<= BLOCK`.
+                //            Plus, `end_l` was initially set to the start pointer of `offsets_l`, which was declared on the stack.
+                //            Thus, we know that even in the worst case (all invocations of `is_less` return false) we will only be at most 1 byte past the end.
+                //        Another unsafe operation here is dereferencing `elem`.
+                //        However, `elem` was initially the start pointer of the slice, which is always valid.
                 unsafe {
                     // Branchless comparison.
                     *end_l = i as u8;
@@ -314,11 +339,23 @@
 
         if start_r == end_r {
             // Trace `block_r` elements from the right side.
-            start_r = offsets_r.as_mut_ptr();
-            end_r = offsets_r.as_mut_ptr();
+            // TODO unstable: start_r = MaybeUninit::slice_as_mut_ptr(&mut offsets_r);
+            start_r = offsets_r.as_mut_ptr() as *mut u8;
+            end_r = start_r;
             let mut elem = r;
 
             for i in 0..block_r {
+                // SAFETY: The unsafe operations below involve the use of `offset`.
+                //         According to the conditions required by that function, we satisfy them because:
+                //         1. `offsets_r` is stack-allocated, and thus considered a separate allocated object.
+                //         2. The function `is_less` returns a `bool`.
+                //            Casting a `bool` will never overflow `isize`.
+                //         3. We have guaranteed that `block_r` will be `<= BLOCK`.
+                //            Plus, `end_r` was initially set to the start pointer of `offsets_r`, which was declared on the stack.
+                //            Thus, we know that even in the worst case (all invocations of `is_less` return true) we will only be at most 1 byte past the end.
+                //        Another unsafe operation here is dereferencing `elem`.
+                //        However, `elem` was initially `1 * sizeof(T)` past the end and we decrement it by `1 * sizeof(T)` before accessing it.
+                //        Plus, `block_r` was asserted to be at most `BLOCK`, so `elem` will at worst point to the beginning of the slice.
                 unsafe {
                     // Branchless comparison.
                     elem = elem.offset(-1);
@@ -346,6 +383,22 @@
             // Instead of swapping one pair at a time, it is more efficient to perform a cyclic
             // permutation. This is not strictly equivalent to swapping, but produces a similar
             // result using fewer memory operations.
+
+            // SAFETY: The use of `ptr::read` is valid because there is at least one element in
+            // both `offsets_l` and `offsets_r`, so `left!` is a valid pointer to read from.
+            //
+            // The uses of `left!` involve calls to `offset` on `l`, which points to the
+            // beginning of `v`. All the offsets pointed-to by `start_l` are at most `block_l`, so
+            // these `offset` calls are safe as all reads are within the block. The same argument
+            // applies for the uses of `right!`.
+            //
+            // The calls to `start_l.offset` are valid because there are at most `count-1` of them,
+            // plus the final one at the end of the unsafe block, where `count` is the minimum number
+            // of collected offsets in `offsets_l` and `offsets_r`, so there is no risk of there not
+            // being enough elements. The same reasoning applies to the calls to `start_r.offset`.
+            //
+            // The calls to `copy_nonoverlapping` are safe because `left!` and `right!` are guaranteed
+            // not to overlap, and are valid because of the reasoning above.
             unsafe {
                 let tmp = ptr::read(left!());
                 ptr::copy_nonoverlapping(right!(), left!(), 1);
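
The cyclic permutation mentioned in the comment above, shown on a plain array as an illustrative sketch (not part of this patch): the out-of-place values are rotated through a single temporary rather than exchanged with pairwise swaps, which needs fewer moves overall.

fn main() {
    let mut v = [1, 2, 3];
    unsafe {
        let p = v.as_mut_ptr();
        let tmp = std::ptr::read(p);                           // tmp  = v[0]
        std::ptr::copy_nonoverlapping(p.add(1), p, 1);         // v[0] = v[1]
        std::ptr::copy_nonoverlapping(p.add(2), p.add(1), 1);  // v[1] = v[2]
        std::ptr::write(p.add(2), tmp);                        // v[2] = tmp
    }
    assert_eq!(v, [2, 3, 1]);
}
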
@@ -366,12 +419,22 @@
 
         if start_l == end_l {
             // All out-of-order elements in the left block were moved. Move to the next block.
+
+            // block-width-guarantee
+            // SAFETY: if `!is_done` then the slice width is guaranteed to be at least `2*BLOCK` wide. There
+            // are at most `BLOCK` elements in `offsets_l` because of its size, so the `offset` operation is
+            // safe. Otherwise, the debug assertions in the `is_done` case guarantee that
+            // `width(l, r) == block_l + block_r`, namely, that the block sizes have been adjusted to account
+            // for the smaller number of remaining elements.
             l = unsafe { l.add(block_l) };
         }
 
         if start_r == end_r {
             // All out-of-order elements in the right block were moved. Move to the previous block.
-            r = unsafe { r.sub(block_r) };
+
+            // SAFETY: Same argument as [block-width-guarantee]. Either this is a full block `2*BLOCK`-wide,
+            // or `block_r` has been adjusted for the last handful of elements.
+            r = unsafe { r.offset(-(block_r as isize)) };
         }
 
         if is_done {
@@ -385,9 +448,20 @@
 
     if start_l < end_l {
         // The left block remains.
-        // Move it's remaining out-of-order elements to the far right.
+        // Move its remaining out-of-order elements to the far right.
         debug_assert_eq!(width(l, r), block_l);
         while start_l < end_l {
+            // remaining-elements-safety
+            // SAFETY: while the loop condition holds there are still elements in `offsets_l`, so it
+            // is safe to point `end_l` to the previous element.
+            //
+            // The `ptr::swap` is safe if both its arguments are valid for reads and writes:
+            //  - Per the debug assert above, the distance between `l` and `r` is `block_l`
+            //    elements, so there can be at most `block_l` remaining offsets between `start_l`
+            //    and `end_l`. This means `r` will be moved at most `block_l` steps back, which
+            //    makes the `r.offset` calls valid (at that point `l == r`).
+            //  - `offsets_l` contains valid offsets into `v` collected during the partitioning of
+            //    the last block, so the `l.offset` calls are valid.
             unsafe {
                 end_l = end_l.offset(-1);
                 ptr::swap(l.offset(*end_l as isize), r.offset(-1));
@@ -397,9 +471,10 @@
         width(v.as_mut_ptr(), r)
     } else if start_r < end_r {
         // The right block remains.
-        // Move it's remaining out-of-order elements to the far left.
+        // Move its remaining out-of-order elements to the far left.
         debug_assert_eq!(width(l, r), block_r);
         while start_r < end_r {
+            // SAFETY: See the reasoning in [remaining-elements-safety].
             unsafe {
                 end_r = end_r.offset(-1);
                 ptr::swap(l, r.offset(-(*end_r as isize) - 1));
@@ -432,17 +507,25 @@
 
         // Read the pivot into a stack-allocated variable for efficiency. If a following comparison
         // operation panics, the pivot will be automatically written back into the slice.
-        let write_on_drop = WriteOnDrop {
-            value: unsafe { Some(ptr::read(pivot)) },
+
+        // SAFETY: `pivot` is a reference to the first element of `v`, so `ptr::read` is safe.
+        let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) });
+        let _pivot_guard = CopyOnDrop {
+            src: &*tmp,
             dest: pivot,
         };
-        let pivot = write_on_drop.value.as_ref().unwrap();
+        let pivot = &*tmp;
 
         // Find the first pair of out-of-order elements.
         let mut l = 0;
         let mut r = v.len();
+
+        // SAFETY: The unsafe operations below involve indexing an array.
+        // For the first one: we already do the bounds checking here with `l < r`.
+        // For the second one: we initially have `l == 0` and `r == v.len()`, and we check `l < r` before
+        //                     every indexing operation, so `r >= 1` and the index `r - 1` stays in bounds.
         unsafe {
-            // Find the first element greater then or equal to the pivot.
+            // Find the first element greater than or equal to the pivot.
             while l < r && is_less(v.get_unchecked(l), pivot) {
                 l += 1;
             }
@@ -458,7 +541,7 @@
             l >= r,
         )
 
-        // `write_on_drop` goes out of scope and writes the pivot (which is a stack-allocated
+        // `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated
         // variable) back into the slice where it originally was. This step is critical in ensuring
         // safety!
     };
@@ -484,18 +567,24 @@
 
     // Read the pivot into a stack-allocated variable for efficiency. If a following comparison
     // operation panics, the pivot will be automatically written back into the slice.
-    let write_on_drop = WriteOnDrop {
-        value: unsafe { Some(ptr::read(pivot)) },
+    // SAFETY: The pointer here is valid because it is obtained from a reference to a slice.
+    let tmp = mem::ManuallyDrop::new(unsafe { ptr::read(pivot) });
+    let _pivot_guard = CopyOnDrop {
+        src: &*tmp,
         dest: pivot,
     };
-    let pivot = write_on_drop.value.as_ref().unwrap();
+    let pivot = &*tmp;
 
     // Now partition the slice.
     let mut l = 0;
     let mut r = v.len();
     loop {
+        // SAFETY: The unsafe operations below involve indexing an array.
+        // For the first one: we already do the bounds checking here with `l < r`.
+        // For the second one: we initially have `l == 0` and `r == v.len()`, and we check `l < r` before
+        //                     every indexing operation, so `r >= 1` and the index `r - 1` stays in bounds.
         unsafe {
-            // Find the first element greater that the pivot.
+            // Find the first element greater than the pivot.
             while l < r && !is_less(pivot, v.get_unchecked(l)) {
                 l += 1;
             }
@@ -512,7 +601,8 @@
 
             // Swap the found pair of out-of-order elements.
             r -= 1;
-            ptr::swap(v.get_unchecked_mut(l), v.get_unchecked_mut(r));
+            let ptr = v.as_mut_ptr();
+            ptr::swap(ptr.add(l), ptr.add(r));
             l += 1;
         }
     }
@@ -520,7 +610,7 @@
     // We found `l` elements equal to the pivot. Add 1 to account for the pivot itself.
     l + 1
 
-    // `write_on_drop` goes out of scope and writes the pivot (which is a stack-allocated variable)
+    // `_pivot_guard` goes out of scope and writes the pivot (which is a stack-allocated variable)
     // back into the slice where it originally was. This step is critical in ensuring safety!
 }
 
@@ -539,10 +629,10 @@
             random
         };
         let mut gen_usize = || {
-            if mem::size_of::<usize>() <= 4 {
+            if usize::BITS <= 32 {
                 gen_u32() as usize
             } else {
-                ((u64::from(gen_u32()) << 32) | u64::from(gen_u32())) as usize
+                (((gen_u32() as u64) << 32) | (gen_u32() as u64)) as usize
             }
         };
 
@@ -585,6 +675,7 @@
     let len = v.len();
 
     // Three indices near which we are going to choose a pivot.
+    #[allow(clippy::identity_op)]
     let mut a = len / 4 * 1;
     let mut b = len / 4 * 2;
     let mut c = len / 4 * 3;
@@ -594,6 +685,12 @@
 
     if len >= 8 {
         // Swaps indices so that `v[a] <= v[b]`.
+        // SAFETY: `len >= 8` so there are at least two elements in the neighborhoods of
+        // `a`, `b` and `c`. This means the three calls to `sort_adjacent` result in
+        // corresponding calls to `sort3` with valid 3-item neighborhoods around each
+        // pointer, which in turn means the calls to `sort2` are done with valid
+        // references. Thus the `v.get_unchecked` calls are safe, as is the `ptr::swap`
+        // call.
         let mut sort2 = |a: &mut usize, b: &mut usize| unsafe {
             if is_less(v.get_unchecked(*b), v.get_unchecked(*a)) {
                 ptr::swap(a, b);
@@ -641,7 +738,7 @@
 ///
 /// `limit` is the number of allowed imbalanced partitions before switching to `heapsort`. If zero,
 /// this function will immediately switch to heapsort.
-fn recurse<'a, T, F>(mut v: &'a mut [T], is_less: &F, mut pred: Option<&'a mut T>, mut limit: usize)
+fn recurse<'a, T, F>(mut v: &'a mut [T], is_less: &F, mut pred: Option<&'a mut T>, mut limit: u32)
 where
     T: Send,
     F: Fn(&T, &T) -> bool + Sync,
@@ -667,7 +764,7 @@
         }
 
         // If too many bad pivot choices were made, simply fall back to heapsort in order to
-        // guarantee `O(n log n)` worst-case.
+        // guarantee `O(n * log(n))` worst-case.
         if limit == 0 {
             heapsort(v, is_less);
             return;
@@ -701,7 +798,7 @@
                 let mid = partition_equal(v, pivot, is_less);
 
                 // Continue sorting elements greater than the pivot.
-                v = &mut { v }[mid..];
+                v = &mut v[mid..];
                 continue;
             }
         }
@@ -712,7 +809,7 @@
         was_partitioned = was_p;
 
         // Split the slice into `left`, `pivot`, and `right`.
-        let (left, right) = { v }.split_at_mut(mid);
+        let (left, right) = v.split_at_mut(mid);
         let (pivot, right) = right.split_at_mut(1);
         let pivot = &mut pivot[0];
 
@@ -741,7 +838,7 @@
 
 /// Sorts `v` using pattern-defeating quicksort in parallel.
 ///
-/// The algorithm is unstable, in-place, and `O(n log n)` worst-case.
+/// The algorithm is unstable, in-place, and *O*(*n* \* log(*n*)) worst-case.
 pub(super) fn par_quicksort<T, F>(v: &mut [T], is_less: F)
 where
     T: Send,
@@ -753,7 +850,7 @@
     }
 
     // Limit the number of imbalanced partitions to `floor(log2(len)) + 1`.
-    let limit = mem::size_of::<usize>() * 8 - v.len().leading_zeros() as usize;
+    let limit = usize::BITS - v.len().leading_zeros();
 
     recurse(v, &is_less, None, limit);
 }
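
Editorial illustration (not part of the patch): for any nonzero `len`, the new expression `usize::BITS - len.leading_zeros()` equals `floor(log2(len)) + 1`, matching the comment above.

fn limit(len: usize) -> u32 {
    usize::BITS - len.leading_zeros()
}

fn main() {
    assert_eq!(limit(1), 1); // floor(log2(1)) + 1 = 1
    assert_eq!(limit(2), 2); // floor(log2(2)) + 1 = 2
    assert_eq!(limit(1000), 10); // floor(log2(1000)) + 1 = 10
}
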
@@ -766,7 +863,7 @@
 
     #[test]
     fn test_heapsort() {
-        let ref mut rng = thread_rng();
+        let rng = &mut thread_rng();
 
         for len in (0..25).chain(500..501) {
             for &modulus in &[5, 10, 100] {
@@ -793,8 +890,8 @@
         heapsort(&mut v, &|_, _| thread_rng().gen());
         heapsort(&mut v, &|a, b| a < b);
 
-        for i in 0..v.len() {
-            assert_eq!(v[i], i);
+        for (i, &entry) in v.iter().enumerate() {
+            assert_eq!(entry, i);
         }
     }
 }
diff --git a/src/slice/rchunks.rs b/src/slice/rchunks.rs
new file mode 100644
index 0000000..63e765d
--- /dev/null
+++ b/src/slice/rchunks.rs
@@ -0,0 +1,386 @@
+use crate::iter::plumbing::*;
+use crate::iter::*;
+use crate::math::div_round_up;
+
+/// Parallel iterator over immutable non-overlapping chunks of a slice, starting at the end.
+#[derive(Debug)]
+pub struct RChunks<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+}
+
+impl<'data, T: Sync> RChunks<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data [T]) -> Self {
+        Self { chunk_size, slice }
+    }
+}
+
+impl<'data, T: Sync> Clone for RChunks<'data, T> {
+    fn clone(&self) -> Self {
+        RChunks { ..*self }
+    }
+}
+
+impl<'data, T: Sync + 'data> ParallelIterator for RChunks<'data, T> {
+    type Item = &'data [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Sync + 'data> IndexedParallelIterator for RChunks<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        div_round_up(self.slice.len(), self.chunk_size)
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(RChunksProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct RChunksProducer<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+}
+
+impl<'data, T: 'data + Sync> Producer for RChunksProducer<'data, T> {
+    type Item = &'data [T];
+    type IntoIter = ::std::slice::RChunks<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.rchunks(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = self.slice.len().saturating_sub(index * self.chunk_size);
+        let (left, right) = self.slice.split_at(elem_index);
+        (
+            RChunksProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+            RChunksProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+        )
+    }
+}
+
+/// Parallel iterator over immutable non-overlapping chunks of exactly `chunk_size` elements of a slice, starting at the end.
+#[derive(Debug)]
+pub struct RChunksExact<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+    rem: &'data [T],
+}
+
+impl<'data, T: Sync> RChunksExact<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data [T]) -> Self {
+        let rem_len = slice.len() % chunk_size;
+        let (rem, slice) = slice.split_at(rem_len);
+        Self {
+            chunk_size,
+            slice,
+            rem,
+        }
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements.
+    pub fn remainder(&self) -> &'data [T] {
+        self.rem
+    }
+}
+
+impl<'data, T: Sync> Clone for RChunksExact<'data, T> {
+    fn clone(&self) -> Self {
+        RChunksExact { ..*self }
+    }
+}
+
+impl<'data, T: Sync + 'data> ParallelIterator for RChunksExact<'data, T> {
+    type Item = &'data [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Sync + 'data> IndexedParallelIterator for RChunksExact<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        self.slice.len() / self.chunk_size
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(RChunksExactProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct RChunksExactProducer<'data, T: Sync> {
+    chunk_size: usize,
+    slice: &'data [T],
+}
+
+impl<'data, T: 'data + Sync> Producer for RChunksExactProducer<'data, T> {
+    type Item = &'data [T];
+    type IntoIter = ::std::slice::RChunksExact<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.rchunks_exact(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = self.slice.len() - index * self.chunk_size;
+        let (left, right) = self.slice.split_at(elem_index);
+        (
+            RChunksExactProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+            RChunksExactProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+        )
+    }
+}
+
+/// Parallel iterator over mutable non-overlapping chunks of a slice, starting at the end.
+#[derive(Debug)]
+pub struct RChunksMut<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+}
+
+impl<'data, T: Send> RChunksMut<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data mut [T]) -> Self {
+        Self { chunk_size, slice }
+    }
+}
+
+impl<'data, T: Send + 'data> ParallelIterator for RChunksMut<'data, T> {
+    type Item = &'data mut [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Send + 'data> IndexedParallelIterator for RChunksMut<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        div_round_up(self.slice.len(), self.chunk_size)
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(RChunksMutProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct RChunksMutProducer<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+}
+
+impl<'data, T: 'data + Send> Producer for RChunksMutProducer<'data, T> {
+    type Item = &'data mut [T];
+    type IntoIter = ::std::slice::RChunksMut<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.rchunks_mut(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = self.slice.len().saturating_sub(index * self.chunk_size);
+        let (left, right) = self.slice.split_at_mut(elem_index);
+        (
+            RChunksMutProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+            RChunksMutProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+        )
+    }
+}
+
+/// Parallel iterator over mutable non-overlapping chunks of exactly `chunk_size` elements of a slice, starting at the end.
+#[derive(Debug)]
+pub struct RChunksExactMut<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+    rem: &'data mut [T],
+}
+
+impl<'data, T: Send> RChunksExactMut<'data, T> {
+    pub(super) fn new(chunk_size: usize, slice: &'data mut [T]) -> Self {
+        let rem_len = slice.len() % chunk_size;
+        let (rem, slice) = slice.split_at_mut(rem_len);
+        Self {
+            chunk_size,
+            slice,
+            rem,
+        }
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements.
+    ///
+    /// Note that this has to consume `self` to return the original lifetime of
+    /// the data, which prevents this from actually being used as a parallel
+    /// iterator since that also consumes. This method is provided for parity
+    /// with `std::slice::RChunksExactMut`, but consider calling `remainder()` or
+    /// `take_remainder()` as alternatives.
+    pub fn into_remainder(self) -> &'data mut [T] {
+        self.rem
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements.
+    ///
+    /// Consider `take_remainder()` if you need access to the data with its
+    /// original lifetime, rather than borrowing through `&mut self` here.
+    pub fn remainder(&mut self) -> &mut [T] {
+        self.rem
+    }
+
+    /// Return the remainder of the original slice that is not going to be
+    /// returned by the iterator. The returned slice has at most `chunk_size-1`
+    /// elements. Subsequent calls will return an empty slice.
+    pub fn take_remainder(&mut self) -> &'data mut [T] {
+        std::mem::take(&mut self.rem)
+    }
+}
+
+impl<'data, T: Send + 'data> ParallelIterator for RChunksExactMut<'data, T> {
+    type Item = &'data mut [T];
+
+    fn drive_unindexed<C>(self, consumer: C) -> C::Result
+    where
+        C: UnindexedConsumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn opt_len(&self) -> Option<usize> {
+        Some(self.len())
+    }
+}
+
+impl<'data, T: Send + 'data> IndexedParallelIterator for RChunksExactMut<'data, T> {
+    fn drive<C>(self, consumer: C) -> C::Result
+    where
+        C: Consumer<Self::Item>,
+    {
+        bridge(self, consumer)
+    }
+
+    fn len(&self) -> usize {
+        self.slice.len() / self.chunk_size
+    }
+
+    fn with_producer<CB>(self, callback: CB) -> CB::Output
+    where
+        CB: ProducerCallback<Self::Item>,
+    {
+        callback.callback(RChunksExactMutProducer {
+            chunk_size: self.chunk_size,
+            slice: self.slice,
+        })
+    }
+}
+
+struct RChunksExactMutProducer<'data, T: Send> {
+    chunk_size: usize,
+    slice: &'data mut [T],
+}
+
+impl<'data, T: 'data + Send> Producer for RChunksExactMutProducer<'data, T> {
+    type Item = &'data mut [T];
+    type IntoIter = ::std::slice::RChunksExactMut<'data, T>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.slice.rchunks_exact_mut(self.chunk_size)
+    }
+
+    fn split_at(self, index: usize) -> (Self, Self) {
+        let elem_index = self.slice.len() - index * self.chunk_size;
+        let (left, right) = self.slice.split_at_mut(elem_index);
+        (
+            RChunksExactMutProducer {
+                chunk_size: self.chunk_size,
+                slice: right,
+            },
+            RChunksExactMutProducer {
+                chunk_size: self.chunk_size,
+                slice: left,
+            },
+        )
+    }
+}
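
Editorial usage sketch for the reverse-chunk iterators introduced in this new file, mirroring the tests added later in this patch. It assumes the accompanying `ParallelSlice` additions (not shown in this hunk) expose `par_rchunks` and `par_rchunks_exact` through `rayon::prelude::*`.

use rayon::prelude::*;

fn main() {
    let v: Vec<i32> = (0..10).collect();

    // Chunks are taken from the end of the slice; only the last-yielded
    // (front-most) chunk may be shorter than `chunk_size`.
    let sums: Vec<i32> = v.par_rchunks(3).map(|c| c.iter().sum()).collect();
    assert_eq!(sums, vec![24, 15, 6, 0]); // [7,8,9], [4,5,6], [1,2,3], [0]

    // The exact variant yields only full chunks; the leftover front elements
    // are reported through `remainder()`.
    let exact = v.par_rchunks_exact(3);
    assert_eq!(exact.remainder(), &[0]);
    assert_eq!(exact.count(), 3);
}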
diff --git a/src/slice/test.rs b/src/slice/test.rs
index 71743e2..f74ca0f 100644
--- a/src/slice/test.rs
+++ b/src/slice/test.rs
@@ -10,7 +10,7 @@
     ($f:ident, $name:ident) => {
         #[test]
         fn $name() {
-            let ref mut rng = thread_rng();
+            let rng = &mut thread_rng();
 
             for len in (0..25).chain(500..501) {
                 for &modulus in &[5, 10, 100] {
@@ -146,3 +146,25 @@
     assert_eq!(c.take_remainder(), &[]);
     assert_eq!(c.len(), 2);
 }
+
+#[test]
+fn test_par_rchunks_exact_remainder() {
+    let v: &[i32] = &[0, 1, 2, 3, 4];
+    let c = v.par_rchunks_exact(2);
+    assert_eq!(c.remainder(), &[0]);
+    assert_eq!(c.len(), 2);
+}
+
+#[test]
+fn test_par_rchunks_exact_mut_remainder() {
+    let v: &mut [i32] = &mut [0, 1, 2, 3, 4];
+    let mut c = v.par_rchunks_exact_mut(2);
+    assert_eq!(c.remainder(), &[0]);
+    assert_eq!(c.len(), 2);
+    assert_eq!(c.into_remainder(), &[0]);
+
+    let mut c = v.par_rchunks_exact_mut(2);
+    assert_eq!(c.take_remainder(), &[0]);
+    assert_eq!(c.take_remainder(), &[]);
+    assert_eq!(c.len(), 2);
+}
diff --git a/src/str.rs b/src/str.rs
index 2fdaaa7..c85754e 100644
--- a/src/str.rs
+++ b/src/str.rs
@@ -6,7 +6,8 @@
 //! Note: [`ParallelString::par_split()`] and [`par_split_terminator()`]
 //! reference a `Pattern` trait which is not visible outside this crate.
 //! This trait is intentionally kept private, for use only by Rayon itself.
-//! It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`.
+//! It is implemented for `char`, `&[char]`, and any function or closure
+//! `F: Fn(char) -> bool + Sync + Send`.
 //!
 //! [`ParallelString::par_split()`]: trait.ParallelString.html#method.par_split
 //! [`par_split_terminator()`]: trait.ParallelString.html#method.par_split_terminator
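
Editorial usage sketch of the `&[char]` pattern support documented above (it relies on the `Pattern` impl added further down in this file's diff, and mirrors the tests added to tests/str.rs): a slice of characters matches any one of them.

use rayon::prelude::*;

fn main() {
    let text = "a-b_c-d";
    let pattern: &[char] = &['-', '_'];

    // Splits on '-' or '_' in parallel, same as `str::split` with a `&[char]` pattern.
    let parts: Vec<&str> = text.par_split(pattern).collect();
    assert_eq!(parts, vec!["a", "b", "c", "d"]);
}
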
@@ -34,11 +35,11 @@
     // character boundary.  So we look at the raw bytes, first scanning
     // forward from the midpoint for a boundary, then trying backward.
     let (left, right) = chars.as_bytes().split_at(mid);
-    match right.iter().cloned().position(is_char_boundary) {
+    match right.iter().copied().position(is_char_boundary) {
         Some(i) => mid + i,
         None => left
             .iter()
-            .cloned()
+            .copied()
             .rposition(is_char_boundary)
             .unwrap_or(0),
     }
@@ -96,7 +97,7 @@
     /// Note that multi-byte sequences (for code points greater than `U+007F`)
     /// are produced as separate items, but will not be split across threads.
     /// If you would prefer an indexed iterator without that guarantee, consider
-    /// `string.as_bytes().par_iter().cloned()` instead.
+    /// `string.as_bytes().par_iter().copied()` instead.
     ///
     /// # Examples
     ///
@@ -139,7 +140,8 @@
     /// given character or predicate, similar to `str::split`.
     ///
     /// Note: the `Pattern` trait is private, for use only by Rayon itself.
-    /// It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`.
+    /// It is implemented for `char`, `&[char]`, and any function or closure
+    /// `F: Fn(char) -> bool + Sync + Send`.
     ///
     /// # Examples
     ///
@@ -161,7 +163,8 @@
     /// substring after a trailing terminator.
     ///
     /// Note: the `Pattern` trait is private, for use only by Rayon itself.
-    /// It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`.
+    /// It is implemented for `char`, `&[char]`, and any function or closure
+    /// `F: Fn(char) -> bool + Sync + Send`.
     ///
     /// # Examples
     ///
@@ -218,7 +221,8 @@
     /// given character or predicate, similar to `str::matches`.
     ///
     /// Note: the `Pattern` trait is private, for use only by Rayon itself.
-    /// It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`.
+    /// It is implemented for `char`, `&[char]`, and any function or closure
+    /// `F: Fn(char) -> bool + Sync + Send`.
     ///
     /// # Examples
     ///
@@ -241,7 +245,8 @@
     /// or predicate, with their positions, similar to `str::match_indices`.
     ///
     /// Note: the `Pattern` trait is private, for use only by Rayon itself.
-    /// It is implemented for `char` and any `F: Fn(char) -> bool + Sync + Send`.
+    /// It is implemented for `char`, `&[char]`, and any function or closure
+    /// `F: Fn(char) -> bool + Sync + Send`.
     ///
     /// # Examples
     ///
@@ -303,89 +308,62 @@
     move |(i, x)| (base + i, x)
 }
 
-impl Pattern for char {
-    private_impl! {}
+macro_rules! impl_pattern {
+    (&$self:ident => $pattern:expr) => {
+        private_impl! {}
 
-    #[inline]
-    fn find_in(&self, chars: &str) -> Option<usize> {
-        chars.find(*self)
-    }
-
-    #[inline]
-    fn rfind_in(&self, chars: &str) -> Option<usize> {
-        chars.rfind(*self)
-    }
-
-    #[inline]
-    fn is_suffix_of(&self, chars: &str) -> bool {
-        chars.ends_with(*self)
-    }
-
-    fn fold_splits<'ch, F>(&self, chars: &'ch str, folder: F, skip_last: bool) -> F
-    where
-        F: Folder<&'ch str>,
-    {
-        let mut split = chars.split(*self);
-        if skip_last {
-            split.next_back();
+        #[inline]
+        fn find_in(&$self, chars: &str) -> Option<usize> {
+            chars.find($pattern)
         }
-        folder.consume_iter(split)
-    }
 
-    fn fold_matches<'ch, F>(&self, chars: &'ch str, folder: F) -> F
-    where
-        F: Folder<&'ch str>,
-    {
-        folder.consume_iter(chars.matches(*self))
-    }
+        #[inline]
+        fn rfind_in(&$self, chars: &str) -> Option<usize> {
+            chars.rfind($pattern)
+        }
 
-    fn fold_match_indices<'ch, F>(&self, chars: &'ch str, folder: F, base: usize) -> F
-    where
-        F: Folder<(usize, &'ch str)>,
-    {
-        folder.consume_iter(chars.match_indices(*self).map(offset(base)))
+        #[inline]
+        fn is_suffix_of(&$self, chars: &str) -> bool {
+            chars.ends_with($pattern)
+        }
+
+        fn fold_splits<'ch, F>(&$self, chars: &'ch str, folder: F, skip_last: bool) -> F
+        where
+            F: Folder<&'ch str>,
+        {
+            let mut split = chars.split($pattern);
+            if skip_last {
+                split.next_back();
+            }
+            folder.consume_iter(split)
+        }
+
+        fn fold_matches<'ch, F>(&$self, chars: &'ch str, folder: F) -> F
+        where
+            F: Folder<&'ch str>,
+        {
+            folder.consume_iter(chars.matches($pattern))
+        }
+
+        fn fold_match_indices<'ch, F>(&$self, chars: &'ch str, folder: F, base: usize) -> F
+        where
+            F: Folder<(usize, &'ch str)>,
+        {
+            folder.consume_iter(chars.match_indices($pattern).map(offset(base)))
+        }
     }
 }
 
+impl Pattern for char {
+    impl_pattern!(&self => *self);
+}
+
+impl Pattern for &[char] {
+    impl_pattern!(&self => *self);
+}
+
 impl<FN: Sync + Send + Fn(char) -> bool> Pattern for FN {
-    private_impl! {}
-
-    fn find_in(&self, chars: &str) -> Option<usize> {
-        chars.find(self)
-    }
-
-    fn rfind_in(&self, chars: &str) -> Option<usize> {
-        chars.rfind(self)
-    }
-
-    fn is_suffix_of(&self, chars: &str) -> bool {
-        chars.ends_with(self)
-    }
-
-    fn fold_splits<'ch, F>(&self, chars: &'ch str, folder: F, skip_last: bool) -> F
-    where
-        F: Folder<&'ch str>,
-    {
-        let mut split = chars.split(self);
-        if skip_last {
-            split.next_back();
-        }
-        folder.consume_iter(split)
-    }
-
-    fn fold_matches<'ch, F>(&self, chars: &'ch str, folder: F) -> F
-    where
-        F: Folder<&'ch str>,
-    {
-        folder.consume_iter(chars.matches(self))
-    }
-
-    fn fold_match_indices<'ch, F>(&self, chars: &'ch str, folder: F, base: usize) -> F
-    where
-        F: Folder<(usize, &'ch str)>,
-    {
-        folder.consume_iter(chars.match_indices(self).map(offset(base)))
-    }
+    impl_pattern!(&self => self);
 }
 
 // /////////////////////////////////////////////////////////////////////////
@@ -711,11 +689,7 @@
 
 #[inline]
 fn no_carriage_return(line: &str) -> &str {
-    if line.ends_with('\r') {
-        &line[..line.len() - 1]
-    } else {
-        line
-    }
+    line.strip_suffix('\r').unwrap_or(line)
 }
 
 impl<'ch> ParallelIterator for Lines<'ch> {
diff --git a/src/vec.rs b/src/vec.rs
index 1e68aa0..7892f53 100644
--- a/src/vec.rs
+++ b/src/vec.rs
@@ -138,16 +138,10 @@
     {
         unsafe {
             // Make the vector forget about the drained items, and temporarily the tail too.
-            let start = self.range.start;
-            self.vec.set_len(start);
+            self.vec.set_len(self.range.start);
 
             // Create the producer as the exclusive "owner" of the slice.
-            let producer = {
-                // Get a correct borrow lifetime, then extend it to the original length.
-                let mut slice = &mut self.vec[start..];
-                slice = slice::from_raw_parts_mut(slice.as_mut_ptr(), self.range.len());
-                DrainProducer::new(slice)
-            };
+            let producer = DrainProducer::from_vec(self.vec, self.range.len());
 
             // The producer will move or drop each item from the drained range.
             callback.callback(producer)
@@ -157,22 +151,24 @@
 
 impl<'data, T: Send> Drop for Drain<'data, T> {
     fn drop(&mut self) {
-        if self.range.len() > 0 {
-            let Range { start, end } = self.range;
-            if self.vec.len() != start {
-                // We must not have produced, so just call a normal drain to remove the items.
-                assert_eq!(self.vec.len(), self.orig_len);
-                self.vec.drain(start..end);
-            } else if end < self.orig_len {
-                // The producer was responsible for consuming the drained items.
-                // Move the tail items to their new place, then set the length to include them.
-                unsafe {
-                    let ptr = self.vec.as_mut_ptr().add(start);
-                    let tail_ptr = self.vec.as_ptr().add(end);
-                    let tail_len = self.orig_len - end;
-                    ptr::copy(tail_ptr, ptr, tail_len);
-                    self.vec.set_len(start + tail_len);
-                }
+        let Range { start, end } = self.range;
+        if self.vec.len() == self.orig_len {
+            // We must not have produced, so just call a normal drain to remove the items.
+            self.vec.drain(start..end);
+        } else if start == end {
+            // Empty range, so just restore the length to its original state
+            unsafe {
+                self.vec.set_len(self.orig_len);
+            }
+        } else if end < self.orig_len {
+            // The producer was responsible for consuming the drained items.
+            // Move the tail items to their new place, then set the length to include them.
+            unsafe {
+                let ptr = self.vec.as_mut_ptr().add(start);
+                let tail_ptr = self.vec.as_ptr().add(end);
+                let tail_len = self.orig_len - end;
+                ptr::copy(tail_ptr, ptr, tail_len);
+                self.vec.set_len(start + tail_len);
             }
         }
     }
@@ -184,13 +180,27 @@
     slice: &'data mut [T],
 }
 
-impl<'data, T: 'data + Send> DrainProducer<'data, T> {
+impl<T: Send> DrainProducer<'_, T> {
     /// Creates a draining producer, which *moves* items from the slice.
     ///
-    /// Unsafe bacause `!Copy` data must not be read after the borrow is released.
-    pub(crate) unsafe fn new(slice: &'data mut [T]) -> Self {
+    /// Unsafe because `!Copy` data must not be read after the borrow is released.
+    pub(crate) unsafe fn new(slice: &mut [T]) -> DrainProducer<'_, T> {
         DrainProducer { slice }
     }
+
+    /// Creates a draining producer, which *moves* items from the tail of the vector.
+    ///
+    /// Unsafe because we're moving from beyond `vec.len()`, so the caller must ensure
+    /// that data is initialized and not read after the borrow is released.
+    unsafe fn from_vec(vec: &mut Vec<T>, len: usize) -> DrainProducer<'_, T> {
+        let start = vec.len();
+        assert!(vec.capacity() - start >= len);
+
+        // The pointer is derived from `Vec` directly, not through a `Deref`,
+        // so it has provenance over the whole allocation.
+        let ptr = vec.as_mut_ptr().add(start);
+        DrainProducer::new(slice::from_raw_parts_mut(ptr, len))
+    }
 }
 
 impl<'data, T: 'data + Send> Producer for DrainProducer<'data, T> {
@@ -199,7 +209,7 @@
 
     fn into_iter(mut self) -> Self::IntoIter {
         // replace the slice so we don't drop it twice
-        let slice = mem::replace(&mut self.slice, &mut []);
+        let slice = mem::take(&mut self.slice);
         SliceDrain {
             iter: slice.iter_mut(),
         }
@@ -207,7 +217,7 @@
 
     fn split_at(mut self, index: usize) -> (Self, Self) {
         // replace the slice so we don't drop it twice
-        let slice = mem::replace(&mut self.slice, &mut []);
+        let slice = mem::take(&mut self.slice);
         let (left, right) = slice.split_at_mut(index);
         unsafe { (DrainProducer::new(left), DrainProducer::new(right)) }
     }
@@ -215,8 +225,9 @@
 
 impl<'data, T: 'data + Send> Drop for DrainProducer<'data, T> {
     fn drop(&mut self) {
-        // use `Drop for [T]`
-        unsafe { ptr::drop_in_place(self.slice) };
+        // extract the slice so we can use `Drop for [T]`
+        let slice_ptr: *mut [T] = mem::take::<&'data mut [T]>(&mut self.slice);
+        unsafe { ptr::drop_in_place::<[T]>(slice_ptr) };
     }
 }
 
@@ -266,7 +277,7 @@
 impl<'data, T: 'data> Drop for SliceDrain<'data, T> {
     fn drop(&mut self) {
         // extract the iterator so we can use `Drop for [T]`
-        let iter = mem::replace(&mut self.iter, [].iter_mut());
-        unsafe { ptr::drop_in_place(iter.into_slice()) };
+        let slice_ptr: *mut [T] = mem::replace(&mut self.iter, [].iter_mut()).into_slice();
+        unsafe { ptr::drop_in_place::<[T]>(slice_ptr) };
     }
 }
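
Editorial note: the two `Drop` impls above share one idiom: move the borrowed slice out of `self` (leaving an empty slice behind) and then run `Drop for [T]` on the raw slice pointer, so each element is dropped exactly once. A hypothetical stand-alone sketch of the pattern, for illustration only:

use std::{mem, ptr};

// Hypothetical guard that is responsible for dropping the elements it borrows.
struct DropElems<'a, T> {
    slice: &'a mut [T],
}

impl<'a, T> Drop for DropElems<'a, T> {
    fn drop(&mut self) {
        // Take the slice out of `self` so it cannot be observed (or dropped) again,
        // then drop every element in place via `Drop for [T]`.
        let slice_ptr: *mut [T] = mem::take(&mut self.slice);
        unsafe { ptr::drop_in_place::<[T]>(slice_ptr) };
    }
}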
diff --git a/tests/clones.rs b/tests/clones.rs
index 9add775..0d6c864 100644
--- a/tests/clones.rs
+++ b/tests/clones.rs
@@ -10,6 +10,13 @@
     assert_eq!(a, b);
 }
 
+fn check_count<I>(iter: I)
+where
+    I: ParallelIterator + Clone,
+{
+    assert_eq!(iter.clone().count(), iter.count());
+}
+
 #[test]
 fn clone_binary_heap() {
     use std::collections::BinaryHeap;
@@ -102,6 +109,8 @@
     check(v.par_iter());
     check(v.par_chunks(42));
     check(v.par_chunks_exact(42));
+    check(v.par_rchunks(42));
+    check(v.par_rchunks_exact(42));
     check(v.par_windows(42));
     check(v.par_split(|x| x % 3 == 0));
     check(v.into_par_iter());
@@ -128,6 +137,12 @@
     check(v.par_iter().flatten_iter());
     check(v.par_iter().with_max_len(1).fold(|| 0, |x, _| x));
     check(v.par_iter().with_max_len(1).fold_with(0, |x, _| x));
+    check(v.par_iter().with_max_len(1).fold_chunks(1, || 0, |x, _| x));
+    check(
+        v.par_iter()
+            .with_max_len(1)
+            .fold_chunks_with(1, 0, |x, _| x),
+    );
     check(v.par_iter().with_max_len(1).try_fold(|| 0, |_, &x| x));
     check(v.par_iter().with_max_len(1).try_fold_with(0, |_, &x| x));
     check(v.par_iter().inspect(|_| ()));
@@ -142,8 +157,10 @@
     check(v.par_iter().panic_fuse());
     check(v.par_iter().positions(|_| true));
     check(v.par_iter().rev());
-    check(v.par_iter().skip(1));
-    check(v.par_iter().take(1));
+    check(v.par_iter().skip(42));
+    check(v.par_iter().skip_any_while(|_| false));
+    check(v.par_iter().take(42));
+    check(v.par_iter().take_any_while(|_| true));
     check(v.par_iter().cloned().while_some());
     check(v.par_iter().with_max_len(1));
     check(v.par_iter().with_min_len(1));
@@ -153,6 +170,13 @@
 }
 
 #[test]
+fn clone_counted_adaptors() {
+    let v: Vec<_> = (0..1000).collect();
+    check_count(v.par_iter().skip_any(42));
+    check_count(v.par_iter().take_any(42));
+}
+
+#[test]
 fn clone_empty() {
     check(rayon::iter::empty::<i32>());
 }
diff --git a/tests/collect.rs b/tests/collect.rs
index 48b80f6..bfb080c 100644
--- a/tests/collect.rs
+++ b/tests/collect.rs
@@ -6,6 +6,7 @@
 use std::sync::Mutex;
 
 #[test]
+#[cfg_attr(not(panic = "unwind"), ignore)]
 fn collect_drop_on_unwind() {
     struct Recorddrop<'a>(i64, &'a Mutex<Vec<i64>>);
 
@@ -61,6 +62,7 @@
 }
 
 #[test]
+#[cfg_attr(not(panic = "unwind"), ignore)]
 fn collect_drop_on_unwind_zst() {
     static INSERTS: AtomicUsize = AtomicUsize::new(0);
     static DROPS: AtomicUsize = AtomicUsize::new(0);
diff --git a/tests/cross-pool.rs b/tests/cross-pool.rs
index f0a2128..835e2e2 100644
--- a/tests/cross-pool.rs
+++ b/tests/cross-pool.rs
@@ -2,6 +2,7 @@
 use rayon::ThreadPoolBuilder;
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn cross_pool_busy() {
     let pool1 = ThreadPoolBuilder::new().num_threads(1).build().unwrap();
     let pool2 = ThreadPoolBuilder::new().num_threads(1).build().unwrap();
diff --git a/tests/debug.rs b/tests/debug.rs
index 3584ba0..14f3791 100644
--- a/tests/debug.rs
+++ b/tests/debug.rs
@@ -123,6 +123,10 @@
     check(v.par_chunks_exact(42));
     check(v.par_chunks_mut(42));
     check(v.par_chunks_exact_mut(42));
+    check(v.par_rchunks(42));
+    check(v.par_rchunks_exact(42));
+    check(v.par_rchunks_mut(42));
+    check(v.par_rchunks_exact_mut(42));
     check(v.par_windows(42));
     check(v.par_split(|x| x % 3 == 0));
     check(v.par_split_mut(|x| x % 3 == 0));
@@ -151,6 +155,8 @@
     check(v.par_iter().map(Some).flatten_iter());
     check(v.par_iter().fold(|| 0, |x, _| x));
     check(v.par_iter().fold_with(0, |x, _| x));
+    check(v.par_iter().fold_chunks(3, || 0, |x, _| x));
+    check(v.par_iter().fold_chunks_with(3, 0, |x, _| x));
     check(v.par_iter().try_fold(|| 0, |x, _| Some(x)));
     check(v.par_iter().try_fold_with(0, |x, _| Some(x)));
     check(v.par_iter().inspect(|_| ()));
@@ -166,7 +172,11 @@
     check(v.par_iter().positions(|_| true));
     check(v.par_iter().rev());
     check(v.par_iter().skip(1));
+    check(v.par_iter().skip_any(1));
+    check(v.par_iter().skip_any_while(|_| false));
     check(v.par_iter().take(1));
+    check(v.par_iter().take_any(1));
+    check(v.par_iter().take_any_while(|_| true));
     check(v.par_iter().map(Some).while_some());
     check(v.par_iter().with_max_len(1));
     check(v.par_iter().with_min_len(1));
diff --git a/tests/drain_vec.rs b/tests/drain_vec.rs
new file mode 100644
index 0000000..08f1120
--- /dev/null
+++ b/tests/drain_vec.rs
@@ -0,0 +1,41 @@
+use rayon::prelude::*;
+
+#[test]
+fn drain_vec_yielded() {
+    let mut vec_org = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+
+    let yielded = vec_org.par_drain(0..5).collect::<Vec<_>>();
+
+    assert_eq!(&yielded, &[0, 1, 2, 3, 4]);
+    assert_eq!(&vec_org, &[5, 6, 7, 8, 9]);
+}
+
+#[test]
+fn drain_vec_dropped() {
+    let mut vec_org = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+
+    let yielded = vec_org.par_drain(0..5);
+
+    drop(yielded);
+    assert_eq!(&vec_org, &[5, 6, 7, 8, 9]);
+}
+
+#[test]
+fn drain_vec_empty_range_yielded() {
+    let mut vec_org = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+
+    let yielded = vec_org.par_drain(5..5).collect::<Vec<_>>();
+
+    assert_eq!(&yielded, &[]);
+    assert_eq!(&vec_org, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
+}
+
+#[test]
+fn drain_vec_empty_range_dropped() {
+    let mut vec_org = vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9];
+
+    let yielded = vec_org.par_drain(5..5);
+
+    drop(yielded);
+    assert_eq!(&vec_org, &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
+}
diff --git a/tests/iter_panic.rs b/tests/iter_panic.rs
index 4885a28..c62a8db 100644
--- a/tests/iter_panic.rs
+++ b/tests/iter_panic.rs
@@ -20,6 +20,7 @@
 }
 
 #[test]
+#[cfg_attr(not(panic = "unwind"), ignore)]
 fn iter_panic_fuse() {
     // We only use a single thread in order to make the behavior
     // of 'panic_fuse' deterministic
@@ -47,6 +48,6 @@
         assert!(count(iter.clone().panic_fuse().inspect(check)) < expected);
 
         // Try in reverse to be sure we hit the producer case.
-        assert!(count(iter.clone().panic_fuse().inspect(check).rev()) < expected);
+        assert!(count(iter.panic_fuse().inspect(check).rev()) < expected);
     });
 }
diff --git a/tests/named-threads.rs b/tests/named-threads.rs
index fd1b0be..dadb37b 100644
--- a/tests/named-threads.rs
+++ b/tests/named-threads.rs
@@ -4,6 +4,7 @@
 use rayon::*;
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn named_threads() {
     ThreadPoolBuilder::new()
         .thread_name(|i| format!("hello-name-test-{}", i))
diff --git a/tests/octillion.rs b/tests/octillion.rs
index cff2b11..1af9ad8 100644
--- a/tests/octillion.rs
+++ b/tests/octillion.rs
@@ -68,7 +68,14 @@
 }
 
 #[test]
-#[cfg_attr(not(target_pointer_width = "64"), ignore)]
+#[cfg_attr(
+    any(
+        not(target_pointer_width = "64"),
+        target_os = "emscripten",
+        target_family = "wasm"
+    ),
+    ignore
+)]
 fn find_last_octillion() {
     // It would be nice if `find_last` could prioritize the later splits,
     // basically flipping the `join` args, without needing indexed `rev`.
@@ -78,32 +85,49 @@
 }
 
 #[test]
-#[cfg_attr(not(target_pointer_width = "64"), ignore)]
+#[cfg_attr(
+    any(
+        not(target_pointer_width = "64"),
+        target_os = "emscripten",
+        target_family = "wasm"
+    ),
+    ignore
+)]
 fn find_last_octillion_inclusive() {
     let x = two_threads(|| octillion_inclusive().find_last(|_| true));
     assert_eq!(x, Some(OCTILLION));
 }
 
 #[test]
-#[cfg_attr(not(target_pointer_width = "64"), ignore)]
+#[cfg_attr(
+    any(
+        not(target_pointer_width = "64"),
+        target_os = "emscripten",
+        target_family = "wasm"
+    ),
+    ignore
+)]
 fn find_last_octillion_flat() {
     let x = two_threads(|| octillion_flat().find_last(|_| true));
     assert_eq!(x, Some(OCTILLION - 1));
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn find_any_octillion() {
     let x = two_threads(|| octillion().find_any(|x| *x > OCTILLION / 2));
     assert!(x.is_some());
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn find_any_octillion_flat() {
     let x = two_threads(|| octillion_flat().find_any(|x| *x > OCTILLION / 2));
     assert!(x.is_some());
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn filter_find_any_octillion() {
     let x = two_threads(|| {
         octillion()
@@ -114,6 +138,7 @@
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn filter_find_any_octillion_flat() {
     let x = two_threads(|| {
         octillion_flat()
@@ -124,6 +149,7 @@
 }
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn fold_find_any_octillion_flat() {
     let x = two_threads(|| octillion_flat().fold(|| (), |_, _| ()).find_any(|_| true));
     assert!(x.is_some());
diff --git a/tests/par_bridge_recursion.rs b/tests/par_bridge_recursion.rs
new file mode 100644
index 0000000..3a48ef1
--- /dev/null
+++ b/tests/par_bridge_recursion.rs
@@ -0,0 +1,31 @@
+use rayon::prelude::*;
+use std::iter::once_with;
+
+const N: usize = 100_000;
+
+#[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
+fn par_bridge_recursion() {
+    let pool = rayon::ThreadPoolBuilder::new()
+        .num_threads(10)
+        .build()
+        .unwrap();
+
+    let seq: Vec<_> = (0..N).map(|i| (i, i.to_string())).collect();
+
+    pool.broadcast(|_| {
+        let mut par: Vec<_> = (0..N)
+            .into_par_iter()
+            .flat_map(|i| {
+                once_with(move || {
+                    // Using rayon within the serial iterator creates an opportunity for
+                    // work-stealing to make par_bridge's mutex accidentally recursive.
+                    rayon::join(move || i, move || i.to_string())
+                })
+                .par_bridge()
+            })
+            .collect();
+        par.par_sort_unstable();
+        assert_eq!(seq, par);
+    });
+}
diff --git a/tests/producer_split_at.rs b/tests/producer_split_at.rs
index 3a49059..d710504 100644
--- a/tests/producer_split_at.rs
+++ b/tests/producer_split_at.rs
@@ -214,6 +214,50 @@
 }
 
 #[test]
+fn slice_rchunks() {
+    let s: Vec<_> = (0..10).collect();
+    for len in 1..s.len() + 2 {
+        let v: Vec<_> = s.rchunks(len).collect();
+        check(&v, || s.par_rchunks(len));
+    }
+}
+
+#[test]
+fn slice_rchunks_exact() {
+    let s: Vec<_> = (0..10).collect();
+    for len in 1..s.len() + 2 {
+        let v: Vec<_> = s.rchunks_exact(len).collect();
+        check(&v, || s.par_rchunks_exact(len));
+    }
+}
+
+#[test]
+fn slice_rchunks_mut() {
+    let mut s: Vec<_> = (0..10).collect();
+    let mut v: Vec<_> = s.clone();
+    for len in 1..s.len() + 2 {
+        let expected: Vec<_> = v.rchunks_mut(len).collect();
+        map_triples(expected.len() + 1, |i, j, k| {
+            Split::forward(s.par_rchunks_mut(len), i, j, k, &expected);
+            Split::reverse(s.par_rchunks_mut(len), i, j, k, &expected);
+        });
+    }
+}
+
+#[test]
+fn slice_rchunks_exact_mut() {
+    let mut s: Vec<_> = (0..10).collect();
+    let mut v: Vec<_> = s.clone();
+    for len in 1..s.len() + 2 {
+        let expected: Vec<_> = v.rchunks_exact_mut(len).collect();
+        map_triples(expected.len() + 1, |i, j, k| {
+            Split::forward(s.par_rchunks_exact_mut(len), i, j, k, &expected);
+            Split::reverse(s.par_rchunks_exact_mut(len), i, j, k, &expected);
+        });
+    }
+}
+
+#[test]
 fn slice_windows() {
     let s: Vec<_> = (0..10).collect();
     let v: Vec<_> = s.windows(2).collect();
diff --git a/tests/sort-panic-safe.rs b/tests/sort-panic-safe.rs
index 00a9731..95ef88d 100644
--- a/tests/sort-panic-safe.rs
+++ b/tests/sort-panic-safe.rs
@@ -8,11 +8,12 @@
 use std::sync::atomic::Ordering::Relaxed;
 use std::thread;
 
-static VERSIONS: AtomicUsize = AtomicUsize::new(0);
+const ZERO: AtomicUsize = AtomicUsize::new(0);
+const LEN: usize = 20_000;
 
-lazy_static::lazy_static! {
-    static ref DROP_COUNTS: Vec<AtomicUsize> = (0..20_000).map(|_| AtomicUsize::new(0)).collect();
-}
+static VERSIONS: AtomicUsize = ZERO;
+
+static DROP_COUNTS: [AtomicUsize; LEN] = [ZERO; LEN];
 
 #[derive(Clone, Eq)]
 struct DropCounter {
@@ -117,6 +118,7 @@
 thread_local!(static SILENCE_PANIC: Cell<bool> = Cell::new(false));
 
 #[test]
+#[cfg_attr(any(target_os = "emscripten", target_family = "wasm"), ignore)]
 fn sort_panic_safe() {
     let prev = panic::take_hook();
     panic::set_hook(Box::new(move |info| {
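
Editorial note on the `lazy_static` removal above: the static array initializer works because `ZERO` is a `const` item, and an array repeat expression accepts a const even though `AtomicUsize` is not `Copy`; each element gets its own fresh atomic. A minimal stand-alone sketch:

use std::sync::atomic::{AtomicUsize, Ordering};

const ZERO: AtomicUsize = AtomicUsize::new(0);
static COUNTS: [AtomicUsize; 4] = [ZERO; 4];

fn main() {
    COUNTS[1].fetch_add(1, Ordering::Relaxed);
    assert_eq!(COUNTS[0].load(Ordering::Relaxed), 0);
    assert_eq!(COUNTS[1].load(Ordering::Relaxed), 1);
}
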
diff --git a/tests/str.rs b/tests/str.rs
index 0e1e35e..f73a261 100644
--- a/tests/str.rs
+++ b/tests/str.rs
@@ -64,6 +64,11 @@
         let parallel: Vec<_> = string.par_split(separator).collect();
         assert_eq!(serial, parallel);
 
+        let pattern: &[char] = &['\u{0}', separator, '\u{1F980}'];
+        let serial: Vec<_> = string.split(pattern).collect();
+        let parallel: Vec<_> = string.par_split(pattern).collect();
+        assert_eq!(serial, parallel);
+
         let serial_fn: Vec<_> = string.split(|c| c == separator).collect();
         let parallel_fn: Vec<_> = string.par_split(|c| c == separator).collect();
         assert_eq!(serial_fn, parallel_fn);
@@ -73,9 +78,12 @@
         let serial: Vec<_> = string.split_terminator(separator).collect();
         let parallel: Vec<_> = string.par_split_terminator(separator).collect();
         assert_eq!(serial, parallel);
-    }
 
-    for &(string, separator) in &tests {
+        let pattern: &[char] = &['\u{0}', separator, '\u{1F980}'];
+        let serial: Vec<_> = string.split_terminator(pattern).collect();
+        let parallel: Vec<_> = string.par_split_terminator(pattern).collect();
+        assert_eq!(serial, parallel);
+
         let serial: Vec<_> = string.split_terminator(|c| c == separator).collect();
         let parallel: Vec<_> = string.par_split_terminator(|c| c == separator).collect();
         assert_eq!(serial, parallel);
@@ -99,6 +107,11 @@
         let parallel: Vec<_> = string.par_matches(separator).collect();
         assert_eq!(serial, parallel);
 
+        let pattern: &[char] = &['\u{0}', separator, '\u{1F980}'];
+        let serial: Vec<_> = string.matches(pattern).collect();
+        let parallel: Vec<_> = string.par_matches(pattern).collect();
+        assert_eq!(serial, parallel);
+
         let serial_fn: Vec<_> = string.matches(|c| c == separator).collect();
         let parallel_fn: Vec<_> = string.par_matches(|c| c == separator).collect();
         assert_eq!(serial_fn, parallel_fn);
@@ -109,6 +122,11 @@
         let parallel: Vec<_> = string.par_match_indices(separator).collect();
         assert_eq!(serial, parallel);
 
+        let pattern: &[char] = &['\u{0}', separator, '\u{1F980}'];
+        let serial: Vec<_> = string.match_indices(pattern).collect();
+        let parallel: Vec<_> = string.par_match_indices(pattern).collect();
+        assert_eq!(serial, parallel);
+
         let serial_fn: Vec<_> = string.match_indices(|c| c == separator).collect();
         let parallel_fn: Vec<_> = string.par_match_indices(|c| c == separator).collect();
         assert_eq!(serial_fn, parallel_fn);