 // vendor/portable-atomic/src/imp/atomic128/x86_64.rs - toolchain/rustc - Git at Google

 // SPDX-License-Identifier: Apache-2.0 OR MIT

 // Atomic{I,U}128 implementation on x86_64 using CMPXCHG16B (DWCAS).
 //
 // Note: On Miri and ThreadSanitizer which do not support inline assembly, we don't use
 // this module and use intrinsics.rs instead.
 //
 // Refs:
 // - x86 and amd64 instruction reference https://www.felixcloutier.com/x86
 // - atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit
 //
 // Generated asm:
 // - x86_64 (+cmpxchg16b) https://godbolt.org/z/55n54WeKr

 include!("macros.rs");

 #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
 #[path = "../fallback/outline_atomics.rs"]
 mod fallback;

 #[cfg(not(portable_atomic_no_outline_atomics))]
 #[cfg(not(target_env = "sgx"))]
 #[path = "detect/x86_64.rs"]
 mod detect;

 #[cfg(not(portable_atomic_no_asm))]
 use core::arch::asm;
 use core::sync::atomic::Ordering;

 use crate::utils::{Pair, U128};

 // Asserts that the function is called in the correct context.
 //
 // If the `cmpxchg16b` target feature is enabled at compile time (through either
 // `target_feature` or `portable_atomic_target_feature`), the cfg'd block below is
 // removed and the macro expands to nothing. Otherwise it expands to a
 // `debug_assert!` on the run-time detection result, so the check is only
 // performed in builds where debug assertions are enabled.
 macro_rules! debug_assert_cmpxchg16b {
     () => {
         #[cfg(not(any(
             target_feature = "cmpxchg16b",
             portable_atomic_target_feature = "cmpxchg16b",
         )))]
         {
             // CMPXCHG16B is not statically guaranteed: the caller must have reached
             // this point only after run-time detection reported support.
             debug_assert!(detect::detect().has_cmpxchg16b());
         }
     };
 }
 // Debug-only sanity check for the VMOVDQA-based load/store paths: asserts the
 // CMPXCHG16B precondition (see `debug_assert_cmpxchg16b!`) and additionally that
 // run-time detection reports VMOVDQA as atomic.
 //
 // Only defined when outline atomics are not disabled, the target is not SGX, and
 // SSE is enabled at compile time -- the same cfg as the VMOVDQA functions that use it.
 #[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
 #[cfg(target_feature = "sse")]
 macro_rules! debug_assert_vmovdqa_atomic {
     () => {{
         debug_assert_cmpxchg16b!();
         debug_assert!(detect::detect().has_vmovdqa_atomic());
     }};
 }

 // Register-operand modifier that is spliced into asm templates right after a
 // pointer operand's name (e.g. `{src` + modifier + `}`).
 //
 // Variant for targets with 32-bit pointers: `:e` is the inline-asm template
 // modifier that formats a general-purpose register operand with its 32-bit name.
 //
 // `unused_macros` is allowed because the macro is only used by code that is itself
 // cfg-gated.
 #[allow(unused_macros)]
 #[cfg(target_pointer_width = "32")]
 macro_rules! ptr_modifier {
     () => {
         ":e"
     };
 }
 // Register-operand modifier that is spliced into asm templates right after a
 // pointer operand's name (e.g. `{src` + modifier + `}`).
 //
 // Variant for targets with 64-bit pointers: no modifier is needed, so this expands
 // to an empty string and the operand is formatted with its default register name.
 //
 // `unused_macros` is allowed because the macro is only used by code that is itself
 // cfg-gated.
 #[allow(unused_macros)]
 #[cfg(target_pointer_width = "64")]
 macro_rules! ptr_modifier {
     () => {
         ""
     };
 }

 // 128-bit compare-and-swap using `lock cmpxchg16b`.
 //
 // Compares the 128-bit value at `dst` with `old`; if they are equal, `new` is
 // stored to `dst`. Returns `(previous, success)`: `previous` is the value held in
 // rdx:rax after the instruction (the value observed at `dst`), and `success` is
 // the ZF result captured with `sete`.
 //
 // When the `cmpxchg16b` target feature is not enabled at compile time, it is
 // enabled for this function only, so callers must have verified CPU support
 // before calling (checked by `debug_assert_cmpxchg16b!` in debug builds).
 //
 // Safety: see the SAFETY comment in the body.
 #[cfg_attr(
     not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
     target_feature(enable = "cmpxchg16b")
 )]
 #[inline]
 unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
     debug_assert!(dst as usize % 16 == 0);
     debug_assert_cmpxchg16b!();

     // SAFETY: the caller must guarantee that `dst` is valid for both writes and
     // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
     // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
     //
     // If the value at `dst` (destination operand) and rdx:rax are equal, the
     // 128-bit value in rcx:rbx is stored in the `dst`, otherwise the value at
     // `dst` is loaded to rdx:rax.
     //
     // The ZF flag is set if the value at `dst` and rdx:rax are equal,
     // otherwise it is cleared. Other flags are unaffected.
     //
     // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
     unsafe {
         // cmpxchg16b is always SeqCst.
         let r: u8;
         // Split the 128-bit arguments into 64-bit halves for the register pairs.
         let old = U128 { whole: old };
         let new = U128 { whole: new };
         let (prev_lo, prev_hi);
         // Local macro so the same asm can be emitted with either `edi` or `rdi`
         // as the pointer register, depending on the target pointer width.
         macro_rules! cmpxchg16b {
             ($rdi:tt) => {
                 asm!(
                     // rbx is reserved by LLVM
                     // Swap `new.lo` into rbx while saving the original rbx in `rbx_tmp`.
                     "xchg {rbx_tmp}, rbx",
                     concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                     // Capture ZF (1 = the exchange happened) into r8b.
                     "sete r8b",
                     "mov rbx, {rbx_tmp}", // restore rbx
                     rbx_tmp = inout(reg) new.pair.lo => _,
                     in("rcx") new.pair.hi,
                     inout("rax") old.pair.lo => prev_lo,
                     inout("rdx") old.pair.hi => prev_hi,
                     in($rdi) dst,
                     out("r8b") r,
                     // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                     options(nostack),
                 )
             };
         }
         #[cfg(target_pointer_width = "32")]
         cmpxchg16b!("edi");
         #[cfg(target_pointer_width = "64")]
         cmpxchg16b!("rdi");
         // Reassemble the two halves into a u128 and turn the `sete` byte into a bool.
         (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
     }
 }

 // VMOVDQA is atomic on Intel and AMD CPUs with AVX.
 // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
 //
 // Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
 //
 // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
 // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html
 // Atomic 128-bit load implemented with a single aligned vector move (`vmovdqa`).
 //
 // AVX is enabled for this function only; the caller is responsible for having
 // checked at run time that VMOVDQA is atomic on the current CPU (asserted in debug
 // builds by `debug_assert_vmovdqa_atomic!`).
 //
 // Safety: `src` must be 16-byte aligned (only checked by `debug_assert!`), and the
 // caller must uphold the safety contract referred to in the body.
 #[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
 #[cfg(target_feature = "sse")]
 #[target_feature(enable = "avx")]
 #[inline]
 unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
     debug_assert!(src as usize % 16 == 0);
     debug_assert_vmovdqa_atomic!();

     // SAFETY: the caller must uphold the safety contract.
     //
     // atomic load by vmovdqa is always SeqCst.
     unsafe {
         let out: core::arch::x86_64::__m128;
         asm!(
             // `ptr_modifier!()` selects the register name form matching the pointer width.
             concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
             src = in(reg) src,
             out = out(xmm_reg) out,
             options(nostack, preserves_flags),
         );
         // Reinterpret the 128 bits loaded into the vector register as a u128.
         core::mem::transmute(out)
     }
 }
 // Atomic 128-bit store implemented with a single aligned vector move (`vmovdqa`).
 //
 // `order` selects the code sequence: `Relaxed`/`Release` emit only the store,
 // `SeqCst` emits the store followed by `mfence`. Any other ordering hits
 // `unreachable!` and panics.
 //
 // AVX is enabled for this function only; the caller is responsible for having
 // checked at run time that VMOVDQA is atomic on the current CPU (asserted in debug
 // builds by `debug_assert_vmovdqa_atomic!`).
 //
 // Safety: `dst` must be 16-byte aligned (only checked by `debug_assert!`), and the
 // caller must uphold the safety contract referred to in the body.
 #[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
 #[cfg(target_feature = "sse")]
 #[target_feature(enable = "avx")]
 #[inline]
 unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
     debug_assert!(dst as usize % 16 == 0);
     debug_assert_vmovdqa_atomic!();

     // SAFETY: the caller must uphold the safety contract.
     unsafe {
         // Reinterpret the u128 as a vector value so it can be passed in an xmm register.
         let val: core::arch::x86_64::__m128 = core::mem::transmute(val);
         match order {
             // Relaxed and Release stores are equivalent.
             Ordering::Relaxed | Ordering::Release => {
                 asm!(
                     concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                     dst = in(reg) dst,
                     val = in(xmm_reg) val,
                     options(nostack, preserves_flags),
                 );
             }
             Ordering::SeqCst => {
                 asm!(
                     concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                     // Full fence after the store; this is the only difference from the
                     // Relaxed/Release sequence above.
                     "mfence",
                     dst = in(reg) dst,
                     val = in(xmm_reg) val,
                     options(nostack, preserves_flags),
                 );
             }
             // Remaining orderings (e.g. Acquire/AcqRel) are not handled by this store.
             _ => unreachable!("{:?}", order),
         }
     }
 }

 // Chooses, at run time, which implementation a load/store should dispatch to.
 //
 // Expands to a block expression that evaluates to one of the given identifiers:
 // - `$vmovdqa`: when detection reports that VMOVDQA is atomic;
 // - `$cmpxchg16b`: when CMPXCHG16B is available but the VMOVDQA path is not used;
 // - `fallback::$fallback`: when CMPXCHG16B is not available (only possible when
 //   CMPXCHG16B is not enabled at compile time).
 //
 // The macro is compiled out in the configurations where `atomic_load` and
 // `atomic_store` call the CMPXCHG16B implementation directly (CMPXCHG16B enabled
 // at compile time and outline atomics disabled, SGX, or SSE not enabled).
 #[cfg(not(all(
     any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
     any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
 )))]
 macro_rules! load_store_detect {
     (
         vmovdqa = $vmovdqa:ident
         cmpxchg16b = $cmpxchg16b:ident
         fallback = $fallback:ident
     ) => {{
         let cpuid = detect::detect();
         // Case 1: CMPXCHG16B is not guaranteed at compile time.
         #[cfg(not(any(
             target_feature = "cmpxchg16b",
             portable_atomic_target_feature = "cmpxchg16b",
         )))]
         {
             // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
             if cpuid.has_cmpxchg16b() {
                 // We do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
                 #[cfg(target_feature = "sse")]
                 {
                     if cpuid.has_vmovdqa_atomic() {
                         $vmovdqa
                     } else {
                         $cmpxchg16b
                     }
                 }
                 #[cfg(not(target_feature = "sse"))]
                 {
                     $cmpxchg16b
                 }
             } else {
                 fallback::$fallback
             }
         }
         // Case 2: CMPXCHG16B is enabled at compile time. Given the cfg on this macro,
         // this case is only compiled when SSE is enabled, outline atomics are not
         // disabled and the target is not SGX, so no extra `sse` cfg is needed here.
         #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
         {
             if cpuid.has_vmovdqa_atomic() {
                 $vmovdqa
             } else {
                 $cmpxchg16b
             }
         }
     }};
 }

 // Atomic 128-bit load entry point.
 //
 // `_order` is ignored: every implementation that can be selected here is used with
 // SeqCst semantics (see the comments in the body).
 //
 // Depending on the compile-time configuration this either calls
 // `atomic_load_cmpxchg16b` directly, or dispatches through `ifunc!` to the
 // implementation chosen by `load_store_detect!`.
 //
 // Safety: the caller must uphold the safety contract of the selected implementation.
 #[inline]
 unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
     // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
     // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html
     // SGX doesn't support CPUID.
     #[cfg(all(
         any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
         any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
     ))]
     // SAFETY: the caller must uphold the safety contract.
     // cfg guarantees that CMPXCHG16B is available at compile-time.
     unsafe {
         // cmpxchg16b is always SeqCst.
         atomic_load_cmpxchg16b(src)
     }
     #[cfg(not(all(
         any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
         any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
     )))]
     // SAFETY: the caller must uphold the safety contract.
     unsafe {
         // NOTE(review): `ifunc!` is defined outside this section; presumably it
         // resolves the function selected by the body and then calls it with `src`.
         ifunc!(unsafe fn(src: *mut u128) -> u128 {
             load_store_detect! {
                 vmovdqa = atomic_load_vmovdqa
                 cmpxchg16b = atomic_load_cmpxchg16b
                 // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
                 fallback = atomic_load_seqcst
             }
         })
     }
 }
 // Atomic 128-bit load implemented with `lock cmpxchg16b`.
 //
 // The instruction is issued with both the expected value (rdx:rax) and the new
 // value (rcx:rbx) set to 0: if `*src` is 0, 0 is stored back (no visible change);
 // otherwise the comparison fails and the current value is loaded into rdx:rax.
 // Either way rdx:rax holds the value of `*src` afterwards, which is returned.
 //
 // Because the instruction may write to `*src`, `src` must be valid for writes as
 // well as reads (see the SAFETY comment in the body).
 #[cfg_attr(
     not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
     target_feature(enable = "cmpxchg16b")
 )]
 #[inline]
 unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
     debug_assert!(src as usize % 16 == 0);
     debug_assert_cmpxchg16b!();

     // SAFETY: the caller must guarantee that `src` is valid for both writes and
     // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
     // cfg guarantees that the CPU supports CMPXCHG16B.
     //
     // See cmpxchg16b function for more.
     //
     // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
     // omitting the storing of condition flags and avoid use of xchg to handle rbx.
     unsafe {
         // cmpxchg16b is always SeqCst.
         let (out_lo, out_hi);
         // Local macro so the same asm can be emitted with either `edi` or `rdi`
         // as the pointer register, depending on the target pointer width.
         macro_rules! cmpxchg16b {
             ($rdi:tt) => {
                 asm!(
                     // rbx is reserved by LLVM
                     "mov {rbx_tmp}, rbx",
                     "xor rbx, rbx", // zeroed rbx
                     concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                     "mov rbx, {rbx_tmp}", // restore rbx
                     // set old/new args of cmpxchg16b to 0 (rbx is zeroed after saved to rbx_tmp, to avoid xchg)
                     rbx_tmp = out(reg) _,
                     in("rcx") 0_u64,
                     inout("rax") 0_u64 => out_lo,
                     inout("rdx") 0_u64 => out_hi,
                     in($rdi) src,
                     // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                     options(nostack),
                 )
             };
         }
         #[cfg(target_pointer_width = "32")]
         cmpxchg16b!("edi");
         #[cfg(target_pointer_width = "64")]
         cmpxchg16b!("rdi");
         // Reassemble the two 64-bit halves into the loaded u128.
         U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
     }
 }

 // Atomic 128-bit store entry point.
 //
 // Depending on the compile-time configuration this either calls
 // `atomic_store_cmpxchg16b` directly (ignoring `order`), or dispatches through
 // `ifunc!` to the implementation chosen by `load_store_detect!`, with separate
 // dispatch sites for Relaxed/Release and for SeqCst. In the dispatching
 // configuration any other ordering hits `unreachable!` and panics.
 //
 // Safety: the caller must uphold the safety contract of the selected implementation.
 #[inline]
 unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
     // Do not use vector registers on targets such as x86_64-unknown-none unless SSE is explicitly enabled.
     // https://doc.rust-lang.org/nightly/rustc/platform-support/x86_64-unknown-none.html
     // SGX doesn't support CPUID.
     #[cfg(all(
         any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
         any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
     ))]
     // SAFETY: the caller must uphold the safety contract.
     // cfg guarantees that CMPXCHG16B is available at compile-time.
     unsafe {
         // cmpxchg16b is always SeqCst.
         let _ = order;
         atomic_store_cmpxchg16b(dst, val);
     }
     #[cfg(not(all(
         any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
         any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
     )))]
     // SAFETY: the caller must uphold the safety contract.
     unsafe {
         // NOTE(review): `fn_alias!` is defined outside this section; presumably it
         // defines two-argument wrappers that call `atomic_store_vmovdqa` with the
         // ordering fixed, so that they match the `ifunc!` signature below.
         #[cfg(target_feature = "sse")]
         fn_alias! {
             #[target_feature(enable = "avx")]
             unsafe fn(dst: *mut u128, val: u128);
             // atomic store by vmovdqa has at least release semantics.
             atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
             atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
         }
         match order {
             // Relaxed and Release stores are equivalent in all implementations
             // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
             // core::arch's cmpxchg16b will never be called here.
             Ordering::Relaxed | Ordering::Release => {
                 ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                     load_store_detect! {
                         vmovdqa = atomic_store_vmovdqa_non_seqcst
                         cmpxchg16b = atomic_store_cmpxchg16b
                         fallback = atomic_store_non_seqcst
                     }
                 });
             }
             Ordering::SeqCst => {
                 ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                     load_store_detect! {
                         vmovdqa = atomic_store_vmovdqa_seqcst
                         cmpxchg16b = atomic_store_cmpxchg16b
                         fallback = atomic_store_seqcst
                     }
                 });
             }
             // Remaining orderings (e.g. Acquire/AcqRel) are not handled for stores.
             _ => unreachable!("{:?}", order),
         }
     }
 }
 // Atomic 128-bit store implemented as a CMPXCHG16B-based swap whose previous
 // value is discarded.
 //
 // When the `cmpxchg16b` target feature is not enabled at compile time, it is
 // enabled for this function only, so callers must have verified CPU support
 // before calling.
 //
 // Marked `#[inline]` like the other `*_cmpxchg16b` helpers in this file: when
 // CMPXCHG16B is enabled at compile time, `atomic_store` calls this function
 // directly, and without the attribute this thin wrapper would not be available
 // for inlining across crates.
 //
 // Safety: same contract as `atomic_swap_cmpxchg16b` (`dst` valid for reads and
 // writes, 16-byte aligned, no concurrent non-atomic operations).
 #[cfg_attr(
     not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
     target_feature(enable = "cmpxchg16b")
 )]
 #[inline]
 unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
     // SAFETY: the caller must uphold the safety contract.
     unsafe {
         // cmpxchg16b is always SeqCst.
         atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
     }
 }

 // Atomic 128-bit compare-and-exchange entry point.
 //
 // Returns `Ok(previous)` if the exchange happened and `Err(previous)` otherwise.
 // Both ordering arguments are ignored because every implementation selected here
 // is used with SeqCst semantics.
 //
 // With CMPXCHG16B enabled at compile time the asm implementation is called
 // directly; otherwise `ifunc!` picks between the asm implementation and the
 // fallback based on run-time detection.
 #[inline]
 unsafe fn atomic_compare_exchange(
     dst: *mut u128,
     old: u128,
     new: u128,
     _success: Ordering,
     _failure: Ordering,
 ) -> Result<u128, u128> {
     #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
     // SAFETY: the caller must guarantee that `dst` is valid for both writes and
     // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
     // and cfg guarantees that CMPXCHG16B is available at compile-time.
     let outcome = unsafe { cmpxchg16b(dst, old, new) };
     #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
     // SAFETY: the caller must guarantee that `dst` is valid for both writes and
     // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
     let outcome = unsafe {
         ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
             if detect::detect().has_cmpxchg16b() {
                 cmpxchg16b
             } else {
                 // Use SeqCst because cmpxchg16b is always SeqCst.
                 fallback::atomic_compare_exchange_seqcst
             }
         })
     };
     // Translate the (previous value, success flag) pair into a `Result`.
     match outcome {
         (previous, true) => Ok(previous),
         (previous, false) => Err(previous),
     }
 }

 // cmpxchg16b is always strong.
 use atomic_compare_exchange as atomic_compare_exchange_weak;

 // When CMPXCHG16B is enabled at compile time, the asm implementation below is
 // used directly as `atomic_swap`.
 #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
 use atomic_swap_cmpxchg16b as atomic_swap;
 // Atomic 128-bit swap implemented as a `lock cmpxchg16b` retry loop.
 //
 // Stores `val` to `dst` and returns the value that was replaced. `_order` is
 // ignored because cmpxchg16b is always SeqCst.
 //
 // When the `cmpxchg16b` target feature is not enabled at compile time, it is
 // enabled for this function only, so callers must have verified CPU support
 // before calling (checked by `debug_assert_cmpxchg16b!` in debug builds).
 #[cfg_attr(
     not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
     target_feature(enable = "cmpxchg16b")
 )]
 #[inline]
 unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
     debug_assert!(dst as usize % 16 == 0);
     debug_assert_cmpxchg16b!();

     // SAFETY: the caller must guarantee that `dst` is valid for both writes and
     // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
     // cfg guarantees that the CPU supports CMPXCHG16B.
     //
     // See cmpxchg16b function for more.
     //
     // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
     // omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
     //
     // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap.
     unsafe {
         // cmpxchg16b is always SeqCst.
         let val = U128 { whole: val };
         let (mut prev_lo, mut prev_hi);
         // Local macro so the same asm can be emitted with either `edi` or `rdi`
         // as the pointer register, depending on the target pointer width.
         macro_rules! cmpxchg16b {
             ($rdi:tt) => {
                 asm!(
                     // rbx is reserved by LLVM
                     // Swap `val.lo` into rbx while saving the original rbx in `rbx_tmp`.
                     "xchg {rbx_tmp}, rbx",
                     // This is not single-copy atomic reads, but this is ok because subsequent
                     // CAS will check for consistency.
                     //
                     // This is based on the code generated for the first load in DW RMWs by LLVM.
                     //
                     // Note that the C++20 memory model does not allow mixed-sized atomic access,
                     // so we must use inline assembly to implement this.
                     // (i.e., byte-wise atomic based on the standard library's atomic types
                     // cannot be used here).
                     concat!("mov rax, qword ptr [", $rdi, "]"),
                     concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                     // Retry loop: on failure cmpxchg16b reloads rdx:rax with the current
                     // value, so the next iteration compares against fresh data.
                     "2:",
                         concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                         "jne 2b",
                     "mov rbx, {rbx_tmp}", // restore rbx
                     rbx_tmp = inout(reg) val.pair.lo => _,
                     in("rcx") val.pair.hi,
                     out("rax") prev_lo,
                     out("rdx") prev_hi,
                     in($rdi) dst,
                     // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                     options(nostack),
                 )
             };
         }
         #[cfg(target_pointer_width = "32")]
         cmpxchg16b!("edi");
         #[cfg(target_pointer_width = "64")]
         cmpxchg16b!("rdi");
         // rdx:rax holds the value that was replaced by the successful exchange.
         U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
     }
 }

 /// Atomic RMW by CAS loop (3 arguments)
 /// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
 ///
 /// `$op` can use the following registers:
 /// - rsi/r8 pair: val argument (read-only for `$op`)
 /// - rax/rdx pair: previous value loaded (read-only for `$op`)
 /// - rbx/rcx pair: new value that will be stored
 // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
 // omitting the storing/comparing of condition flags and reducing uses of xchg/mov to handle rbx.
 //
 // Macro arguments: `$name` is the generated function, `$reexport_name` is the alias
 // under which it is re-exported when CMPXCHG16B is enabled at compile time, and
 // `$op` is the list of asm template strings that compute the new value in rcx:rbx.
 // The generated function returns the value that was in `*dst` before the update.
 macro_rules! atomic_rmw_cas_3 {
     ($name:ident as $reexport_name:ident, $($op:tt)*) => {
         #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
         use $name as $reexport_name;
         #[cfg_attr(
             not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
             target_feature(enable = "cmpxchg16b")
         )]
         #[inline]
         unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
             debug_assert!(dst as usize % 16 == 0);
             debug_assert_cmpxchg16b!();
             // SAFETY: the caller must guarantee that `dst` is valid for both writes and
             // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
             // cfg guarantees that the CPU supports CMPXCHG16B.
             //
             // See cmpxchg16b function for more.
             unsafe {
                 // cmpxchg16b is always SeqCst.
                 let val = U128 { whole: val };
                 let (mut prev_lo, mut prev_hi);
                 // Local macro so the same asm can be emitted with either `edi` or
                 // `rdi` as the pointer register, depending on the target pointer width.
                 macro_rules! cmpxchg16b {
                     ($rdi:tt) => {
                         asm!(
                             // rbx is reserved by LLVM
                             // Only a save is needed here (no xchg): rbx is recomputed
                             // by `$op` on every loop iteration.
                             "mov {rbx_tmp}, rbx",
                             // This is not single-copy atomic reads, but this is ok because subsequent
                             // CAS will check for consistency.
                             //
                             // This is based on the code generated for the first load in DW RMWs by LLVM.
                             //
                             // Note that the C++20 memory model does not allow mixed-sized atomic access,
                             // so we must use inline assembly to implement this.
                             // (i.e., byte-wise atomic based on the standard library's atomic types
                             // cannot be used here).
                             concat!("mov rax, qword ptr [", $rdi, "]"),
                             concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                             // Retry loop: recompute the new value from the latest
                             // rdx:rax, then try to publish it.
                             "2:",
                                 $($op)*
                                 concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                 "jne 2b",
                             "mov rbx, {rbx_tmp}", // restore rbx
                             rbx_tmp = out(reg) _,
                             out("rcx") _,
                             out("rax") prev_lo,
                             out("rdx") prev_hi,
                             in($rdi) dst,
                             in("rsi") val.pair.lo,
                             in("r8") val.pair.hi,
                             // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                             options(nostack),
                         )
                     };
                 }
                 #[cfg(target_pointer_width = "32")]
                 cmpxchg16b!("edi");
                 #[cfg(target_pointer_width = "64")]
                 cmpxchg16b!("rdi");
                 // rdx:rax holds the value that was replaced by the successful exchange.
                 U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
             }
         }
     };
 }
 /// Atomic RMW by CAS loop (2 arguments)
 /// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
 ///
 /// `$op` can use the following registers:
 /// - rax/rdx pair: previous value loaded (read-only for `$op`)
 /// - rbx/rcx pair: new value that will be stored
 // We could use CAS loop by atomic_compare_exchange here, but using an inline assembly allows
 // omitting the storing of condition flags and avoid use of xchg to handle rbx.
 //
 // Macro arguments: `$name` is the generated function, `$reexport_name` is the alias
 // under which it is re-exported when CMPXCHG16B is enabled at compile time, and
 // `$op` is the list of asm template strings that compute the new value in rcx:rbx
 // from the previous value alone (there is no `val` operand in this variant).
 // The generated function returns the value that was in `*dst` before the update.
 macro_rules! atomic_rmw_cas_2 {
     ($name:ident as $reexport_name:ident, $($op:tt)*) => {
         #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
         use $name as $reexport_name;
         #[cfg_attr(
             not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")),
             target_feature(enable = "cmpxchg16b")
         )]
         #[inline]
         unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
             debug_assert!(dst as usize % 16 == 0);
             debug_assert_cmpxchg16b!();
             // SAFETY: the caller must guarantee that `dst` is valid for both writes and
             // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
             // cfg guarantees that the CPU supports CMPXCHG16B.
             //
             // See cmpxchg16b function for more.
             unsafe {
                 // cmpxchg16b is always SeqCst.
                 let (mut prev_lo, mut prev_hi);
                 // Local macro so the same asm can be emitted with either `edi` or
                 // `rdi` as the pointer register, depending on the target pointer width.
                 macro_rules! cmpxchg16b {
                     ($rdi:tt) => {
                         asm!(
                             // rbx is reserved by LLVM
                             // Only a save is needed here (no xchg): rbx is recomputed
                             // by `$op` on every loop iteration.
                             "mov {rbx_tmp}, rbx",
                             // This is not single-copy atomic reads, but this is ok because subsequent
                             // CAS will check for consistency.
                             //
                             // This is based on the code generated for the first load in DW RMWs by LLVM.
                             //
                             // Note that the C++20 memory model does not allow mixed-sized atomic access,
                             // so we must use inline assembly to implement this.
                             // (i.e., byte-wise atomic based on the standard library's atomic types
                             // cannot be used here).
                             concat!("mov rax, qword ptr [", $rdi, "]"),
                             concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                             // Retry loop: recompute the new value from the latest
                             // rdx:rax, then try to publish it.
                             "2:",
                                 $($op)*
                                 concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                 "jne 2b",
                             "mov rbx, {rbx_tmp}", // restore rbx
                             rbx_tmp = out(reg) _,
                             out("rcx") _,
                             out("rax") prev_lo,
                             out("rdx") prev_hi,
                             in($rdi) dst,
                             // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                             options(nostack),
                         )
                     };
                 }
                 #[cfg(target_pointer_width = "32")]
                 cmpxchg16b!("edi");
                 #[cfg(target_pointer_width = "64")]
                 cmpxchg16b!("rdi");
                 // rdx:rax holds the value that was replaced by the successful exchange.
                 U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
             }
         }
     };
 }

 // Arithmetic and bitwise RMW operations generated by `atomic_rmw_cas_3!`.
 // In every `$op` sequence: rdx:rax = previous value, r8:rsi = `val`,
 // rcx:rbx = new value to store.

 // new = prev + val (128-bit add: low halves with `add`, high halves with `adc`
 // so the carry out of the low half is propagated).
 atomic_rmw_cas_3! {
     atomic_add_cmpxchg16b as atomic_add,
     "mov rbx, rax",
     "add rbx, rsi",
     "mov rcx, rdx",
     "adc rcx, r8",
 }
 // new = prev - val (128-bit subtract: low halves with `sub`, high halves with
 // `sbb` so the borrow out of the low half is propagated).
 atomic_rmw_cas_3! {
     atomic_sub_cmpxchg16b as atomic_sub,
     "mov rbx, rax",
     "sub rbx, rsi",
     "mov rcx, rdx",
     "sbb rcx, r8",
 }
 // new = prev & val
 atomic_rmw_cas_3! {
     atomic_and_cmpxchg16b as atomic_and,
     "mov rbx, rax",
     "and rbx, rsi",
     "mov rcx, rdx",
     "and rcx, r8",
 }
 // new = !(prev & val)
 atomic_rmw_cas_3! {
     atomic_nand_cmpxchg16b as atomic_nand,
     "mov rbx, rax",
     "and rbx, rsi",
     "not rbx",
     "mov rcx, rdx",
     "and rcx, r8",
     "not rcx",
 }
 // new = prev | val
 atomic_rmw_cas_3! {
     atomic_or_cmpxchg16b as atomic_or,
     "mov rbx, rax",
     "or rbx, rsi",
     "mov rcx, rdx",
     "or rcx, r8",
 }
 // new = prev ^ val
 atomic_rmw_cas_3! {
     atomic_xor_cmpxchg16b as atomic_xor,
     "mov rbx, rax",
     "xor rbx, rsi",
     "mov rcx, rdx",
     "xor rcx, r8",
 }

 // Unary RMW operations generated by `atomic_rmw_cas_2!`.
 // In every `$op` sequence: rdx:rax = previous value, rcx:rbx = new value to store.

 // new = !prev
 atomic_rmw_cas_2! {
     atomic_not_cmpxchg16b as atomic_not,
     "mov rbx, rax",
     "not rbx",
     "mov rcx, rdx",
     "not rcx",
 }
 // new = 0 - prev (128-bit two's-complement negation).
 // `neg rbx` negates the low half and leaves the borrow in CF; `mov rcx, 0` zeroes
 // rcx without modifying flags, so the following `sbb` computes 0 - prev.hi - CF.
 atomic_rmw_cas_2! {
     atomic_neg_cmpxchg16b as atomic_neg,
     "mov rbx, rax",
     "neg rbx",
     "mov rcx, 0",
     "sbb rcx, rdx",
 }

 // Min/max RMW operations generated by `atomic_rmw_cas_3!`.
 // In every `$op` sequence: rdx:rax = previous value, r8:rsi = `val`,
 // rcx:rbx = new value to store.
 //
 // All four share the same shape:
 // 1. `cmp rsi, rax` + `sbb rcx, rdx` (on a copy of r8) performs the 128-bit
 //    subtraction `val - prev` purely for its flags; the result in rcx is discarded.
 // 2. The new value is first set to `val` with `mov` (which does not modify flags)
 //    and then conditionally replaced by `prev` with `cmovCC`, where the condition
 //    code is the only difference between the four operations.

 // Signed max: keep `prev` if val < prev (signed), otherwise store `val`.
 atomic_rmw_cas_3! {
     atomic_max_cmpxchg16b as atomic_max,
     "cmp rsi, rax",
     "mov rcx, r8",
     "sbb rcx, rdx",
     "mov rcx, r8",
     "cmovl rcx, rdx",
     "mov rbx, rsi",
     "cmovl rbx, rax",
 }
 // Unsigned max: keep `prev` if val < prev (unsigned), otherwise store `val`.
 atomic_rmw_cas_3! {
     atomic_umax_cmpxchg16b as atomic_umax,
     "cmp rsi, rax",
     "mov rcx, r8",
     "sbb rcx, rdx",
     "mov rcx, r8",
     "cmovb rcx, rdx",
     "mov rbx, rsi",
     "cmovb rbx, rax",
 }
 // Signed min: keep `prev` if val >= prev (signed), otherwise store `val`.
 atomic_rmw_cas_3! {
     atomic_min_cmpxchg16b as atomic_min,
     "cmp rsi, rax",
     "mov rcx, r8",
     "sbb rcx, rdx",
     "mov rcx, r8",
     "cmovge rcx, rdx",
     "mov rbx, rsi",
     "cmovge rbx, rax",
 }
 // Unsigned min: keep `prev` if val >= prev (unsigned), otherwise store `val`.
 atomic_rmw_cas_3! {
     atomic_umin_cmpxchg16b as atomic_umin,
     "cmp rsi, rax",
     "mov rcx, r8",
     "sbb rcx, rdx",
     "mov rcx, r8",
     "cmovae rcx, rdx",
     "mov rbx, rsi",
     "cmovae rbx, rax",
 }

 // Generates the run-time-dispatching entry point for an RMW operation.
 //
 // The generated function `$name` only exists when CMPXCHG16B is NOT enabled at
 // compile time (otherwise the `atomic_rmw_cas_*!` / swap definitions re-export the
 // asm implementation under the same name). It takes the listed arguments plus a
 // trailing, ignored `_order: Ordering`, and dispatches through `ifunc!` to either
 // `$cmpxchg16b_fn` (with the ordering fixed to SeqCst) or
 // `fallback::$seqcst_fallback_fn`, depending on run-time detection.
 macro_rules! atomic_rmw_with_ifunc {
     (
         unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
         cmpxchg16b = $cmpxchg16b_fn:ident;
         fallback = $seqcst_fallback_fn:ident;
     ) => {
         #[cfg(not(any(
             target_feature = "cmpxchg16b",
             portable_atomic_target_feature = "cmpxchg16b",
         )))]
         #[inline]
         unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
             // NOTE(review): `fn_alias!` is defined outside this section; presumably it
             // defines `cmpxchg16b_seqcst_fn` as a wrapper around `$cmpxchg16b_fn` with
             // the ordering argument fixed, so its signature matches the `ifunc!` below.
             fn_alias! {
                 #[cfg_attr(
                     not(any(
                         target_feature = "cmpxchg16b",
                         portable_atomic_target_feature = "cmpxchg16b",
                     )),
                     target_feature(enable = "cmpxchg16b")
                 )]
                 unsafe fn($($arg)*) $(-> $ret_ty)?;
                 // cmpxchg16b is always SeqCst.
                 cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
             }
             // SAFETY: the caller must uphold the safety contract.
             // we only call cmpxchg16b_fn if cmpxchg16b is available.
             unsafe {
                 ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                     if detect::detect().has_cmpxchg16b() {
                         cmpxchg16b_seqcst_fn
                     } else {
                         // Use SeqCst because cmpxchg16b is always SeqCst.
                         fallback::$seqcst_fallback_fn
                     }
                 })
             }
         }
     };
 }

 // Run-time-dispatching entry points for every RMW operation. Each instantiation
 // pairs the asm implementation (`*_cmpxchg16b`) with the SeqCst fallback of the
 // same operation; the generated functions only exist when CMPXCHG16B is not
 // enabled at compile time (see `atomic_rmw_with_ifunc!`).

 // Two-operand operations: (dst, val) -> previous value.
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_swap_cmpxchg16b;
     fallback = atomic_swap_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_add_cmpxchg16b;
     fallback = atomic_add_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_sub_cmpxchg16b;
     fallback = atomic_sub_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_and_cmpxchg16b;
     fallback = atomic_and_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_nand_cmpxchg16b;
     fallback = atomic_nand_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_or_cmpxchg16b;
     fallback = atomic_or_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_xor_cmpxchg16b;
     fallback = atomic_xor_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_max_cmpxchg16b;
     fallback = atomic_max_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_umax_cmpxchg16b;
     fallback = atomic_umax_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_min_cmpxchg16b;
     fallback = atomic_min_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
     cmpxchg16b = atomic_umin_cmpxchg16b;
     fallback = atomic_umin_seqcst;
 }
 // One-operand operations: (dst) -> previous value.
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_not(dst: *mut u128) -> u128;
     cmpxchg16b = atomic_not_cmpxchg16b;
     fallback = atomic_not_seqcst;
 }
 atomic_rmw_with_ifunc! {
     unsafe fn atomic_neg(dst: *mut u128) -> u128;
     cmpxchg16b = atomic_neg_cmpxchg16b;
     fallback = atomic_neg_seqcst;
 }

 // Reports whether 128-bit atomic operations are lock-free in this build/run:
 // decided by run-time detection of CMPXCHG16B unless the feature is already
 // enabled at compile time, in which case the answer is always `true`.
 #[inline]
 fn is_lock_free() -> bool {
     #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
     {
         // Not statically known: ask the run-time CPU feature detection.
         detect::detect().has_cmpxchg16b()
     }
     #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
     {
         // CMPXCHG16B is available at compile-time.
         true
     }
 }
 // Compile-time counterpart of `is_lock_free`: `true` only when the `cmpxchg16b`
 // target feature is enabled at compile time (through either `target_feature` or
 // `portable_atomic_target_feature`), i.e. when no run-time detection is needed.
 const IS_ALWAYS_LOCK_FREE: bool =
     cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));

 // Instantiate `atomic128!` for the signed and unsigned 128-bit types, passing the
 // signed max/min operations for `i128` and the unsigned ones for `u128`.
 // NOTE(review): `atomic128!` is not defined in this section (presumably it comes
 // from the `include!`d macros.rs and defines the named atomic types on top of the
 // `atomic_*` functions above) -- confirm against its definition.
 atomic128!(AtomicI128, i128, atomic_max, atomic_min);
 atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);

 // Unit tests for this implementation, compiled only under `cfg(test)`.
 // The test bodies come from the `test_atomic_int!` and `stress_test!` macros,
 // which are defined outside this section.
 // The clippy allows cover the glob import below and unsafe blocks without SAFETY
 // comments (NOTE(review): presumably inside the macro-generated tests -- confirm).
 #[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
 #[cfg(test)]
 mod tests {
     use super::*;

     test_atomic_int!(i128);
     test_atomic_int!(u128);

     // load/store/swap implementation is not affected by signedness, so it is
     // enough to test only unsigned types.
     stress_test!(u128);
 }