| //! SHA-256 `aarch64` backend. |
| |
| // Implementation adapted from mbedtls. |
| |
| // TODO: stdarch intrinsics: RustCrypto/hashes#257 |
| |
| use core::arch::{aarch64::*, asm}; |
| |
| use crate::consts::K32; |
| |
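// Runtime detection of the ARMv8 SHA-256 extensions (the "sha2" target feature).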
| cpufeatures::new!(sha2_hwcap, "sha2"); |
| |
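/// Compresses `blocks` into `state`, dispatching at runtime to the
/// hardware implementation when the CPU supports the ARMv8 SHA-256
/// extensions and falling back to the portable software backend otherwise.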
| pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { |
| // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 |
| // after stabilization |
| if sha2_hwcap::get() { |
| unsafe { sha256_compress(state, blocks) } |
| } else { |
| super::soft::compress(state, blocks); |
| } |
| } |
| |
| #[target_feature(enable = "sha2")] |
| unsafe fn sha256_compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { |
    // SAFETY: the caller must ensure that the CPU supports the `sha2`
    // target feature; `compress` above verifies this via `cpufeatures`
    // before dispatching here.
| |
| // Load state into vectors. |
| let mut abcd = vld1q_u32(state[0..4].as_ptr()); |
| let mut efgh = vld1q_u32(state[4..8].as_ptr()); |
| |
| // Iterate through the message blocks. |
| for block in blocks { |
| // Keep original state values. |
| let abcd_orig = abcd; |
| let efgh_orig = efgh; |
| |
        // Load the message block into vectors, byte-swapping each 32-bit
        // word from big-endian (the SHA-256 message word order) to the
        // native little-endian lane order.
| let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[0..16].as_ptr()))); |
| let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[16..32].as_ptr()))); |
| let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[32..48].as_ptr()))); |
| let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(block[48..64].as_ptr()))); |
| |
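        // Each SHA256H/SHA256H2 pair below advances the hash by four
        // rounds: `tmp` holds the four message-schedule words already
        // summed with their round constants, SHA256H computes the new
        // ABCD half of the state, and SHA256H2 the new EFGH half. Both
        // instructions need the pre-update ABCD value, hence `abcd_prev`.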
| // Rounds 0 to 3 |
| let mut tmp = vaddq_u32(s0, vld1q_u32(&K32[0])); |
| let mut abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| |
| // Rounds 4 to 7 |
| tmp = vaddq_u32(s1, vld1q_u32(&K32[4])); |
| abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| |
| // Rounds 8 to 11 |
| tmp = vaddq_u32(s2, vld1q_u32(&K32[8])); |
| abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| |
| // Rounds 12 to 15 |
| tmp = vaddq_u32(s3, vld1q_u32(&K32[12])); |
| abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| |
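        // Rounds 16 to 63: the remaining message-schedule words are
        // computed on the fly. For each group of four, SHA256SU0 and
        // SHA256SU1 together derive W[t..t + 4] from the previous
        // sixteen schedule words.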
| for t in (16..64).step_by(16) { |
| // Rounds t to t + 3 |
| s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3); |
| tmp = vaddq_u32(s0, vld1q_u32(&K32[t])); |
| abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| |
| // Rounds t + 4 to t + 7 |
| s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0); |
| tmp = vaddq_u32(s1, vld1q_u32(&K32[t + 4])); |
| abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| |
| // Rounds t + 8 to t + 11 |
| s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1); |
| tmp = vaddq_u32(s2, vld1q_u32(&K32[t + 8])); |
| abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| |
| // Rounds t + 12 to t + 15 |
| s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2); |
| tmp = vaddq_u32(s3, vld1q_u32(&K32[t + 12])); |
| abcd_prev = abcd; |
| abcd = vsha256hq_u32(abcd_prev, efgh, tmp); |
| efgh = vsha256h2q_u32(efgh, abcd_prev, tmp); |
| } |
| |
        // Feed forward: add the compressed block state back into the
        // state from the start of the block (Davies-Meyer construction).
| abcd = vaddq_u32(abcd, abcd_orig); |
| efgh = vaddq_u32(efgh, efgh_orig); |
| } |
| |
| // Store vectors into state. |
| vst1q_u32(state[0..4].as_mut_ptr(), abcd); |
| vst1q_u32(state[4..8].as_mut_ptr(), efgh); |
| } |
| |
| // TODO remove these polyfills once SHA2 intrinsics land |
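//
// Each helper wraps a single ARMv8 crypto-extension instruction via
// inline assembly. In the `asm!` templates, the `q` modifier formats the
// SIMD operand as a 128-bit Q register and the `v` modifier as a V
// register, here with an explicit `.4S` (four 32-bit lanes) arrangement.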
| |
| #[inline(always)] |
unsafe fn vsha256hq_u32(
    mut hash_abcd: uint32x4_t,
    hash_efgh: uint32x4_t,
    wk: uint32x4_t,
) -> uint32x4_t {
    asm!(
        "SHA256H {:q}, {:q}, {:v}.4S",
        inout(vreg) hash_abcd, in(vreg) hash_efgh, in(vreg) wk,
        options(pure, nomem, nostack, preserves_flags)
    );
    hash_abcd
}
| |
| #[inline(always)] |
| unsafe fn vsha256h2q_u32( |
| mut hash_efgh: uint32x4_t, |
| hash_abcd: uint32x4_t, |
| wk: uint32x4_t, |
| ) -> uint32x4_t { |
| asm!( |
| "SHA256H2 {:q}, {:q}, {:v}.4S", |
| inout(vreg) hash_efgh, in(vreg) hash_abcd, in(vreg) wk, |
| options(pure, nomem, nostack, preserves_flags) |
| ); |
| hash_efgh |
| } |
| |
| #[inline(always)] |
| unsafe fn vsha256su0q_u32(mut w0_3: uint32x4_t, w4_7: uint32x4_t) -> uint32x4_t { |
| asm!( |
| "SHA256SU0 {:v}.4S, {:v}.4S", |
| inout(vreg) w0_3, in(vreg) w4_7, |
| options(pure, nomem, nostack, preserves_flags) |
| ); |
| w0_3 |
| } |
| |
| #[inline(always)] |
| unsafe fn vsha256su1q_u32( |
| mut tw0_3: uint32x4_t, |
| w8_11: uint32x4_t, |
| w12_15: uint32x4_t, |
| ) -> uint32x4_t { |
| asm!( |
| "SHA256SU1 {:v}.4S, {:v}.4S, {:v}.4S", |
| inout(vreg) tw0_3, in(vreg) w8_11, in(vreg) w12_15, |
| options(pure, nomem, nostack, preserves_flags) |
| ); |
| tw0_3 |
| } |