| // Implementation adapted from mbedtls. |
| |
| use core::arch::{aarch64::*, asm}; |
| |
| use crate::consts::K64; |
| |
| cpufeatures::new!(sha3_hwcap, "sha3"); |
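// The macro above generates a cached runtime check for the `sha3` hwcap; on AArch64
// that target feature also gates the SHA-512 instructions (SHA512H, SHA512H2,
// SHA512SU0, SHA512SU1) used by the hardware backend below.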
| |
| pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { |
| // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 |
| // after stabilization |
| if sha3_hwcap::get() { |
| unsafe { sha512_compress(state, blocks) } |
| } else { |
| super::soft::compress(state, blocks); |
| } |
| } |
| |
| #[target_feature(enable = "sha3")] |
| unsafe fn sha512_compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { |
    // SAFETY: the caller must ensure the CPU supports the `sha3` extension,
    // which gates the SHA-512 instructions used below.
| |
| // Load state into vectors. |
| let mut ab = vld1q_u64(state[0..2].as_ptr()); |
| let mut cd = vld1q_u64(state[2..4].as_ptr()); |
| let mut ef = vld1q_u64(state[4..6].as_ptr()); |
| let mut gh = vld1q_u64(state[6..8].as_ptr()); |
| |
| // Iterate through the message blocks. |
| for block in blocks { |
| // Keep original state values. |
| let ab_orig = ab; |
| let cd_orig = cd; |
| let ef_orig = ef; |
| let gh_orig = gh; |
| |
        // Load the message block into vectors. SHA-512 message words are
        // big-endian, so byte-swap each 64-bit lane with `vrev64q_u8`; this
        // assumes a little-endian target.
| let mut s0 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[0..16].as_ptr()))); |
| let mut s1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[16..32].as_ptr()))); |
| let mut s2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[32..48].as_ptr()))); |
| let mut s3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[48..64].as_ptr()))); |
| let mut s4 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[64..80].as_ptr()))); |
| let mut s5 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[80..96].as_ptr()))); |
| let mut s6 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[96..112].as_ptr()))); |
| let mut s7 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(block[112..128].as_ptr()))); |
| |
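        // Each step below covers two SHA-512 rounds: SHA512H/SHA512H2 update one
        // pair of working variables, the `vextq_u64` shuffles provide the shifted
        // views of the neighbouring pairs that those instructions expect, and the
        // pairs (ab, cd, ef, gh) rotate roles from one step to the next.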
| // Rounds 0 and 1 |
| let mut initial_sum = vaddq_u64(s0, vld1q_u64(&K64[0])); |
| let mut sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh); |
| let mut intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); |
| gh = vsha512h2q_u64(intermed, cd, ab); |
| cd = vaddq_u64(cd, intermed); |
| |
| // Rounds 2 and 3 |
| initial_sum = vaddq_u64(s1, vld1q_u64(&K64[2])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef); |
| intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); |
| ef = vsha512h2q_u64(intermed, ab, gh); |
| ab = vaddq_u64(ab, intermed); |
| |
| // Rounds 4 and 5 |
| initial_sum = vaddq_u64(s2, vld1q_u64(&K64[4])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd); |
| intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); |
| cd = vsha512h2q_u64(intermed, gh, ef); |
| gh = vaddq_u64(gh, intermed); |
| |
| // Rounds 6 and 7 |
| initial_sum = vaddq_u64(s3, vld1q_u64(&K64[6])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab); |
| intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); |
| ab = vsha512h2q_u64(intermed, ef, cd); |
| ef = vaddq_u64(ef, intermed); |
| |
| // Rounds 8 and 9 |
| initial_sum = vaddq_u64(s4, vld1q_u64(&K64[8])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh); |
| intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); |
| gh = vsha512h2q_u64(intermed, cd, ab); |
| cd = vaddq_u64(cd, intermed); |
| |
| // Rounds 10 and 11 |
| initial_sum = vaddq_u64(s5, vld1q_u64(&K64[10])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef); |
| intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); |
| ef = vsha512h2q_u64(intermed, ab, gh); |
| ab = vaddq_u64(ab, intermed); |
| |
| // Rounds 12 and 13 |
| initial_sum = vaddq_u64(s6, vld1q_u64(&K64[12])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd); |
| intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); |
| cd = vsha512h2q_u64(intermed, gh, ef); |
| gh = vaddq_u64(gh, intermed); |
| |
| // Rounds 14 and 15 |
| initial_sum = vaddq_u64(s7, vld1q_u64(&K64[14])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab); |
| intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); |
| ab = vsha512h2q_u64(intermed, ef, cd); |
| ef = vaddq_u64(ef, intermed); |
| |
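        // Rounds 16..79 repeat the same two-round steps, but each step first
        // extends the message schedule: SHA512SU0/SHA512SU1 compute the next pair
        // of schedule words from earlier ones (the sigma0/sigma1 expansion of the
        // scalar implementation), with `vextq_u64` selecting the odd-offset words
        // that straddle two vectors.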
| for t in (16..80).step_by(16) { |
| // Rounds t and t + 1 |
| s0 = vsha512su1q_u64(vsha512su0q_u64(s0, s1), s7, vextq_u64(s4, s5, 1)); |
| initial_sum = vaddq_u64(s0, vld1q_u64(&K64[t])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh); |
| intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); |
| gh = vsha512h2q_u64(intermed, cd, ab); |
| cd = vaddq_u64(cd, intermed); |
| |
| // Rounds t + 2 and t + 3 |
| s1 = vsha512su1q_u64(vsha512su0q_u64(s1, s2), s0, vextq_u64(s5, s6, 1)); |
| initial_sum = vaddq_u64(s1, vld1q_u64(&K64[t + 2])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef); |
| intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); |
| ef = vsha512h2q_u64(intermed, ab, gh); |
| ab = vaddq_u64(ab, intermed); |
| |
| // Rounds t + 4 and t + 5 |
| s2 = vsha512su1q_u64(vsha512su0q_u64(s2, s3), s1, vextq_u64(s6, s7, 1)); |
| initial_sum = vaddq_u64(s2, vld1q_u64(&K64[t + 4])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd); |
| intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); |
| cd = vsha512h2q_u64(intermed, gh, ef); |
| gh = vaddq_u64(gh, intermed); |
| |
| // Rounds t + 6 and t + 7 |
| s3 = vsha512su1q_u64(vsha512su0q_u64(s3, s4), s2, vextq_u64(s7, s0, 1)); |
| initial_sum = vaddq_u64(s3, vld1q_u64(&K64[t + 6])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab); |
| intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); |
| ab = vsha512h2q_u64(intermed, ef, cd); |
| ef = vaddq_u64(ef, intermed); |
| |
| // Rounds t + 8 and t + 9 |
| s4 = vsha512su1q_u64(vsha512su0q_u64(s4, s5), s3, vextq_u64(s0, s1, 1)); |
| initial_sum = vaddq_u64(s4, vld1q_u64(&K64[t + 8])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), gh); |
| intermed = vsha512hq_u64(sum, vextq_u64(ef, gh, 1), vextq_u64(cd, ef, 1)); |
| gh = vsha512h2q_u64(intermed, cd, ab); |
| cd = vaddq_u64(cd, intermed); |
| |
| // Rounds t + 10 and t + 11 |
| s5 = vsha512su1q_u64(vsha512su0q_u64(s5, s6), s4, vextq_u64(s1, s2, 1)); |
| initial_sum = vaddq_u64(s5, vld1q_u64(&K64[t + 10])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ef); |
| intermed = vsha512hq_u64(sum, vextq_u64(cd, ef, 1), vextq_u64(ab, cd, 1)); |
| ef = vsha512h2q_u64(intermed, ab, gh); |
| ab = vaddq_u64(ab, intermed); |
| |
| // Rounds t + 12 and t + 13 |
| s6 = vsha512su1q_u64(vsha512su0q_u64(s6, s7), s5, vextq_u64(s2, s3, 1)); |
| initial_sum = vaddq_u64(s6, vld1q_u64(&K64[t + 12])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), cd); |
| intermed = vsha512hq_u64(sum, vextq_u64(ab, cd, 1), vextq_u64(gh, ab, 1)); |
| cd = vsha512h2q_u64(intermed, gh, ef); |
| gh = vaddq_u64(gh, intermed); |
| |
| // Rounds t + 14 and t + 15 |
| s7 = vsha512su1q_u64(vsha512su0q_u64(s7, s0), s6, vextq_u64(s3, s4, 1)); |
| initial_sum = vaddq_u64(s7, vld1q_u64(&K64[t + 14])); |
| sum = vaddq_u64(vextq_u64(initial_sum, initial_sum, 1), ab); |
| intermed = vsha512hq_u64(sum, vextq_u64(gh, ab, 1), vextq_u64(ef, gh, 1)); |
| ab = vsha512h2q_u64(intermed, ef, cd); |
| ef = vaddq_u64(ef, intermed); |
| } |
| |
| // Add the block-specific state to the original state. |
| ab = vaddq_u64(ab, ab_orig); |
| cd = vaddq_u64(cd, cd_orig); |
| ef = vaddq_u64(ef, ef_orig); |
| gh = vaddq_u64(gh, gh_orig); |
| } |
| |
| // Store vectors into state. |
| vst1q_u64(state[0..2].as_mut_ptr(), ab); |
| vst1q_u64(state[2..4].as_mut_ptr(), cd); |
| vst1q_u64(state[4..6].as_mut_ptr(), ef); |
| vst1q_u64(state[6..8].as_mut_ptr(), gh); |
| } |
| |
// TODO: remove these polyfills once the SHA3 (`vsha512*`) intrinsics land in `core::arch`.
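// The functions below mirror the ACLE `vsha512*q_u64` intrinsics with inline
// assembly. The `{:q}` operand modifier emits the 128-bit `Qn` register name and
// `{:v}` the `Vn` name (needed for the `.2D` arrangement); `options(pure, nomem)`
// lets the compiler treat each polyfill as a pure function of its register inputs.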
| |
| #[inline(always)] |
| unsafe fn vsha512hq_u64( |
| mut hash_ed: uint64x2_t, |
| hash_gf: uint64x2_t, |
| kwh_kwh2: uint64x2_t, |
| ) -> uint64x2_t { |
| asm!( |
| "SHA512H {:q}, {:q}, {:v}.2D", |
| inout(vreg) hash_ed, in(vreg) hash_gf, in(vreg) kwh_kwh2, |
| options(pure, nomem, nostack, preserves_flags) |
| ); |
| hash_ed |
| } |
| |
| #[inline(always)] |
| unsafe fn vsha512h2q_u64( |
| mut sum_ab: uint64x2_t, |
| hash_c_: uint64x2_t, |
| hash_ab: uint64x2_t, |
| ) -> uint64x2_t { |
| asm!( |
| "SHA512H2 {:q}, {:q}, {:v}.2D", |
| inout(vreg) sum_ab, in(vreg) hash_c_, in(vreg) hash_ab, |
| options(pure, nomem, nostack, preserves_flags) |
| ); |
| sum_ab |
| } |
| |
| #[inline(always)] |
| unsafe fn vsha512su0q_u64(mut w0_1: uint64x2_t, w2_: uint64x2_t) -> uint64x2_t { |
| asm!( |
| "SHA512SU0 {:v}.2D, {:v}.2D", |
| inout(vreg) w0_1, in(vreg) w2_, |
| options(pure, nomem, nostack, preserves_flags) |
| ); |
| w0_1 |
| } |
| |
| #[inline(always)] |
| unsafe fn vsha512su1q_u64( |
| mut s01_s02: uint64x2_t, |
| w14_15: uint64x2_t, |
| w9_10: uint64x2_t, |
| ) -> uint64x2_t { |
| asm!( |
| "SHA512SU1 {:v}.2D, {:v}.2D, {:v}.2D", |
| inout(vreg) s01_s02, in(vreg) w14_15, in(vreg) w9_10, |
| options(pure, nomem, nostack, preserves_flags) |
| ); |
| s01_s02 |
| } |