// blob: 0de4350cb23746ccd6afb936f57a4d54983abb11 [file] [log] [blame]
// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build (ppc64 || ppc64le) && !purego
#include "textflag.h"
// func xorBytes(dst, a, b *byte, n int)
//
// xorBytes stores a[i] ^ b[i] into dst[i] for every i in [0, n).
// The four arguments are read from the argument frame at offsets 0, 8, 16
// and 24 of FP. Nothing is returned.
//
// Strategy: 64-byte vector loop, then at most one 32-byte step, at most one
// 16-byte step, and finally a tail of 1..15 bytes. The tail is done with a
// single length-controlled vector load/store when built with
// GOPPC64_power10, and with 8/4/2/1-byte scalar steps otherwise.
//
// Register roles:
//   R3  = dst, R4 = a, R5 = b (base pointers, never modified)
//   R6  = number of bytes still to process
//   R8  = byte index of the next unprocessed element
//   R7  = iteration count for the 64-byte loop (copied to CTR)
//   R9  = n mod 64 while the loop runs; reused as a scratch offset in xor32
//   R10, R14, R15 = R8+16, R8+32, R8+48 inside the 64-byte loop
//   R14-R16 = scalar temporaries in the non-POWER10 tail
//   R17-R20 = length and effective addresses in the POWER10 tail
//   VS32-VS39, V0, V1 = vector temporaries
//   R0  = used as the constant zero; this relies on the Go ppc64 assembler
//         convention that R0 always holds 0.
TEXT ·xorBytes(SB), NOSPLIT, $0
MOVD dst+0(FP), R3 // R3 = dst
MOVD a+8(FP), R4 // R4 = a
MOVD b+16(FP), R5 // R5 = b
MOVD n+24(FP), R6 // R6 = n
// Two unsigned compares are issued up front into separate CR fields so that
// both branches below can be taken without a further compare.
CMPU R6, $64, CR7 // CR7 = compare n with 64
MOVD R0, R8 // R8 = index = 0
CMPU R6, $8, CR6 // CR6 = compare n with 8
BLE CR6, small // n <= 8: go straight to the tail code
BLT CR7, xor32 // 8 < n < 64: skip the 64-byte loop
// Case for n >= 64 bytes
preloop64:
SRD $6, R6, R7 // R7 = n / 64 = number of loop iterations
MOVD R7, CTR // CTR drives the BDNZ below
MOVD $16, R10 // R10 = offset of the 2nd vector in each 64-byte group
MOVD $32, R14 // R14 = offset of the 3rd vector
MOVD $48, R15 // R15 = offset of the 4th vector
// R9 = n mod 64 (trailing bytes). The CC form also records the result in
// CR0; that CR0 value is consumed by the BC after the loop. None of the
// instructions inside the loop is a record (CC) form or a compare, so CR0
// is still the ANDCC result when the loop exits.
ANDCC $63, R6, R9
PCALIGN $16 // Align the loop entry to a 16-byte boundary
// Process 64 bytes per iteration:
// load 4 vectors from a and 4 from b, XOR them pairwise,
// and store the 4 results to dst.
loop64:
LXVD2X (R4)(R8), VS32 // VS32 = a[i    .. i+15]
LXVD2X (R4)(R10), VS34 // VS34 = a[i+16 .. i+31]
LXVD2X (R4)(R14), VS36 // VS36 = a[i+32 .. i+47]
LXVD2X (R4)(R15), VS38 // VS38 = a[i+48 .. i+63]
LXVD2X (R5)(R8), VS33 // VS33 = b[i    .. i+15]
LXVD2X (R5)(R10), VS35 // VS35 = b[i+16 .. i+31]
LXVD2X (R5)(R14), VS37 // VS37 = b[i+32 .. i+47]
LXVD2X (R5)(R15), VS39 // VS39 = b[i+48 .. i+63]
XXLXOR VS32, VS33, VS32 // VS32 = a ^ b for each of the four vectors
XXLXOR VS34, VS35, VS34
XXLXOR VS36, VS37, VS36
XXLXOR VS38, VS39, VS38
STXVD2X VS32, (R3)(R8) // dst[i    .. i+15]
STXVD2X VS34, (R3)(R10) // dst[i+16 .. i+31]
STXVD2X VS36, (R3)(R14) // dst[i+32 .. i+47]
STXVD2X VS38, (R3)(R15) // dst[i+48 .. i+63]
ADD $64, R8 // Advance all four offsets by one 64-byte group
ADD $64, R10
ADD $64, R14
ADD $64, R15
BDNZ loop64 // Decrement CTR, loop while it is non-zero
BC 12,2,LR // BEQLR: return if n mod 64 == 0 (CR0 from the ANDCC above)
MOVD R9, R6 // R6 = trailing byte count (1..63)
CMP R6, $8
BLE small // <= 8 bytes left: go to the tail code
// Here 8 < R6 < 64.
// Process 32 bytes if available
xor32:
CMP R6, $32
BLT xor16 // Fewer than 32 bytes left
ADD $16, R8, R9 // R9 = offset of the second 16-byte vector
LXVD2X (R4)(R8), VS32 // VS32 = a[i    .. i+15]
LXVD2X (R4)(R9), VS33 // VS33 = a[i+16 .. i+31]
LXVD2X (R5)(R8), VS34 // VS34 = b[i    .. i+15]
LXVD2X (R5)(R9), VS35 // VS35 = b[i+16 .. i+31]
XXLXOR VS32, VS34, VS32
XXLXOR VS33, VS35, VS33
STXVD2X VS32, (R3)(R8)
STXVD2X VS33, (R3)(R9)
ADD $32, R8 // index += 32
ADD $-32, R6 // remaining -= 32
CMP R6, $8
BLE small // <= 8 bytes left: go to the tail code
// Here 8 < R6 < 32.
// Process 16 bytes if available
xor16:
CMP R6, $16
BLT xor8 // 9..15 bytes left (non-zero, so the check at small is skipped)
LXVD2X (R4)(R8), VS32 // VS32 = a[i .. i+15]
LXVD2X (R5)(R8), VS33 // VS33 = b[i .. i+15]
XXLXOR VS32, VS33, VS32
STXVD2X VS32, (R3)(R8)
ADD $16, R8 // index += 16
ADD $-16, R6 // remaining -= 16
// Fewer than 16 bytes remain from here on; return if there are none.
small:
CMP R6, R0
BC 12,2,LR // BEQLR
// Tail: 1..15 bytes remain.
xor8:
#ifdef GOPPC64_power10
// POWER10 build: handle the whole tail with one length-controlled vector
// load per input and one length-controlled store.
// NOTE(review): per the Power ISA, LXVL/STXVL read the byte count from the
// most significant byte of the length register, hence the shift by 56.
SLD $56,R6,R17
ADD R4,R8,R18 // R18 = &a[index]
ADD R5,R8,R19 // R19 = &b[index]
ADD R3,R8,R20 // R20 = &dst[index]
LXVL R18,R17,V0 // V0 = remaining bytes of a
LXVL R19,R17,V1 // V1 = remaining bytes of b
VXOR V0,V1,V1 // V1 = a ^ b
STXVL V1,R20,R17 // Store only the remaining bytes to dst
RET
#else
// Generic build: peel off 8, 4, 2 and then 1 byte as the remaining count allows.
CMP R6, $8
BLT xor4
// At least 8 bytes remain: XOR one doubleword
MOVD (R4)(R8), R14 // R14 = a[i,...,i+7]
MOVD (R5)(R8), R15 // R15 = b[i,...,i+7]
XOR R14, R15, R16 // R16 = a[] ^ b[]
SUB $8, R6 // n = n - 8
MOVD R16, (R3)(R8) // Store to dst
ADD $8, R8
// At least 4 bytes remain: XOR one word
xor4:
CMP R6, $4
BLT xor2
MOVWZ (R4)(R8), R14 // R14 = a[i,...,i+3], zero-extended
MOVWZ (R5)(R8), R15 // R15 = b[i,...,i+3], zero-extended
XOR R14, R15, R16 // R16 = a[] ^ b[]
MOVW R16, (R3)(R8) // Store 4 bytes to dst
ADD $4,R8
ADD $-4,R6
// At least 2 bytes remain: XOR one halfword
xor2:
CMP R6, $2
BLT xor1
MOVHZ (R4)(R8), R14 // R14 = a[i,i+1], zero-extended
MOVHZ (R5)(R8), R15 // R15 = b[i,i+1], zero-extended
XOR R14, R15, R16 // R16 = a[] ^ b[]
MOVH R16, (R3)(R8) // Store 2 bytes to dst
ADD $2,R8
ADD $-2,R6
// At most one byte remains: return if none, otherwise XOR the last byte
xor1:
CMP R6, R0
BC 12,2,LR // BEQLR
MOVBZ (R4)(R8), R14 // R14 = a[i]
MOVBZ (R5)(R8), R15 // R15 = b[i]
XOR R14, R15, R16 // R16 = a[i] ^ b[i]
MOVB R16, (R3)(R8) // Store to dst
#endif
// No branch in this function targets done; it is reached by fall-through
// from the generic tail after the final byte has been stored.
done:
RET