diff options
Diffstat (limited to '')
-rw-r--r-- | lib/cryptopp/rijndael.cpp | 1261 |
1 files changed, 0 insertions, 1261 deletions
diff --git a/lib/cryptopp/rijndael.cpp b/lib/cryptopp/rijndael.cpp deleted file mode 100644 index c185032cf..000000000 --- a/lib/cryptopp/rijndael.cpp +++ /dev/null @@ -1,1261 +0,0 @@ -// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu> -// and Wei Dai from Paulo Baretto's Rijndael implementation -// The original code and all modifications are in the public domain. - -// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code - -/* -July 2010: Added support for AES-NI instructions via compiler intrinsics. -*/ - -/* -Feb 2009: The x86/x64 assembly code was rewritten in by Wei Dai to do counter mode -caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein -and Peter Schwabe in their paper "New AES software speed records". The round -function was also modified to include a trick similar to one in Brian Gladman's -x86 assembly code, doing an 8-bit register move to minimize the number of -register spills. Also switched to compressed tables and copying round keys to -the stack. - -The C++ implementation now uses compressed tables if -CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined. -*/ - -/* -July 2006: Defense against timing attacks was added in by Wei Dai. - -The code now uses smaller tables in the first and last rounds, -and preloads them into L1 cache before usage (by loading at least -one element in each cache line). - -We try to delay subsequent accesses to each table (used in the first -and last rounds) until all of the table has been preloaded. Hopefully -the compiler isn't smart enough to optimize that code away. - -After preloading the table, we also try not to access any memory location -other than the table and the stack, in order to prevent table entries from -being unloaded from L1 cache, until that round is finished. -(Some popular CPUs have 2-way associative caches.) -*/ - -// This is the original introductory comment: - -/** - * version 3.0 (December 2000) - * - * Optimised ANSI C code for the Rijndael cipher (now AES) - * - * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be> - * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be> - * author Paulo Barreto <paulo.barreto@terra.com.br> - * - * This code is hereby placed in the public domain. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS - * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE - * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, - * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "pch.h" - -#ifndef CRYPTOPP_IMPORTS -#ifndef CRYPTOPP_GENERATE_X64_MASM - -#include "rijndael.h" -#include "misc.h" -#include "cpu.h" - -NAMESPACE_BEGIN(CryptoPP) - -#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) -namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];} -using namespace rdtable; -#else -static word64 Te[256]; -#endif -static word64 Td[256]; -#else -static word32 Te[256*4], Td[256*4]; -#endif -static volatile bool s_TeFilled = false, s_TdFilled = false; - -// ************************* Portable Code ************************************ - -#define QUARTER_ROUND(L, T, t, a, b, c, d) \ - a ^= L(T, 3, byte(t)); t >>= 8;\ - b ^= L(T, 2, byte(t)); t >>= 8;\ - c ^= L(T, 1, byte(t)); t >>= 8;\ - d ^= L(T, 0, t); - -#define QUARTER_ROUND_LE(t, a, b, c, d) \ - tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ - tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ - tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\ - tempBlock[d] = ((byte *)(Te+t))[1]; - -#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - #define QUARTER_ROUND_LD(t, a, b, c, d) \ - tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\ - tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\ - tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\ - tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7]; -#else - #define QUARTER_ROUND_LD(t, a, b, c, d) \ - tempBlock[a] = Sd[byte(t)]; t >>= 8;\ - tempBlock[b] = Sd[byte(t)]; t >>= 8;\ - tempBlock[c] = Sd[byte(t)]; t >>= 8;\ - tempBlock[d] = Sd[t]; -#endif - -#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d) -#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d) - -#ifdef IS_LITTLE_ENDIAN - #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a) - #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a) - #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1)) - #define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1)) - #else - #define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8) - #define TL_M(T, i, x) T[i*256 + x] - #endif -#else - #define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d) - #define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d) - #ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - #define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4)) - #define TL_M TL_F - #else - #define TL_F(T, i, x) rotrFixed(T[x], i*8) - #define TL_M(T, i, x) T[i*256 + x] - #endif -#endif - - -#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b)) -#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b)) -#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b)) - -#define f3(x) (f2(x) ^ x) -#define f9(x) (f8(x) ^ x) -#define fb(x) (f8(x) ^ f2(x) ^ x) -#define fd(x) (f8(x) ^ f4(x) ^ x) -#define fe(x) (f8(x) ^ f4(x) ^ f2(x)) - -void Rijndael::Base::FillEncTable() -{ - for (int i=0; i<256; i++) - { - byte x = Se[i]; -#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24; - Te[i] = word64(y | f3(x))<<32 | y; -#else - word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24; - for (int j=0; j<4; j++) - { - Te[i+j*256] = y; - y = rotrFixed(y, 8); - } -#endif - } -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) - Te[256] = Te[257] = 0; -#endif - s_TeFilled = true; -} - -void Rijndael::Base::FillDecTable() -{ - for (int i=0; i<256; i++) - { - byte x = Sd[i]; -#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24; - Td[i] = word64(y | fb(x))<<32 | y | x; -#else - word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;; - for (int j=0; j<4; j++) - { - Td[i+j*256] = y; - y = rotrFixed(y, 8); - } -#endif - } - s_TdFilled = true; -} - -void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &) -{ - AssertValidKeyLength(keylen); - - m_rounds = keylen/4 + 6; - m_key.New(4*(m_rounds+1)); - - word32 *rk = m_key; - -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86) - // MSVC 2008 SP1 generates bad code for _mm_extract_epi32() when compiling for X64 - if (HasAESNI()) - { - static const word32 rcLE[] = { - 0x01, 0x02, 0x04, 0x08, - 0x10, 0x20, 0x40, 0x80, - 0x1B, 0x36, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ - }; - const word32 *rc = rcLE; - - __m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16)); - memcpy(rk, userKey, keylen); - - while (true) - { - rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++); - rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; - rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; - rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; - - if (rk + keylen/4 + 4 == m_key.end()) - break; - - if (keylen == 24) - { - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; - temp = _mm_insert_epi32(temp, rk[11], 3); - } - else if (keylen == 32) - { - temp = _mm_insert_epi32(temp, rk[11], 3); - rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2); - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ rk[14]; - temp = _mm_insert_epi32(temp, rk[15], 3); - } - else - temp = _mm_insert_epi32(temp, rk[7], 3); - - rk += keylen/4; - } - - if (!IsForwardTransformation()) - { - rk = m_key; - unsigned int i, j; - - std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds)); - - for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4) - { - temp = _mm_aesimc_si128(*(__m128i *)(rk+i)); - *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j)); - *(__m128i *)(rk+j) = temp; - } - - *(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i)); - } - - return; - } -#endif - - GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen); - const word32 *rc = rcon; - word32 temp; - - while (true) - { - temp = rk[keylen/4-1]; - word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)]; - rk[keylen/4] = rk[0] ^ x ^ *(rc++); - rk[keylen/4+1] = rk[1] ^ rk[keylen/4]; - rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1]; - rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2]; - - if (rk + keylen/4 + 4 == m_key.end()) - break; - - if (keylen == 24) - { - rk[10] = rk[ 4] ^ rk[ 9]; - rk[11] = rk[ 5] ^ rk[10]; - } - else if (keylen == 32) - { - temp = rk[11]; - rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)]; - rk[13] = rk[ 5] ^ rk[12]; - rk[14] = rk[ 6] ^ rk[13]; - rk[15] = rk[ 7] ^ rk[14]; - } - rk += keylen/4; - } - - rk = m_key; - - if (IsForwardTransformation()) - { - if (!s_TeFilled) - FillEncTable(); - - ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16); - ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16); - } - else - { - if (!s_TdFilled) - FillDecTable(); - - unsigned int i, j; - -#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)]) - - for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4) - { - temp = InverseMixColumn(rk[i ]); rk[i ] = InverseMixColumn(rk[j ]); rk[j ] = temp; - temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp; - temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp; - temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp; - } - - rk[i+0] = InverseMixColumn(rk[i+0]); - rk[i+1] = InverseMixColumn(rk[i+1]); - rk[i+2] = InverseMixColumn(rk[i+2]); - rk[i+3] = InverseMixColumn(rk[i+3]); - - temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp; - temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp; - temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp; - temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp; - } - -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE - if (HasAESNI()) - ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16); -#endif -} - -void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const -{ -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) - if (HasSSE2()) -#else - if (HasAESNI()) -#endif - { - Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); - return; - } -#endif - - typedef BlockGetAndPut<word32, NativeByteOrder> Block; - - word32 s0, s1, s2, s3, t0, t1, t2, t3; - Block::Get(inBlock)(s0)(s1)(s2)(s3); - - const word32 *rk = m_key; - s0 ^= rk[0]; - s1 ^= rk[1]; - s2 ^= rk[2]; - s3 ^= rk[3]; - t0 = rk[4]; - t1 = rk[5]; - t2 = rk[6]; - t3 = rk[7]; - rk += 8; - - // timing attack countermeasure. see comments at top for more details - const int cacheLineSize = GetCacheLineSize(); - unsigned int i; - word32 u = 0; -#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - for (i=0; i<2048; i+=cacheLineSize) -#else - for (i=0; i<1024; i+=cacheLineSize) -#endif - u &= *(const word32 *)(((const byte *)Te)+i); - u &= Te[255]; - s0 |= u; s1 |= u; s2 |= u; s3 |= u; - - QUARTER_ROUND_FE(s3, t0, t1, t2, t3) - QUARTER_ROUND_FE(s2, t3, t0, t1, t2) - QUARTER_ROUND_FE(s1, t2, t3, t0, t1) - QUARTER_ROUND_FE(s0, t1, t2, t3, t0) - - // Nr - 2 full rounds: - unsigned int r = m_rounds/2 - 1; - do - { - s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; - - QUARTER_ROUND_E(t3, s0, s1, s2, s3) - QUARTER_ROUND_E(t2, s3, s0, s1, s2) - QUARTER_ROUND_E(t1, s2, s3, s0, s1) - QUARTER_ROUND_E(t0, s1, s2, s3, s0) - - t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; - - QUARTER_ROUND_E(s3, t0, t1, t2, t3) - QUARTER_ROUND_E(s2, t3, t0, t1, t2) - QUARTER_ROUND_E(s1, t2, t3, t0, t1) - QUARTER_ROUND_E(s0, t1, t2, t3, t0) - - rk += 8; - } while (--r); - - word32 tbw[4]; - byte *const tempBlock = (byte *)tbw; - - QUARTER_ROUND_LE(t2, 15, 2, 5, 8) - QUARTER_ROUND_LE(t1, 11, 14, 1, 4) - QUARTER_ROUND_LE(t0, 7, 10, 13, 0) - QUARTER_ROUND_LE(t3, 3, 6, 9, 12) - - Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]); -} - -void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const -{ -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE - if (HasAESNI()) - { - Rijndael::Dec::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0); - return; - } -#endif - - typedef BlockGetAndPut<word32, NativeByteOrder> Block; - - word32 s0, s1, s2, s3, t0, t1, t2, t3; - Block::Get(inBlock)(s0)(s1)(s2)(s3); - - const word32 *rk = m_key; - s0 ^= rk[0]; - s1 ^= rk[1]; - s2 ^= rk[2]; - s3 ^= rk[3]; - t0 = rk[4]; - t1 = rk[5]; - t2 = rk[6]; - t3 = rk[7]; - rk += 8; - - // timing attack countermeasure. see comments at top for more details - const int cacheLineSize = GetCacheLineSize(); - unsigned int i; - word32 u = 0; -#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - for (i=0; i<2048; i+=cacheLineSize) -#else - for (i=0; i<1024; i+=cacheLineSize) -#endif - u &= *(const word32 *)(((const byte *)Td)+i); - u &= Td[255]; - s0 |= u; s1 |= u; s2 |= u; s3 |= u; - - QUARTER_ROUND_FD(s3, t2, t1, t0, t3) - QUARTER_ROUND_FD(s2, t1, t0, t3, t2) - QUARTER_ROUND_FD(s1, t0, t3, t2, t1) - QUARTER_ROUND_FD(s0, t3, t2, t1, t0) - - // Nr - 2 full rounds: - unsigned int r = m_rounds/2 - 1; - do - { - s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; - - QUARTER_ROUND_D(t3, s2, s1, s0, s3) - QUARTER_ROUND_D(t2, s1, s0, s3, s2) - QUARTER_ROUND_D(t1, s0, s3, s2, s1) - QUARTER_ROUND_D(t0, s3, s2, s1, s0) - - t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7]; - - QUARTER_ROUND_D(s3, t2, t1, t0, t3) - QUARTER_ROUND_D(s2, t1, t0, t3, t2) - QUARTER_ROUND_D(s1, t0, t3, t2, t1) - QUARTER_ROUND_D(s0, t3, t2, t1, t0) - - rk += 8; - } while (--r); - -#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS - // timing attack countermeasure. see comments at top for more details - // If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined, - // QUARTER_ROUND_LD will use Td, which is already preloaded. - u = 0; - for (i=0; i<256; i+=cacheLineSize) - u &= *(const word32 *)(Sd+i); - u &= *(const word32 *)(Sd+252); - t0 |= u; t1 |= u; t2 |= u; t3 |= u; -#endif - - word32 tbw[4]; - byte *const tempBlock = (byte *)tbw; - - QUARTER_ROUND_LD(t2, 7, 2, 13, 8) - QUARTER_ROUND_LD(t1, 3, 14, 9, 4) - QUARTER_ROUND_LD(t0, 15, 10, 5, 0) - QUARTER_ROUND_LD(t3, 11, 6, 1, 12) - - Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]); -} - -// ************************* Assembly Code ************************************ - -#pragma warning(disable: 4731) // frame pointer register 'ebp' modified by inline assembly code - -#endif // #ifndef CRYPTOPP_GENERATE_X64_MASM - -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE - -CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k) -{ -#if CRYPTOPP_BOOL_X86 - -#define L_REG esp -#define L_INDEX(i) (L_REG+768+i) -#define L_INXORBLOCKS L_INBLOCKS+4 -#define L_OUTXORBLOCKS L_INBLOCKS+8 -#define L_OUTBLOCKS L_INBLOCKS+12 -#define L_INCREMENTS L_INDEX(16*15) -#define L_SP L_INDEX(16*16) -#define L_LENGTH L_INDEX(16*16+4) -#define L_KEYS_BEGIN L_INDEX(16*16+8) - -#define MOVD movd -#define MM(i) mm##i - -#define MXOR(a,b,c) \ - AS2( movzx esi, b)\ - AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ - AS2( pxor MM(a), mm7)\ - -#define MMOV(a,b,c) \ - AS2( movzx esi, b)\ - AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ - -#else - -#define L_REG r8 -#define L_INDEX(i) (L_REG+i) -#define L_INXORBLOCKS L_INBLOCKS+8 -#define L_OUTXORBLOCKS L_INBLOCKS+16 -#define L_OUTBLOCKS L_INBLOCKS+24 -#define L_INCREMENTS L_INDEX(16*16) -#define L_LENGTH L_INDEX(16*18+8) -#define L_KEYS_BEGIN L_INDEX(16*19) - -#define MOVD mov -#define MM_0 r9d -#define MM_1 r12d -#ifdef __GNUC__ -#define MM_2 r11d -#else -#define MM_2 r10d -#endif -#define MM(i) MM_##i - -#define MXOR(a,b,c) \ - AS2( movzx esi, b)\ - AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ - -#define MMOV(a,b,c) \ - AS2( movzx esi, b)\ - AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ - -#endif - -#define L_SUBKEYS L_INDEX(0) -#define L_SAVED_X L_SUBKEYS -#define L_KEY12 L_INDEX(16*12) -#define L_LASTROUND L_INDEX(16*13) -#define L_INBLOCKS L_INDEX(16*14) -#define MAP0TO4(i) (ASM_MOD(i+3,4)+1) - -#define XOR(a,b,c) \ - AS2( movzx esi, b)\ - AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ - -#define MOV(a,b,c) \ - AS2( movzx esi, b)\ - AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\ - -#ifdef CRYPTOPP_GENERATE_X64_MASM - ALIGN 8 - Rijndael_Enc_AdvancedProcessBlocks PROC FRAME - rex_push_reg rsi - push_reg rdi - push_reg rbx - push_reg r12 - .endprolog - mov L_REG, rcx - mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA - mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA] -#elif defined(__GNUC__) - __asm__ __volatile__ - ( - ".intel_syntax noprefix;" - #if CRYPTOPP_BOOL_X64 - AS2( mov L_REG, rcx) - #endif - AS_PUSH_IF86(bx) - AS_PUSH_IF86(bp) - AS2( mov AS_REG_7, WORD_REG(si)) -#else - AS_PUSH_IF86(si) - AS_PUSH_IF86(di) - AS_PUSH_IF86(bx) - AS_PUSH_IF86(bp) - AS2( lea AS_REG_7, [Te]) - AS2( mov edi, [g_cacheLineSize]) -#endif - -#if CRYPTOPP_BOOL_X86 - AS2( mov [ecx+16*12+16*4], esp) // save esp to L_SP - AS2( lea esp, [ecx-768]) -#endif - - // copy subkeys to stack - AS2( mov WORD_REG(si), [L_KEYS_BEGIN]) - AS2( mov WORD_REG(ax), 16) - AS2( and WORD_REG(ax), WORD_REG(si)) - AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)]) // subkey 1 (non-counter) or 2 (counter) - AS2( movdqa [L_KEY12], xmm3) - AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16]) - AS2( sub WORD_REG(ax), WORD_REG(si)) - ASL(0) - AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)]) - AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0) - AS2( add WORD_REG(si), 16) - AS2( cmp WORD_REG(si), 16*12) - ASJ( jl, 0, b) - - // read subkeys 0, 1 and last - AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)]) // last subkey - AS2( movdqa xmm1, [WORD_REG(dx)]) // subkey 0 - AS2( MOVD MM(1), [WORD_REG(dx)+4*4]) // 0,1,2,3 - AS2( mov ebx, [WORD_REG(dx)+5*4]) // 4,5,6,7 - AS2( mov ecx, [WORD_REG(dx)+6*4]) // 8,9,10,11 - AS2( mov edx, [WORD_REG(dx)+7*4]) // 12,13,14,15 - - // load table into cache - AS2( xor WORD_REG(ax), WORD_REG(ax)) - ASL(9) - AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) - AS2( add WORD_REG(ax), WORD_REG(di)) - AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) - AS2( add WORD_REG(ax), WORD_REG(di)) - AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) - AS2( add WORD_REG(ax), WORD_REG(di)) - AS2( mov esi, [AS_REG_7+WORD_REG(ax)]) - AS2( add WORD_REG(ax), WORD_REG(di)) - AS2( cmp WORD_REG(ax), 2048) - ASJ( jl, 9, b) - AS1( lfence) - - AS2( test DWORD PTR [L_LENGTH], 1) - ASJ( jz, 8, f) - - // counter mode one-time setup - AS2( mov WORD_REG(si), [L_INBLOCKS]) - AS2( movdqu xmm2, [WORD_REG(si)]) // counter - AS2( pxor xmm2, xmm1) - AS2( psrldq xmm1, 14) - AS2( movd eax, xmm1) - AS2( mov al, BYTE PTR [WORD_REG(si)+15]) - AS2( MOVD MM(2), eax) -#if CRYPTOPP_BOOL_X86 - AS2( mov eax, 1) - AS2( movd mm3, eax) -#endif - - // partial first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: mm1, ebx, ecx, edx - AS2( movd eax, xmm2) - AS2( psrldq xmm2, 4) - AS2( movd edi, xmm2) - AS2( psrldq xmm2, 4) - MXOR( 1, al, 0) // 0 - XOR( edx, ah, 1) // 1 - AS2( shr eax, 16) - XOR( ecx, al, 2) // 2 - XOR( ebx, ah, 3) // 3 - AS2( mov eax, edi) - AS2( movd edi, xmm2) - AS2( psrldq xmm2, 4) - XOR( ebx, al, 0) // 4 - MXOR( 1, ah, 1) // 5 - AS2( shr eax, 16) - XOR( edx, al, 2) // 6 - XOR( ecx, ah, 3) // 7 - AS2( mov eax, edi) - AS2( movd edi, xmm2) - XOR( ecx, al, 0) // 8 - XOR( ebx, ah, 1) // 9 - AS2( shr eax, 16) - MXOR( 1, al, 2) // 10 - XOR( edx, ah, 3) // 11 - AS2( mov eax, edi) - XOR( edx, al, 0) // 12 - XOR( ecx, ah, 1) // 13 - AS2( shr eax, 16) - XOR( ebx, al, 2) // 14 - AS2( psrldq xmm2, 3) - - // partial second round, in: ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15), out: eax, ebx, edi, mm0 - AS2( mov eax, [L_KEY12+0*4]) - AS2( mov edi, [L_KEY12+2*4]) - AS2( MOVD MM(0), [L_KEY12+3*4]) - MXOR( 0, cl, 3) /* 11 */ - XOR( edi, bl, 3) /* 7 */ - MXOR( 0, bh, 2) /* 6 */ - AS2( shr ebx, 16) /* 4,5 */ - XOR( eax, bl, 1) /* 5 */ - MOV( ebx, bh, 0) /* 4 */ - AS2( xor ebx, [L_KEY12+1*4]) - XOR( eax, ch, 2) /* 10 */ - AS2( shr ecx, 16) /* 8,9 */ - XOR( eax, dl, 3) /* 15 */ - XOR( ebx, dh, 2) /* 14 */ - AS2( shr edx, 16) /* 12,13 */ - XOR( edi, ch, 0) /* 8 */ - XOR( ebx, cl, 1) /* 9 */ - XOR( edi, dl, 1) /* 13 */ - MXOR( 0, dh, 0) /* 12 */ - - AS2( movd ecx, xmm2) - AS2( MOVD edx, MM(1)) - AS2( MOVD [L_SAVED_X+3*4], MM(0)) - AS2( mov [L_SAVED_X+0*4], eax) - AS2( mov [L_SAVED_X+1*4], ebx) - AS2( mov [L_SAVED_X+2*4], edi) - ASJ( jmp, 5, f) - - ASL(3) - // non-counter mode per-block setup - AS2( MOVD MM(1), [L_KEY12+0*4]) // 0,1,2,3 - AS2( mov ebx, [L_KEY12+1*4]) // 4,5,6,7 - AS2( mov ecx, [L_KEY12+2*4]) // 8,9,10,11 - AS2( mov edx, [L_KEY12+3*4]) // 12,13,14,15 - ASL(8) - AS2( mov WORD_REG(ax), [L_INBLOCKS]) - AS2( movdqu xmm2, [WORD_REG(ax)]) - AS2( mov WORD_REG(si), [L_INXORBLOCKS]) - AS2( movdqu xmm5, [WORD_REG(si)]) - AS2( pxor xmm2, xmm1) - AS2( pxor xmm2, xmm5) - - // first round, in: xmm2(15,14,13,12;11,10,9,8;7,6,5,4;3,2,1,0), out: eax, ebx, ecx, edx - AS2( movd eax, xmm2) - AS2( psrldq xmm2, 4) - AS2( movd edi, xmm2) - AS2( psrldq xmm2, 4) - MXOR( 1, al, 0) // 0 - XOR( edx, ah, 1) // 1 - AS2( shr eax, 16) - XOR( ecx, al, 2) // 2 - XOR( ebx, ah, 3) // 3 - AS2( mov eax, edi) - AS2( movd edi, xmm2) - AS2( psrldq xmm2, 4) - XOR( ebx, al, 0) // 4 - MXOR( 1, ah, 1) // 5 - AS2( shr eax, 16) - XOR( edx, al, 2) // 6 - XOR( ecx, ah, 3) // 7 - AS2( mov eax, edi) - AS2( movd edi, xmm2) - XOR( ecx, al, 0) // 8 - XOR( ebx, ah, 1) // 9 - AS2( shr eax, 16) - MXOR( 1, al, 2) // 10 - XOR( edx, ah, 3) // 11 - AS2( mov eax, edi) - XOR( edx, al, 0) // 12 - XOR( ecx, ah, 1) // 13 - AS2( shr eax, 16) - XOR( ebx, al, 2) // 14 - MXOR( 1, ah, 3) // 15 - AS2( MOVD eax, MM(1)) - - AS2( add L_REG, [L_KEYS_BEGIN]) - AS2( add L_REG, 4*16) - ASJ( jmp, 2, f) - - ASL(1) - // counter-mode per-block setup - AS2( MOVD ecx, MM(2)) - AS2( MOVD edx, MM(1)) - AS2( mov eax, [L_SAVED_X+0*4]) - AS2( mov ebx, [L_SAVED_X+1*4]) - AS2( xor cl, ch) - AS2( and WORD_REG(cx), 255) - ASL(5) -#if CRYPTOPP_BOOL_X86 - AS2( paddb MM(2), mm3) -#else - AS2( add MM(2), 1) -#endif - // remaining part of second round, in: edx(previous round),esi(keyed counter byte) eax,ebx,[L_SAVED_X+2*4],[L_SAVED_X+3*4], out: eax,ebx,ecx,edx - AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3]) - XOR( ebx, dl, 3) - MOV( ecx, dh, 2) - AS2( shr edx, 16) - AS2( xor ecx, [L_SAVED_X+2*4]) - XOR( eax, dh, 0) - MOV( edx, dl, 1) - AS2( xor edx, [L_SAVED_X+3*4]) - - AS2( add L_REG, [L_KEYS_BEGIN]) - AS2( add L_REG, 3*16) - ASJ( jmp, 4, f) - -// in: eax(0,1,2,3), ebx(4,5,6,7), ecx(8,9,10,11), edx(12,13,14,15) -// out: eax, ebx, edi, mm0 -#define ROUND() \ - MXOR( 0, cl, 3) /* 11 */\ - AS2( mov cl, al) /* 8,9,10,3 */\ - XOR( edi, ah, 2) /* 2 */\ - AS2( shr eax, 16) /* 0,1 */\ - XOR( edi, bl, 3) /* 7 */\ - MXOR( 0, bh, 2) /* 6 */\ - AS2( shr ebx, 16) /* 4,5 */\ - MXOR( 0, al, 1) /* 1 */\ - MOV( eax, ah, 0) /* 0 */\ - XOR( eax, bl, 1) /* 5 */\ - MOV( ebx, bh, 0) /* 4 */\ - XOR( eax, ch, 2) /* 10 */\ - XOR( ebx, cl, 3) /* 3 */\ - AS2( shr ecx, 16) /* 8,9 */\ - XOR( eax, dl, 3) /* 15 */\ - XOR( ebx, dh, 2) /* 14 */\ - AS2( shr edx, 16) /* 12,13 */\ - XOR( edi, ch, 0) /* 8 */\ - XOR( ebx, cl, 1) /* 9 */\ - XOR( edi, dl, 1) /* 13 */\ - MXOR( 0, dh, 0) /* 12 */\ - - ASL(2) // 2-round loop - AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4]) - AS2( mov edi, [L_SUBKEYS-4*16+2*4]) - ROUND() - AS2( mov ecx, edi) - AS2( xor eax, [L_SUBKEYS-4*16+0*4]) - AS2( xor ebx, [L_SUBKEYS-4*16+1*4]) - AS2( MOVD edx, MM(0)) - - ASL(4) - AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4]) - AS2( mov edi, [L_SUBKEYS-4*16+6*4]) - ROUND() - AS2( mov ecx, edi) - AS2( xor eax, [L_SUBKEYS-4*16+4*4]) - AS2( xor ebx, [L_SUBKEYS-4*16+5*4]) - AS2( MOVD edx, MM(0)) - - AS2( add L_REG, 32) - AS2( test L_REG, 255) - ASJ( jnz, 2, b) - AS2( sub L_REG, 16*16) - -#define LAST(a, b, c) \ - AS2( movzx esi, a )\ - AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\ - AS2( movzx esi, b )\ - AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\ - AS2( mov WORD PTR [L_LASTROUND+c], di )\ - - // last round - LAST(ch, dl, 2) - LAST(dh, al, 6) - AS2( shr edx, 16) - LAST(ah, bl, 10) - AS2( shr eax, 16) - LAST(bh, cl, 14) - AS2( shr ebx, 16) - LAST(dh, al, 12) - AS2( shr ecx, 16) - LAST(ah, bl, 0) - LAST(bh, cl, 4) - LAST(ch, dl, 8) - - AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS]) - AS2( mov WORD_REG(bx), [L_OUTBLOCKS]) - - AS2( mov WORD_REG(cx), [L_LENGTH]) - AS2( sub WORD_REG(cx), 16) - - AS2( movdqu xmm2, [WORD_REG(ax)]) - AS2( pxor xmm2, xmm4) - -#if CRYPTOPP_BOOL_X86 - AS2( movdqa xmm0, [L_INCREMENTS]) - AS2( paddd xmm0, [L_INBLOCKS]) - AS2( movdqa [L_INBLOCKS], xmm0) -#else - AS2( movdqa xmm0, [L_INCREMENTS+16]) - AS2( paddq xmm0, [L_INBLOCKS+16]) - AS2( movdqa [L_INBLOCKS+16], xmm0) -#endif - - AS2( pxor xmm2, [L_LASTROUND]) - AS2( movdqu [WORD_REG(bx)], xmm2) - - ASJ( jle, 7, f) - AS2( mov [L_LENGTH], WORD_REG(cx)) - AS2( test WORD_REG(cx), 1) - ASJ( jnz, 1, b) -#if CRYPTOPP_BOOL_X64 - AS2( movdqa xmm0, [L_INCREMENTS]) - AS2( paddq xmm0, [L_INBLOCKS]) - AS2( movdqa [L_INBLOCKS], xmm0) -#endif - ASJ( jmp, 3, b) - - ASL(7) - // erase keys on stack - AS2( xorps xmm0, xmm0) - AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16]) - AS2( movaps [WORD_REG(ax)-7*16], xmm0) - AS2( movaps [WORD_REG(ax)-6*16], xmm0) - AS2( movaps [WORD_REG(ax)-5*16], xmm0) - AS2( movaps [WORD_REG(ax)-4*16], xmm0) - AS2( movaps [WORD_REG(ax)-3*16], xmm0) - AS2( movaps [WORD_REG(ax)-2*16], xmm0) - AS2( movaps [WORD_REG(ax)-1*16], xmm0) - AS2( movaps [WORD_REG(ax)+0*16], xmm0) - AS2( movaps [WORD_REG(ax)+1*16], xmm0) - AS2( movaps [WORD_REG(ax)+2*16], xmm0) - AS2( movaps [WORD_REG(ax)+3*16], xmm0) - AS2( movaps [WORD_REG(ax)+4*16], xmm0) - AS2( movaps [WORD_REG(ax)+5*16], xmm0) - AS2( movaps [WORD_REG(ax)+6*16], xmm0) -#if CRYPTOPP_BOOL_X86 - AS2( mov esp, [L_SP]) - AS1( emms) -#endif - AS_POP_IF86(bp) - AS_POP_IF86(bx) -#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86 - AS_POP_IF86(di) - AS_POP_IF86(si) - AS1(ret) -#endif -#ifdef CRYPTOPP_GENERATE_X64_MASM - pop r12 - pop rbx - pop rdi - pop rsi - ret - Rijndael_Enc_AdvancedProcessBlocks ENDP -#endif -#ifdef __GNUC__ - ".att_syntax prefix;" - : - : "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize) - : "memory", "cc", "%eax" - #if CRYPTOPP_BOOL_X64 - , "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12" - #endif - ); -#endif -} - -#endif - -#ifndef CRYPTOPP_GENERATE_X64_MASM - -#ifdef CRYPTOPP_X64_MASM_AVAILABLE -extern "C" { -void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k); -} -#endif - -#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86 - -static inline bool AliasedWithTable(const byte *begin, const byte *end) -{ - size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096; - size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096; - if (t1 > t0) - return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1); - else - return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0); -} - -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE - -inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds) -{ - block = _mm_xor_si128(block, subkeys[0]); - for (unsigned int i=1; i<rounds-1; i+=2) - { - block = _mm_aesenc_si128(block, subkeys[i]); - block = _mm_aesenc_si128(block, subkeys[i+1]); - } - block = _mm_aesenc_si128(block, subkeys[rounds-1]); - block = _mm_aesenclast_si128(block, subkeys[rounds]); -} - -inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds) -{ - __m128i rk = subkeys[0]; - block0 = _mm_xor_si128(block0, rk); - block1 = _mm_xor_si128(block1, rk); - block2 = _mm_xor_si128(block2, rk); - block3 = _mm_xor_si128(block3, rk); - for (unsigned int i=1; i<rounds; i++) - { - rk = subkeys[i]; - block0 = _mm_aesenc_si128(block0, rk); - block1 = _mm_aesenc_si128(block1, rk); - block2 = _mm_aesenc_si128(block2, rk); - block3 = _mm_aesenc_si128(block3, rk); - } - rk = subkeys[rounds]; - block0 = _mm_aesenclast_si128(block0, rk); - block1 = _mm_aesenclast_si128(block1, rk); - block2 = _mm_aesenclast_si128(block2, rk); - block3 = _mm_aesenclast_si128(block3, rk); -} - -inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds) -{ - block = _mm_xor_si128(block, subkeys[0]); - for (unsigned int i=1; i<rounds-1; i+=2) - { - block = _mm_aesdec_si128(block, subkeys[i]); - block = _mm_aesdec_si128(block, subkeys[i+1]); - } - block = _mm_aesdec_si128(block, subkeys[rounds-1]); - block = _mm_aesdeclast_si128(block, subkeys[rounds]); -} - -inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds) -{ - __m128i rk = subkeys[0]; - block0 = _mm_xor_si128(block0, rk); - block1 = _mm_xor_si128(block1, rk); - block2 = _mm_xor_si128(block2, rk); - block3 = _mm_xor_si128(block3, rk); - for (unsigned int i=1; i<rounds; i++) - { - rk = subkeys[i]; - block0 = _mm_aesdec_si128(block0, rk); - block1 = _mm_aesdec_si128(block1, rk); - block2 = _mm_aesdec_si128(block2, rk); - block3 = _mm_aesdec_si128(block3, rk); - } - rk = subkeys[rounds]; - block0 = _mm_aesdeclast_si128(block0, rk); - block1 = _mm_aesdeclast_si128(block1, rk); - block2 = _mm_aesdeclast_si128(block2, rk); - block3 = _mm_aesdeclast_si128(block3, rk); -} - -static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24}; - -template <typename F1, typename F4> -inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) -{ - size_t blockSize = 16; - size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize; - size_t xorIncrement = xorBlocks ? blockSize : 0; - size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize; - - if (flags & BlockTransformation::BT_ReverseDirection) - { - assert(length % blockSize == 0); - inBlocks += length - blockSize; - xorBlocks += length - blockSize; - outBlocks += length - blockSize; - inIncrement = 0-inIncrement; - xorIncrement = 0-xorIncrement; - outIncrement = 0-outIncrement; - } - - if (flags & BlockTransformation::BT_AllowParallel) - { - while (length >= 4*blockSize) - { - __m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3; - if (flags & BlockTransformation::BT_InBlockIsCounter) - { - const __m128i be1 = *(const __m128i *)s_one; - block1 = _mm_add_epi32(block0, be1); - block2 = _mm_add_epi32(block1, be1); - block3 = _mm_add_epi32(block2, be1); - _mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1)); - } - else - { - inBlocks += inIncrement; - block1 = _mm_loadu_si128((const __m128i *)inBlocks); - inBlocks += inIncrement; - block2 = _mm_loadu_si128((const __m128i *)inBlocks); - inBlocks += inIncrement; - block3 = _mm_loadu_si128((const __m128i *)inBlocks); - inBlocks += inIncrement; - } - - if (flags & BlockTransformation::BT_XorInput) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - } - - func4(block0, block1, block2, block3, subkeys, rounds); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - { - block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks)); - xorBlocks += xorIncrement; - } - - _mm_storeu_si128((__m128i *)outBlocks, block0); - outBlocks += outIncrement; - _mm_storeu_si128((__m128i *)outBlocks, block1); - outBlocks += outIncrement; - _mm_storeu_si128((__m128i *)outBlocks, block2); - outBlocks += outIncrement; - _mm_storeu_si128((__m128i *)outBlocks, block3); - outBlocks += outIncrement; - - length -= 4*blockSize; - } - } - - while (length >= blockSize) - { - __m128i block = _mm_loadu_si128((const __m128i *)inBlocks); - - if (flags & BlockTransformation::BT_XorInput) - block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks)); - - if (flags & BlockTransformation::BT_InBlockIsCounter) - const_cast<byte *>(inBlocks)[15]++; - - func1(block, subkeys, rounds); - - if (xorBlocks && !(flags & BlockTransformation::BT_XorInput)) - block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks)); - - _mm_storeu_si128((__m128i *)outBlocks, block); - - inBlocks += inIncrement; - outBlocks += outIncrement; - xorBlocks += xorIncrement; - length -= blockSize; - } - - return length; -} -#endif - -size_t Rijndael::Enc::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const -{ -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE - if (HasAESNI()) - return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); -#endif - -#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) - if (HasSSE2()) - { - if (length < BLOCKSIZE) - return length; - - struct Locals - { - word32 subkeys[4*12], workspace[8]; - const byte *inBlocks, *inXorBlocks, *outXorBlocks; - byte *outBlocks; - size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement; - size_t regSpill, lengthAndCounterFlag, keysBegin; - }; - - size_t increment = BLOCKSIZE; - const byte* zeros = (byte *)(Te+256); - byte *space; - - do { - space = (byte *)alloca(255+sizeof(Locals)); - space += (256-(size_t)space%256)%256; - } - while (AliasedWithTable(space, space+sizeof(Locals))); - - if (flags & BT_ReverseDirection) - { - assert(length % BLOCKSIZE == 0); - inBlocks += length - BLOCKSIZE; - xorBlocks += length - BLOCKSIZE; - outBlocks += length - BLOCKSIZE; - increment = 0-increment; - } - - Locals &locals = *(Locals *)space; - - locals.inBlocks = inBlocks; - locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros; - locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks; - locals.outBlocks = outBlocks; - - locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment; - locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0; - locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment; - locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment; - - locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter); - int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2); - locals.keysBegin = (12-keysToCopy)*16; - - Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key); - return length % BLOCKSIZE; - } -#endif - - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); -} - -#endif - -#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE - -size_t Rijndael::Dec::AdvancedProcessBlocks(const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags) const -{ - if (HasAESNI()) - return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags); - - return BlockTransformation::AdvancedProcessBlocks(inBlocks, xorBlocks, outBlocks, length, flags); -} - -#endif // #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE - -NAMESPACE_END - -#endif -#endif |