我正在尝试使用内在函数(intrinsics)在 ARMv8 上接入一个 AES 实现。我已经有一个 C++ 实现,也有一个基于 Intel 内在函数的实现。
实现应该是等效的,所以我试图使用英特尔作为 ARMv8 的蓝图。有一些差异,但它们被考虑在内。问题是,我得到了不同的结果。
// Encrypts one 16-byte block read from `in` and writes the 16-byte result to
// `out`, using the round keys in `rdkeys`. The ARMv8 Crypto branch or the
// Intel AES-NI branch is selected by feature macro; if neither macro is
// defined the body is empty and `out` is left unwritten.
//
// NOTE(review): the two branches do not consume the same number of round
// keys. The Intel branch reads rdkeys[0..rounds]; the ARM branch reads
// rdkeys[0..rounds+1] (see the notes inside the ARM branch).
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)
uint8x16_t data = vld1q_u8(in);
// AES encryption with ARM intrinsics:
// Intended: rounds-1 (9 for AES128) cycles of AES:
// (Add, Shift, Sub) plus Mix Columns
// NOTE(review): the loop bound below is `rounds`, not `rounds-1`, so the body
// runs `rounds` times using rdkeys[0..rounds-1].
unsigned int i;
for (i=0; i<rounds; ++i)
{
// AES single round encryption
data = vaeseq_u8(data, rdkeys[i]);
// AES mix columns
data = vaesmcq_u8(data);
}
// One round of encryption: AES, no Mix Columns
// Here i == rounds, so this uses rdkeys[rounds]; the post-increment then
// leaves i == rounds+1.
data = vaeseq_u8(data, rdkeys[i++]);
// Final Add (bitwise Xor)
// NOTE(review): this reads rdkeys[rounds+1]. A caller that supplies exactly
// rounds+1 keys (indices 0..rounds) gets an out-of-bounds read here.
data = veorq_u8(data, rdkeys[i]);
vst1q_u8(out, data);
#elif defined(__AES__)
__m128i data = _mm_loadu_si128((const __m128i*)in);
// Initial round-key addition with rdkeys[0].
data = _mm_xor_si128(data, rdkeys[0]);
// aesenc with rdkeys[1..rounds-2] ...
for (unsigned int i=1; i<rounds-1; ++i)
{
data = _mm_aesenc_si128(data, rdkeys[i]);
}
// ... one more aesenc with rdkeys[rounds-1] ...
data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
// ... and the last round with rdkeys[rounds].
data = _mm_aesenclast_si128(data, rdkeys[rounds]);
_mm_storeu_si128((__m128i*)out, data);
#endif
}
目前,我有意绕开了子密钥(subkey)的计算。我对这两种实现使用同一组轮密钥(round key):
// RoundKey is the platform's SIMD vector type used for one round key;
// Byte is a plain 8-bit unsigned value on both platforms.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8x16_t RoundKey;
typedef uint8_t Byte;
#elif defined(__AES__)
typedef __m128i RoundKey;
typedef uint8_t Byte;
#endif
// Avoid subkey scheduling at this point:
// every byte of round key i is set to (i<<4)|i, i.e. 0x00, 0x11, 0x22, ...
// for i = 0, 1, 2, ... (memset stores the value converted to unsigned char).
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey));
但是,我得出了不同的结果。以下是转储产生的内容:
英特尔 AES-NI :
In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69
...
Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69
ARMv8 AES :
In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9
...
Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9
我一直对这个结果百思不得其解。添加更多 printf 也无助于定位问题。我开始怀疑 Intel 和 ARM 的内在函数使用了不同的子密钥调度(key schedule)。
ARM 和 Intel 内在函数是否对 AES 使用相同的子密钥调度?
下图来自 paper by Cynthia Crutchfield 。它检查 Intel 内在函数和 ARM 内在函数的映射。
下面是完整的程序。还列出了构建它们的命令行。
英特尔 :
g++ -Wall -maes aes-test.cxx -o aes-test.exe
ARMv8 :
g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe
程序 :
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#if defined(__ARM_FEATURE_CRYPTO)
# include <arm_neon.h>
# include <arm_acle.h>
#elif defined(__AES__)
# include <wmmintrin.h>
# include <emmintrin.h>
#endif
// Platform-dependent aliases used by the AES routines in this file:
//   Byte     - one 8-bit unsigned value (same on both platforms)
//   RoundKey - the SIMD vector type that holds one round key
// Neither alias is defined when no supported feature macro is present.
#if defined(__ARM_FEATURE_CRYPTO)
typedef uint8_t    Byte;
typedef uint8x16_t RoundKey;
#elif defined(__AES__)
typedef uint8_t Byte;
typedef __m128i RoundKey;
#endif
// Element count of the array `x`. Only meaningful when `x` is an actual
// array; a pointer argument yields sizeof(pointer)/sizeof(element) instead.
#define COUNTOF(x) (sizeof(x) / sizeof((x)[0]))
// Round count passed to AES_encrypt by the test driver; the key array is
// sized ROUNDS+1.
static const unsigned int ROUNDS = 10;
// Encrypts the block at `in` into `out` with the round keys in `rdkeys`;
// defined further down in this file.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);
// NOTE(review): declared only; no definition or call of AES_decrypt appears
// in the visible source.
void AES_decrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds);
void Print(const char* label, const Byte *in, size_t len, bool lf=false)
{
if (label)
printf("%s: ", label);
for (size_t i=0; in && i<len; ++i)
printf("%02X ", in[i]);
printf("\n");
if (lf)
printf("\n");
}
int main(int argc, char* argv[])
{
Byte cipher[16], recover[16];
const Byte plain[16] = {
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF
};
// Avoid subkey scheduling at this point
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
memset(&rdkeys[i], (i<<4)|i, sizeof(rdkeys[i]));
AES_encrypt(plain, cipher, rdkeys, ROUNDS);
return 0;
}
// Encrypts one 16-byte block from `in` into `out` using the round keys in
// `rdkeys`, printing the input, every round key used, the intermediate state
// after each step, and the final output via Print(). The ARMv8 Crypto branch
// or the Intel AES-NI branch is selected by feature macro; if neither macro
// is defined nothing is written to `out` before it is printed.
//
// NOTE(review): the ARM branch reads rdkeys[0..rounds+1], whereas the Intel
// branch reads rdkeys[0..rounds] (see the notes inside the ARM branch).
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
Print("In", in, 16);
#if defined(__ARM_FEATURE_CRYPTO)
// Load the block
uint8x16_t data = vld1q_u8(in);
Print("Data (in)", (Byte*)&data, 16, true);
// AES encryption with ARM intrinsics:
// Intended: rounds-1 (9 for AES128) cycles of AES:
// (Add, Shift, Sub) plus Mix Columns
// NOTE(review): the loop bound below is `rounds`, not `rounds-1`, so the body
// runs `rounds` times using rdkeys[0..rounds-1].
unsigned int i;
for (i=0; i<rounds; ++i)
{
// AES single round encryption
data = vaeseq_u8(data, rdkeys[i]);
// AES mix columns
data = vaesmcq_u8(data);
Print("Key", (Byte*)&rdkeys[i], 16);
Print("Data", (Byte*)&data, 16, true);
}
// Here i == rounds, so the key printed and used next is rdkeys[rounds].
Print("Key", (Byte*)&rdkeys[i], 16);
// One round of encryption: AES, no Mix Columns
// (the post-increment leaves i == rounds+1)
data = vaeseq_u8(data, rdkeys[i++]);
Print("Data", (Byte*)&data, 16, true);
// Final Add (bitwise Xor)
// NOTE(review): this reads rdkeys[rounds+1]; main() in this file allocates
// only ROUNDS+1 keys (indices 0..ROUNDS), so this index is out of bounds.
data = veorq_u8(data, rdkeys[i]);
Print("Data (xor)", (Byte*)&data, 16);
// Store the output data
vst1q_u8(out, data);
#elif defined(__AES__)
// Load the block
__m128i data = _mm_loadu_si128((const __m128i*)in);
Print("Data (in)", (Byte*)&data, 16);
// Initial round-key addition with rdkeys[0]
data = _mm_xor_si128(data, rdkeys[0]);
Print("Key", (Byte*)&rdkeys[0], 16);
Print("Data (xor)", (Byte*)&data, 16, true);
// aesenc with rdkeys[1..rounds-2]
for (unsigned int i=1; i<rounds-1; ++i)
{
data = _mm_aesenc_si128(data, rdkeys[i]);
Print("Key", (Byte*)&rdkeys[i], 16);
Print("Data", (Byte*)&data, 16, true);
}
// One more aesenc with rdkeys[rounds-1]
data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
Print("Key", (Byte*)&rdkeys[rounds-1], 16);
Print("Data", (Byte*)&data, 16, true);
// Last round with rdkeys[rounds]
data = _mm_aesenclast_si128(data, rdkeys[rounds]);
Print("Key", (Byte*)&rdkeys[rounds], 16);
Print("Data", (Byte*)&data, 16, true);
// Store the output data
_mm_storeu_si128((__m128i*)out, data);
#endif
Print("Out", out, 16);
}
最佳答案
看来答案是肯定的。我仍然需要针对真实的 key 调度进行测试,但是我能够使用相同的 key 调度对 Intel 和 ARMv8 内在函数产生相同的结果。
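看起来 Crutchfield 的参考实现中有一个差一(off-by-one)错误。循环控制应该使用 rounds-1,而不是 rounds。这意味着我在 ARMv8 上实际测试的是 11 轮而不是 10 轮。(多出的一轮还会让最后的 XOR 读取 rdkeys[rounds+1],超出了只含 rounds+1 个轮密钥的数组范围,这很可能就是输出中出现不一致字节的原因。)当 ARMv8 代码生成 F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9 而不是 F9 F9 ... F9 F9 时,我就应该对此产生怀疑。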
这是更新后的代码:
// Encrypts one 16-byte block from `in` into `out`.
//   in     - 16 input bytes (loaded with an unaligned-capable load)
//   out    - receives 16 output bytes
//   rdkeys - round keys; indices 0..rounds are read (for rounds >= 2)
//   rounds - number of AES rounds (the driver in this file passes 10)
// Both branches apply rounds-1 rounds with MixColumns followed by one round
// without it. If neither feature macro is defined the body is empty and
// `out` is not written.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)
    const unsigned int last = rounds - 1;
    uint8x16_t block = vld1q_u8(in);

    // vaeseq_u8 adds the round key *before* the byte substitution, so the
    // first key is consumed by the first round and the final key needs a
    // separate XOR at the end.
    unsigned int r = 0;
    while (r < last)
    {
        block = vaesmcq_u8(vaeseq_u8(block, rdkeys[r]));
        ++r;
    }

    // Final round: no MixColumns, then the closing key addition.
    block = vaeseq_u8(block, rdkeys[r]);
    block = veorq_u8(block, rdkeys[r + 1]);

    vst1q_u8(out, block);
#elif defined(__AES__)
    const unsigned int last = rounds - 1;
    __m128i block = _mm_loadu_si128((const __m128i*)in);

    // The initial key addition is explicit here; each aesenc then uses the
    // key of its own round.
    block = _mm_xor_si128(block, rdkeys[0]);

    unsigned int r = 1;
    for (; r < last; ++r)
        block = _mm_aesenc_si128(block, rdkeys[r]);

    block = _mm_aesenc_si128(block, rdkeys[r]);
    block = _mm_aesenclast_si128(block, rdkeys[r + 1]);

    _mm_storeu_si128((__m128i*)out, block);
#endif
}
关于c - ARM 和 Intel 内在函数是否对 AES 使用相同的子 key 计划?,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/45528569/