AVX 256-bit code performing slightly worse than equivalent 128-bit SSSE3 code
我正在编写非常高效的汉明距离计算代码。受 Wojciech Muła 非常巧妙的 SSSE3 popcount 实现的启发,我编写了一个等效的 AVX2 方案,这次使用 256 位寄存器。由于所涉及操作的并行度翻了一倍,我原本期望至少有 30%–40% 的提升,但令我惊讶的是,AVX2 代码反而稍慢一些(大约 2%)。
有人能告诉我为什么我没有获得预期的性能提升的可能原因吗?
展开的 SSSE3 版本——计算两个 64 字节块之间的汉明距离:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 | INT32 SSE_PopCount(const UINT32* __restrict pA, const UINT32* __restrict pB) { __m128i paccum = _mm_setzero_si128(); __m128i a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA)); __m128i b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB)); __m128i err = _mm_xor_si128 (a, b); __m128i lo = _mm_and_si128 (err, low_mask); __m128i hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); __m128i popcnt1 = _mm_shuffle_epi8(lookup, lo); __m128i popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 4)); b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 4)); err = _mm_xor_si128 (a, b); lo = _mm_and_si128 (err, low_mask); hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); popcnt1 = _mm_shuffle_epi8(lookup, lo); popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 8)); b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 8)); err = _mm_xor_si128 (a, b); lo = _mm_and_si128 (err, low_mask); hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); popcnt1 = _mm_shuffle_epi8(lookup, lo); popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 12)); b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 12)); err = _mm_xor_si128 (a, b); lo = _mm_and_si128 (err, low_mask); hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); popcnt1 = _mm_shuffle_epi8(lookup, lo); popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); paccum = 
_mm_sad_epu8(paccum, _mm_setzero_si128()); UINT64 result = paccum.m128i_u64[0] + paccum.m128i_u64[1]; return (INT32)result; } |
使用 AVX 256 位寄存器的等效展开版本:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | INT32 AVX_PopCount(const UINT32* __restrict pA, const UINT32* __restrict pB) { __m256i paccum = _mm256_setzero_si256(); __m256i a = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pA)); __m256i b = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pB)); __m256i err = _mm256_xor_si256 (a, b); __m256i lo = _mm256_and_si256 (err, low_mask256); __m256i hi = _mm256_srli_epi16 (err, 4); hi = _mm256_and_si256 (hi, low_mask256); __m256i popcnt1 = _mm256_shuffle_epi8(lookup256, lo); __m256i popcnt2 = _mm256_shuffle_epi8(lookup256, hi); paccum = _mm256_add_epi8(paccum, popcnt1); paccum = _mm256_add_epi8(paccum, popcnt2); a = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pA + 8)); b = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pB + 8)); err = _mm256_xor_si256 (a, b); lo = _mm256_and_si256 (err, low_mask256); hi = _mm256_srli_epi16 (err, 4); hi = _mm256_and_si256 (hi, low_mask256); popcnt1 = _mm256_shuffle_epi8(lookup256, lo); popcnt2 = _mm256_shuffle_epi8(lookup256, hi); paccum = _mm256_add_epi8(paccum, popcnt1); paccum = _mm256_add_epi8(paccum, popcnt2); paccum = _mm256_sad_epu8(paccum, _mm256_setzero_si256()); UINT64 result = paccum.m256i_i64[0] + paccum.m256i_u64[1] + paccum.m256i_i64[2] + paccum.m256i_i64[3]; return (INT32)result; } |
我已经检查了编译器输出的汇编代码,看起来没有问题:正如预期,intrinsic 函数被直接翻译成了对应的机器指令。我注意到的唯一一点是,在 AVX2 版本中,最后一行需要累加 4 个四字(quadword)才能得到 popcount,生成的代码比 SSE3 版本更复杂(SSE3 版本只需累加 2 个四字),但我仍然期望它有更高的吞吐量。
为四字累积生成的AVX2代码
1 2 3 4 5 6 7 8 9 10 11 | vextractf128 xmm0, ymm2, 1 psrldq xmm0, 8 movd ecx, xmm2 movd eax, xmm0 vextractf128 xmm0, ymm2, 1 psrldq xmm2, 8 add eax, ecx movd ecx, xmm0 add eax, ecx movd ecx, xmm2 add eax, ecx |
为四字累积生成的SSE3代码
1 2 3 4 | movd ecx, xmm2 psrldq xmm2, 8 movd eax, xmm2 add eax, ecx |
我的测试程序对每个例程各调用 100 万次,每次使用不同的输入值,但重复使用两个静态缓冲区来保存 pA 和 pB 参数的数据。
测试例程
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | int _tmain(int argc, _TCHAR* argv[]) { lookup = _mm_setr_epi8( /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 ); low_mask = _mm_set1_epi8(0xf); lookup256 = _mm256_setr_epi8( /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 ); low_mask256 = _mm256_set1_epi8(0xf); std::default_random_engine generator; generator.seed(37); std::uniform_int_distribution<UINT32> distribution(0, ULONG_MAX); auto dice = std::bind( distribution, generator); UINT32 a[16]; UINT32 b[16]; int count; count = 0; { cout <<"AVX PopCount "; boost::timer::auto_cpu_timer t; for( int i = 0; i < 1000000; i++ ) { for( int j = 0; j < 16; j++ ) { a[j] = dice(); b[j] = dice(); } count+= AVX_PopCount(a, b); } } cout << count <<" "; std::default_random_engine generator2; generator2.seed(37); std::uniform_int_distribution<UINT32> distribution2(0, ULONG_MAX); auto dice2 = std::bind( distribution2, generator2); count = 0; { cout <<"SSE PopCount "; boost::timer::auto_cpu_timer t; for( int i = 0; i < 1000000; i++ ) { for( int j = 0; j < 16; j++ ) { a[j] = dice2(); b[j] = dice2(); } count+= SSE_PopCount(a, b); } } cout << count <<" "; getch(); return 0; } |
测试机是Intel Corei7 4790,我使用的是Visual Studio 2012 Pro。
除了评论中提到的小问题(例如使用 /arch:AVX 编译)之外,主要问题在于每次迭代都要重新生成随机输入数组:这才是瓶颈,因此你的测试并没有真正衡量这两个例程。下面的代码改用 GetTickCount 计时:
// Times one million AVX_PopCount calls with GetTickCount, regenerating both
// input buffers before every call (as in the question's benchmark).
int count = 0;
{
    cout << "AVX PopCount ";
    const unsigned int startTick = GetTickCount();
    for (int iter = 0; iter < 1000000; ++iter)
    {
        // 32 random draws per iteration; a[k] is drawn before b[k].
        for (int k = 0; k < 16; ++k)
        {
            a[k] = dice();
            b[k] = dice();
        }
        count += AVX_PopCount(a, b);
    }
    const unsigned int elapsedTicks = GetTickCount() - startTick;
    cout << elapsedTicks << " ";
}
产生输出:
AVX PopCount
2309
256002470
也就是说,2309 毫秒完成……但如果完全去掉对 AVX 例程的调用,只生成输入数组,会怎样呢?
// Control measurement: the same loop as the AVX timing run, but without the
// call to AVX_PopCount, so only the random input generation is timed.
int count = 0;
{
    cout << "Just making arrays... ";
    const unsigned int startTick = GetTickCount();
    for (int iter = 0; iter < 1000000; ++iter)
    {
        for (int k = 0; k < 16; ++k)
        {
            a[k] = dice();
            b[k] = dice();
        }
    }
    const unsigned int elapsedTicks = GetTickCount() - startTick;
    cout << elapsedTicks << " ";
}
产生输出:
Just making arrays...
2246
结果如何?其实这并不奇怪:你每次迭代要生成 32 个随机数,这可能非常昂贵,而之后只执行了一些相当快的整数运算和 shuffle(混洗)操作。
所以…
现在把迭代次数再提高 100 倍,并把随机数生成移出紧凑循环。这里在禁用优化的情况下编译,可以让代码按原样执行,而不会把“无用”的迭代优化掉——毕竟我们关心的那部分代码已经(手工)优化过了!
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | for (int j = 0; j < 16; j++) { a[j] = dice(); b[j] = dice(); } int count; count = 0; { cout <<"AVX PopCount "; unsigned int Tick = GetTickCount(); for (int i = 0; i < 100000000; i++) { count += AVX_PopCount(a, b); } Tick = GetTickCount() - Tick; cout << Tick <<" "; } cout << count <<" "; count = 0; { cout <<"SSE PopCount "; unsigned int Tick = GetTickCount(); for (int i = 0; i < 100000000; i++) { count += SSE_PopCount(a, b); } Tick = GetTickCount() - Tick; cout << Tick <<" "; } cout << count <<" "; |
产生输出:
AVX PopCount
3744
730196224
SSE PopCount
5616
730196224
所以恭喜你-你可以拍拍自己的背,你的AVX程序确实比SSE程序快三分之一(这里测试的是Haswell I7)。这一课是要确保你确实在分析你认为你在分析的东西!
您应该考虑直接使用普通的 `_mm_popcnt_u64` 指令,而不是用 SSE 或 AVX 手工实现 popcount。
试试这个,应该是最快的:
// Counts the set bits in a 256-bit value stored as four consecutive 64-bit
// words starting at u, using the hardware POPCNT instruction on each word.
//
// u: pointer to at least four readable uint64_t values.
// Returns the total number of set bits (0..256).
//
// The earlier version ended every term with ';', so only the count of u[0]
// was returned and the other three terms were dead expression statements.
#if defined(__GNUC__) && !defined(__POPCNT__)
__attribute__((target("popcnt")))  // allow the intrinsic without -mpopcnt on GCC/Clang
#endif
int popcount256(const uint64_t* u)
{
    return static_cast<int>(_mm_popcnt_u64(u[0])
                          + _mm_popcnt_u64(u[1])
                          + _mm_popcnt_u64(u[2])
                          + _mm_popcnt_u64(u[3]));
}
我知道这并不能回答你的核心问题,为什么AVX速度较慢,但由于你的最终目标是快速popcount,因此AVX<->SSE比较与此无关,因为两者都不如内置popcount。