Vector Class Discussion

transpose 8x8 with AVX
Author: chad  Date: 2013-06-04 07:50
I'm still using your vector class quite frequently. One of the things I wanted to do recently was an 8x8 transpose, which I did with 24 blends. However, I found a version using intrinsics which is much faster. In fact, it's quicker to do four 4x4 transposes with SSE than to do one 8x8 transpose the way I have done it with the vector class. Do you have a suggestion for how I could get the same speed with the vector class? I have provided code for two functions below: one uses Vec8f to do the transpose, the other uses intrinsics directly.

#include "vectorclass.h"   // Vec8f, blend8f

inline void transpose8_vec8f(Vec8f &row0, Vec8f &row1, Vec8f &row2, Vec8f &row3, Vec8f &row4, Vec8f &row5, Vec8f &row6, Vec8f &row7) {
    Vec8f tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    // Pair up the low halves (elements 0-3) of adjacent rows
    tmp0 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row0, row1);
    tmp1 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row2, row3);
    tmp2 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row4, row5);
    tmp3 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(row6, row7);

    // Pair up the high halves (elements 4-7) of adjacent rows
    tmp4 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row0, row1);
    tmp5 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row2, row3);
    tmp6 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row4, row5);
    tmp7 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(row6, row7);

    // Interleave to build output rows 0 and 1 (columns 0 and 1 of the input)
    tmp8 = blend8f<0, 4, 8, 12, 1, 5, 9, 13>(tmp0, tmp1);
    tmp9 = blend8f<0, 4, 8, 12, 1, 5, 9, 13>(tmp2, tmp3);
    row0 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp8, tmp9);
    row1 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp8, tmp9);

    // Output rows 2 and 3 (columns 2 and 3 of the input)
    tmp10 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(tmp0, tmp1);
    tmp11 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(tmp2, tmp3);
    row2 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp10, tmp11);
    row3 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp10, tmp11);

    // Output rows 4 and 5 (columns 4 and 5 of the input)
    tmp12 = blend8f<0, 4, 8, 12, 1, 5, 9, 13>(tmp4, tmp5);
    tmp13 = blend8f<0, 4, 8, 12, 1, 5, 9, 13>(tmp6, tmp7);
    row4 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp12, tmp13);
    row5 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp12, tmp13);

    // Output rows 6 and 7 (columns 6 and 7 of the input)
    tmp14 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(tmp4, tmp5);
    tmp15 = blend8f<2, 6, 10, 14, 3, 7, 11, 15>(tmp6, tmp7);
    row6 = blend8f<0, 1, 2, 3, 8, 9, 10, 11>(tmp14, tmp15);
    row7 = blend8f<4, 5, 6, 7, 12, 13, 14, 15>(tmp14, tmp15);
}
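
For context, here is roughly how I call the Vec8f version. The in-place wrapper, its name, and the row-major float[64] layout are just for illustration, not part of the vector class:

// Sketch only: transpose a row-major 8x8 matrix in place using transpose8_vec8f above.
inline void transpose8x8_vec8f_inplace(float *m) {
    Vec8f r0, r1, r2, r3, r4, r5, r6, r7;
    // Load the eight rows
    r0.load(m);       r1.load(m + 8);   r2.load(m + 16);  r3.load(m + 24);
    r4.load(m + 32);  r5.load(m + 40);  r6.load(m + 48);  r7.load(m + 56);
    transpose8_vec8f(r0, r1, r2, r3, r4, r5, r6, r7);
    // Store the transposed rows back
    r0.store(m);      r1.store(m + 8);  r2.store(m + 16); r3.store(m + 24);
    r4.store(m + 32); r5.store(m + 40); r6.store(m + 48); r7.store(m + 56);
}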

#include <immintrin.h>   // AVX intrinsics

inline void transpose8_avx(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) {
    __m256 t0, t1, t2, t3, t4, t5, t6, t7;
    __m256 tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;

    // Interleave pairs of rows within each 128-bit lane
    t0 = _mm256_unpacklo_ps(row0, row1);
    t1 = _mm256_unpackhi_ps(row0, row1);
    t2 = _mm256_unpacklo_ps(row2, row3);
    t3 = _mm256_unpackhi_ps(row2, row3);
    t4 = _mm256_unpacklo_ps(row4, row5);
    t5 = _mm256_unpackhi_ps(row4, row5);
    t6 = _mm256_unpacklo_ps(row6, row7);
    t7 = _mm256_unpackhi_ps(row6, row7);

    // Combine elements from four rows, still within each 128-bit lane
    tt0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
    tt1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
    tt2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
    tt3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
    tt4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
    tt5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
    tt6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
    tt7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));

    // Recombine the 128-bit lanes: 0x20 takes the low lane of each argument,
    // 0x31 takes the high lane of each argument
    row0 = _mm256_permute2f128_ps(tt0, tt4, 0x20);
    row1 = _mm256_permute2f128_ps(tt1, tt5, 0x20);
    row2 = _mm256_permute2f128_ps(tt2, tt6, 0x20);
    row3 = _mm256_permute2f128_ps(tt3, tt7, 0x20);
    row4 = _mm256_permute2f128_ps(tt0, tt4, 0x31);
    row5 = _mm256_permute2f128_ps(tt1, tt5, 0x31);
    row6 = _mm256_permute2f128_ps(tt2, tt6, 0x31);
    row7 = _mm256_permute2f128_ps(tt3, tt7, 0x31);
}
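
For comparison, this is roughly what I mean by doing four 4x4 transposes with SSE. It is a sketch of the idea, assuming an out-of-place transpose of a row-major float[64], not the exact code I timed:

#include <xmmintrin.h>   // SSE, _MM_TRANSPOSE4_PS

// Transpose the 4x4 block whose top-left element is src[r*8 + c] and store it
// at the mirrored position dst[c*8 + r].
static inline void transpose4_block(const float *src, float *dst, int r, int c) {
    __m128 a0 = _mm_loadu_ps(src + (r + 0) * 8 + c);
    __m128 a1 = _mm_loadu_ps(src + (r + 1) * 8 + c);
    __m128 a2 = _mm_loadu_ps(src + (r + 2) * 8 + c);
    __m128 a3 = _mm_loadu_ps(src + (r + 3) * 8 + c);
    _MM_TRANSPOSE4_PS(a0, a1, a2, a3);
    _mm_storeu_ps(dst + (c + 0) * 8 + r, a0);
    _mm_storeu_ps(dst + (c + 1) * 8 + r, a1);
    _mm_storeu_ps(dst + (c + 2) * 8 + r, a2);
    _mm_storeu_ps(dst + (c + 3) * 8 + r, a3);
}

// 8x8 transpose as four independent 4x4 block transposes (dst must not alias src).
inline void transpose8_sse(const float *src, float *dst) {
    transpose4_block(src, dst, 0, 0);
    transpose4_block(src, dst, 0, 4);
    transpose4_block(src, dst, 4, 0);
    transpose4_block(src, dst, 4, 4);
}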

 
thread transpose 8x8 with AVX - chad - 2013-06-04
last reply transpose 8x8 with AVX - Agner - 2013-06-05