// Assume the eight source rows have been loaded as shown below.
// Each trailing comment labels that row's eight elements as <row digit><column digit>.
// NOTE(review): the declaration of `src` is not visible here; presumably an
// array (or pointer) of __m64 with at least 8 entries — confirm.
__m64 row0 = src[0]; // 00 01 02 03 04 05 06 07
__m64 row1 = src[1]; // 10 11 12 13 14 15 16 17
__m64 row2 = src[2]; // 20 21 22 23 24 25 26 27
__m64 row3 = src[3]; // 30 31 32 33 34 35 36 37
__m64 row4 = src[4]; // 40 41 42 43 44 45 46 47
__m64 row5 = src[5]; // 50 51 52 53 54 55 56 57
__m64 row6 = src[6]; // 60 61 62 63 64 65 66 67
__m64 row7 = src[7]; // 70 71 72 73 74 75 76 77
/**
 * Transposes an 8x8 matrix of 8-bit elements held in eight __m64 values.
 *
 * Element labels in the comments are "<row><column>", listed from the
 * least-significant byte of a register to its most-significant byte.
 *
 * @param row0..row7  Input rows; rowN carries elements N0..N7.
 * @param col0..col7  Outputs; colN receives elements 0N..7N, i.e. column N
 *                    of the input matrix.
 *
 * The transpose is done in three interleave stages of widening granularity
 * (8-bit, 16-bit, 32-bit). This function does not call _mm_empty().
 */
inline void transpose8_8x8_mmx(
    __m64 row0, __m64 row1, __m64 row2, __m64 row3, __m64 row4, __m64 row5, __m64 row6, __m64 row7,
    __m64& col0, __m64& col1, __m64& col2, __m64& col3, __m64& col4, __m64& col5, __m64& col6, __m64& col7 )
{
    // Stage 1: interleave the bytes of each adjacent row pair.
    const __m64 p01_lo = _mm_unpacklo_pi8( row0, row1 ); // 00 10 01 11 02 12 03 13
    const __m64 p01_hi = _mm_unpackhi_pi8( row0, row1 ); // 04 14 05 15 06 16 07 17
    const __m64 p23_lo = _mm_unpacklo_pi8( row2, row3 ); // 20 30 21 31 22 32 23 33
    const __m64 p23_hi = _mm_unpackhi_pi8( row2, row3 ); // 24 34 25 35 26 36 27 37
    const __m64 p45_lo = _mm_unpacklo_pi8( row4, row5 ); // 40 50 41 51 42 52 43 53
    const __m64 p45_hi = _mm_unpackhi_pi8( row4, row5 ); // 44 54 45 55 46 56 47 57
    const __m64 p67_lo = _mm_unpacklo_pi8( row6, row7 ); // 60 70 61 71 62 72 63 73
    const __m64 p67_hi = _mm_unpackhi_pi8( row6, row7 ); // 64 74 65 75 66 76 67 77

    // Stage 2: interleave 16-bit units, producing four-row slices.
    // "top" covers rows 0-3, "bot" covers rows 4-7; cXY names the two
    // columns stored in the low and high halves of the register.
    const __m64 top_c01 = _mm_unpacklo_pi16( p01_lo, p23_lo ); // 00 10 20 30 01 11 21 31
    const __m64 top_c23 = _mm_unpackhi_pi16( p01_lo, p23_lo ); // 02 12 22 32 03 13 23 33
    const __m64 top_c45 = _mm_unpacklo_pi16( p01_hi, p23_hi ); // 04 14 24 34 05 15 25 35
    const __m64 top_c67 = _mm_unpackhi_pi16( p01_hi, p23_hi ); // 06 16 26 36 07 17 27 37
    const __m64 bot_c01 = _mm_unpacklo_pi16( p45_lo, p67_lo ); // 40 50 60 70 41 51 61 71
    const __m64 bot_c23 = _mm_unpackhi_pi16( p45_lo, p67_lo ); // 42 52 62 72 43 53 63 73
    const __m64 bot_c45 = _mm_unpacklo_pi16( p45_hi, p67_hi ); // 44 54 64 74 45 55 65 75
    const __m64 bot_c67 = _mm_unpackhi_pi16( p45_hi, p67_hi ); // 46 56 66 76 47 57 67 77

    // Stage 3: join the top and bottom 32-bit halves into full columns.
    col0 = _mm_unpacklo_pi32( top_c01, bot_c01 ); // 00 10 20 30 40 50 60 70
    col1 = _mm_unpackhi_pi32( top_c01, bot_c01 ); // 01 11 21 31 41 51 61 71
    col2 = _mm_unpacklo_pi32( top_c23, bot_c23 ); // 02 12 22 32 42 52 62 72
    col3 = _mm_unpackhi_pi32( top_c23, bot_c23 ); // 03 13 23 33 43 53 63 73
    col4 = _mm_unpacklo_pi32( top_c45, bot_c45 ); // 04 14 24 34 44 54 64 74
    col5 = _mm_unpackhi_pi32( top_c45, bot_c45 ); // 05 15 25 35 45 55 65 75
    col6 = _mm_unpacklo_pi32( top_c67, bot_c67 ); // 06 16 26 36 46 56 66 76
    col7 = _mm_unpackhi_pi32( top_c67, bot_c67 ); // 07 17 27 37 47 57 67 77
}
|