LCOV - code coverage report
Current view: top level - src/libavcodec/x86 - simple_idct.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 16 16 100.0 %
Date: 2017-01-21 09:32:20 Functions: 4 4 100.0 %

          Line data    Source code
       1             : /*
       2             :  * Simple IDCT MMX
       3             :  *
       4             :  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
       5             :  *
       6             :  * This file is part of FFmpeg.
       7             :  *
       8             :  * FFmpeg is free software; you can redistribute it and/or
       9             :  * modify it under the terms of the GNU Lesser General Public
      10             :  * License as published by the Free Software Foundation; either
      11             :  * version 2.1 of the License, or (at your option) any later version.
      12             :  *
      13             :  * FFmpeg is distributed in the hope that it will be useful,
      14             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      16             :  * Lesser General Public License for more details.
      17             :  *
      18             :  * You should have received a copy of the GNU Lesser General Public
      19             :  * License along with FFmpeg; if not, write to the Free Software
      20             :  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
      21             :  */
      22             : 
      23             : #include "libavutil/mem.h"
      24             : #include "libavutil/x86/asm.h"
      25             : 
      26             : #include "libavcodec/idctdsp.h"
      27             : 
      28             : #include "idctdsp.h"
      29             : #include "simple_idct.h"
      30             : 
      31             : #if HAVE_INLINE_ASM
      32             : 
      33             : /* cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7, before rounding:
      34             : 23170.475006
      35             : 22725.260826
      36             : 21406.727617
      37             : 19265.545870
      38             : 16384.000000
      39             : 12872.826198
      40             : 8866.956905
      41             : 4520.335430
      42             : */
      43             : #define C0 23170 // i = 0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated
      44             : #define C1 22725 // i = 1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated
      45             : #define C2 21407 // i = 2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated
      46             : #define C3 19266 // i = 3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated
      47             : #define C4 16383 // i = 4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5, i.e. deliberately one below the exact 16384
      48             : #define C5 12873 // i = 5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated
      49             : #define C6 8867  // i = 6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated
      50             : #define C7 4520  // i = 7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated
      51             : 
      52             : #define ROW_SHIFT 11 // the coeffs table stores 1<<(ROW_SHIFT-1) as the row rounder
      53             : #define COL_SHIFT 20 // 6 -- NOTE(review): in the visible part of this file COL_SHIFT only appears inside comments; confirm whether it is still used
      54             : 
      55             : DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // keeps the upper 16 bits of each 32-bit half; the DC_COND_* macros AND it with src0 in their all-zero test
      56             : DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 1<<18 in the low 32 bits only; added on the 1: shortcut path of DC_COND_IDCT
      57             : 
      58             : DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { // each row below is four int16_t = 8 bytes; row order fixes the byte offsets noted on the right
      59             :         1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: row rounder 1<<(ROW_SHIFT-1) in each 32-bit lane
      60             : //        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
      61             : //        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
      62             :         1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: same rounder, plus an extra 1<<16 in the low 32-bit lane
      63             :         // the 1 (worth 1<<16 once the int16 pair is read as one 32-bit lane) = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
      64             : //        0, 0, 0, 0,
      65             : //        0, 0, 0, 0,
      66             : 
      67             :  C4,  C4,  C4,  C4, // offset 16; the inline asm reads these rows as 16(%2) .. 104(%2) (presumably %2 = coeffs -- confirm against the operand list)
      68             :  C4, -C4,  C4, -C4, // offset 24
      69             : 
      70             :  C2,  C6,  C2,  C6, // offset 32
      71             :  C6, -C2,  C6, -C2, // offset 40
      72             : 
      73             :  C1,  C3,  C1,  C3, // offset 48
      74             :  C5,  C7,  C5,  C7, // offset 56
      75             : 
      76             :  C3, -C7,  C3, -C7, // offset 64
      77             : -C1, -C5, -C1, -C5, // offset 72
      78             : 
      79             :  C5, -C1,  C5, -C1, // offset 80
      80             :  C7,  C3,  C7,  C3, // offset 88
      81             : 
      82             :  C7, -C5,  C7, -C5, // offset 96
      83             :  C3, -C1,  C3, -C1 // offset 104
      84             : };
      85             : 
      86      949565 : static inline void idct(int16_t *block)
      87             : {
      88      949565 :         LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
      89      949565 :         int16_t * const temp= (int16_t*)align_tmp;
      90             : 
      91      949565 :         __asm__ volatile(
      92             : #if 0 //Alternative, simpler variant
      93             : 
      94             : #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) /* row pass: loads 4 quadwords (src*), stores 4 quadwords (dst, 8+dst, 16+dst, 24+dst), each result >> shift */ \
      95             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
      96             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
      97             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
      98             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
      99             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     100             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     101             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     102             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     103             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     104             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     105             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     106             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     107             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     108             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     109             :         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 before it is split into A0/A3 */\
     110             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     111             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     112             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     113             :         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
     114             :         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     115             :         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 before it is split into A1/A2 */\
     116             :         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
     117             :         "paddd %%mm0, %%mm0             \n\t" /* doubled so that subtracting A1 below leaves A2 */\
     118             :         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
     119             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     120             :         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
     121             :         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
     122             :         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     123             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     124             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     125             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     126             :         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
     127             :         "psrad $" #shift ", %%mm7       \n\t"\
     128             :         "psrad $" #shift ", %%mm4       \n\t"\
     129             :         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
     130             :         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
     131             :         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     132             :         "psrad $" #shift ", %%mm1       \n\t"\
     133             :         "psrad $" #shift ", %%mm2       \n\t"\
     134             :         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
     135             :         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
     136             :         "movq %%mm7, " #dst "           \n\t"\
     137             :         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
     138             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     139             :         "movq %%mm2, 24+" #dst "        \n\t"\
     140             :         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     141             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     142             :         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     143             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     144             :         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
     145             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     146             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     147             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     148             :         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
     149             :         "psrad $" #shift ", %%mm2       \n\t"\
     150             :         "psrad $" #shift ", %%mm0       \n\t"\
     151             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     152             :         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
     153             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     154             :         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     155             :         "psrad $" #shift ", %%mm6       \n\t"\
     156             :         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
     157             :         "movq %%mm2, 8+" #dst "         \n\t"\
     158             :         "psrad $" #shift ", %%mm4       \n\t"\
     159             :         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
     160             :         "movq %%mm4, 16+" #dst "        \n\t"\
     161             : 
     162             : #define COL_IDCT(src0, src4, src1, src5, dst, shift) /* column pass: no rounder argument; loads 4 quadwords (src*), does eight 32-bit stores 16 bytes apart starting at dst */ \
     163             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     164             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     165             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     166             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     167             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     168             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     169             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     170             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     171             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     172             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     173             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     174             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     175             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     176             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     177             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     178             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     179             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     180             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     181             :         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
     182             :         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
     183             :         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
     184             :         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     185             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     186             :         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
     187             :         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
     188             :         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     189             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     190             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     191             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     192             :         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
     193             :         "psrad $" #shift ", %%mm7       \n\t"\
     194             :         "psrad $" #shift ", %%mm4       \n\t"\
     195             :         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
     196             :         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
     197             :         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     198             :         "psrad $" #shift ", %%mm0       \n\t"\
     199             :         "psrad $" #shift ", %%mm2       \n\t"\
     200             :         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
     201             :         "movd %%mm7, " #dst "           \n\t" /* -> dst + 0*16 */\
     202             :         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
     203             :         "movd %%mm0, 16+" #dst "        \n\t" /* -> dst + 1*16 */\
     204             :         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
     205             :         "movd %%mm2, 96+" #dst "        \n\t" /* -> dst + 6*16 */\
     206             :         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
     207             :         "movd %%mm4, 112+" #dst "       \n\t" /* -> dst + 7*16 */\
     208             :         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
     209             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     210             :         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     211             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     212             :         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     213             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     214             :         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
     215             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     216             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     217             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     218             :         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
     219             :         "psrad $" #shift ", %%mm2       \n\t"\
     220             :         "psrad $" #shift ", %%mm5       \n\t"\
     221             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     222             :         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
     223             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     224             :         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     225             :         "psrad $" #shift ", %%mm6       \n\t"\
     226             :         "psrad $" #shift ", %%mm4       \n\t"\
     227             :         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
     228             :         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
     229             :         "movd %%mm2, 32+" #dst "        \n\t" /* -> dst + 2*16 */\
     230             :         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
     231             :         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
     232             :         "movd %%mm6, 48+" #dst "        \n\t" /* -> dst + 3*16 */\
     233             :         "movd %%mm4, 64+" #dst "        \n\t" /* -> dst + 4*16 */\
     234             :         "movd %%mm5, 80+" #dst "        \n\t" /* -> dst + 5*16 */\
     235             : 
     236             : 
     237             : #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) /* same arithmetic as ROW_IDCT, plus a shortcut (label 1:) taken when only the low word of each src0 dword is non-zero */ \
     238             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     239             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     240             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     241             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     242             :         "movq "MANGLE(wm1010)", %%mm4   \n\t" /* mask selecting the high word of each dword */\
     243             :         "pand %%mm0, %%mm4              \n\t" /* src0 with the low word of each dword ignored (R4, r4) */\
     244             :         "por %%mm1, %%mm4               \n\t" /* | all of src4 */\
     245             :         "por %%mm2, %%mm4               \n\t" /* | all of src1 */\
     246             :         "por %%mm3, %%mm4               \n\t" /* | all of src5 */\
     247             :         "packssdw %%mm4,%%mm4           \n\t" /* fold both dwords into the low 32 bits; saturation keeps non-zero non-zero */\
     248             :         "movd %%mm4, %%eax              \n\t"\
     249             :         "orl %%eax, %%eax               \n\t" /* sets ZF when everything tested is zero */\
     250             :         "jz 1f                          \n\t" /* nothing but R0/r0 set: take the shortcut */\
     251             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     252             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     253             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     254             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     255             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     256             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     257             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     258             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     259             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     260             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     261             :         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 before it is split into A0/A3 */\
     262             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     263             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     264             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     265             :         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
     266             :         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     267             :         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 before it is split into A1/A2 */\
     268             :         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
     269             :         "paddd %%mm0, %%mm0             \n\t" /* doubled so that subtracting A1 below leaves A2 */\
     270             :         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
     271             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     272             :         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
     273             :         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
     274             :         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     275             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     276             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     277             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     278             :         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
     279             :         "psrad $" #shift ", %%mm7       \n\t"\
     280             :         "psrad $" #shift ", %%mm4       \n\t"\
     281             :         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
     282             :         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
     283             :         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     284             :         "psrad $" #shift ", %%mm1       \n\t"\
     285             :         "psrad $" #shift ", %%mm2       \n\t"\
     286             :         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
     287             :         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
     288             :         "movq %%mm7, " #dst "           \n\t"\
     289             :         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
     290             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     291             :         "movq %%mm2, 24+" #dst "        \n\t"\
     292             :         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     293             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     294             :         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     295             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     296             :         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
     297             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     298             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     299             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     300             :         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
     301             :         "psrad $" #shift ", %%mm2       \n\t"\
     302             :         "psrad $" #shift ", %%mm0       \n\t"\
     303             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     304             :         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
     305             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     306             :         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     307             :         "psrad $" #shift ", %%mm6       \n\t"\
     308             :         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
     309             :         "movq %%mm2, 8+" #dst "         \n\t"\
     310             :         "psrad $" #shift ", %%mm4       \n\t"\
     311             :         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
     312             :         "movq %%mm4, 16+" #dst "        \n\t"\
     313             :         "jmp 2f                         \n\t" /* full path done: skip the shortcut */\
     314             :         "1:                             \n\t"\
     315             :         "pslld $16, %%mm0               \n\t" /* low word of each dword (R0, r0) moved to the top, rest cleared */\
     316             :         "#paddd "MANGLE(d40000)", %%mm0 \n\t" /* NOTE(review): the leading '#' appears to turn this add into an assembler comment -- confirm */\
     317             :         "psrad $13, %%mm0               \n\t" /* <<16 then arithmetic >>13: each dword = its old low word * 8 */\
     318             :         "packssdw %%mm0, %%mm0          \n\t" /* the two dword results repeated into all four words */\
     319             :         "movq %%mm0, " #dst "           \n\t" /* the same quadword fills all four output slots */\
     320             :         "movq %%mm0, 8+" #dst "         \n\t"\
     321             :         "movq %%mm0, 16+" #dst "        \n\t"\
     322             :         "movq %%mm0, 24+" #dst "        \n\t"\
     323             :         "2:                             \n\t"
     324             : 
     325             : 
     326             : //IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
     327             : ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
     328             : /*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
     329             : ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
     330             : ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
     331             : 
     332             : DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
     333             : DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
     334             : DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
     335             : 
     336             : 
     337             : //IDCT(      src0,   src4,   src1,    src5,    dst, shift)
     338             : COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
     339             : COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
     340             : COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
     341             : COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
     342             : 
     343             : #else
     344             : 
     345             : #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
     346             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     347             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     348             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     349             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     350             :         "movq "MANGLE(wm1010)", %%mm4   \n\t"\
     351             :         "pand %%mm0, %%mm4              \n\t"\
     352             :         "por %%mm1, %%mm4               \n\t"\
     353             :         "por %%mm2, %%mm4               \n\t"\
     354             :         "por %%mm3, %%mm4               \n\t"\
     355             :         "packssdw %%mm4,%%mm4           \n\t"\
     356             :         "movd %%mm4, %%eax              \n\t"\
     357             :         "orl %%eax, %%eax               \n\t"\
     358             :         "jz 1f                          \n\t"\
     359             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     360             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     361             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     362             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     363             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     364             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     365             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     366             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     367             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     368             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     369             :         #rounder ", %%mm4               \n\t"\
     370             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     371             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     372             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     373             :         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
     374             :         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     375             :         #rounder ", %%mm0               \n\t"\
     376             :         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
     377             :         "paddd %%mm0, %%mm0             \n\t" \
     378             :         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
     379             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     380             :         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
     381             :         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
     382             :         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     383             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     384             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     385             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     386             :         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
     387             :         "psrad $" #shift ", %%mm7       \n\t"\
     388             :         "psrad $" #shift ", %%mm4       \n\t"\
     389             :         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
     390             :         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
     391             :         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     392             :         "psrad $" #shift ", %%mm1       \n\t"\
     393             :         "psrad $" #shift ", %%mm2       \n\t"\
     394             :         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
     395             :         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
     396             :         "movq %%mm7, " #dst "           \n\t"\
     397             :         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
     398             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     399             :         "movq %%mm2, 24+" #dst "        \n\t"\
     400             :         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     401             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     402             :         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     403             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     404             :         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
     405             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     406             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     407             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     408             :         "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
     409             :         "psrad $" #shift ", %%mm2       \n\t"\
     410             :         "psrad $" #shift ", %%mm0       \n\t"\
     411             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     412             :         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
     413             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     414             :         "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
     415             :         "psrad $" #shift ", %%mm6       \n\t"\
     416             :         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
     417             :         "movq %%mm2, 8+" #dst "         \n\t"\
     418             :         "psrad $" #shift ", %%mm4       \n\t"\
     419             :         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
     420             :         "movq %%mm4, 16+" #dst "        \n\t"\
     421             :         "jmp 2f                         \n\t"\
     422             :         "1:                             \n\t"\
     423             :         "pslld $16, %%mm0               \n\t"\
     424             :         "paddd "MANGLE(d40000)", %%mm0  \n\t"\
     425             :         "psrad $13, %%mm0               \n\t"\
     426             :         "packssdw %%mm0, %%mm0          \n\t"\
     427             :         "movq %%mm0, " #dst "           \n\t"\
     428             :         "movq %%mm0, 8+" #dst "         \n\t"\
     429             :         "movq %%mm0, 16+" #dst "        \n\t"\
     430             :         "movq %%mm0, 24+" #dst "        \n\t"\
     431             :         "2:                             \n\t"
     432             : 
     433             : #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) /* Row pass over the four quadwords src0/src4/src1/src5 with an all-zero shortcut: if every input word is zero, jump to bt and store nothing; otherwise store four quadwords at dst, 8+dst, 16+dst, 24+dst. rounder is instruction text applied to both C4 products, shift is the final psrad count. Lane comments list words most-significant first. */ \
     434             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     435             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     436             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     437             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     438             :         "movq %%mm0, %%mm4              \n\t" /* mm4 accumulates the OR of all four inputs */\
     439             :         "por %%mm1, %%mm4               \n\t"\
     440             :         "por %%mm2, %%mm4               \n\t"\
     441             :         "por %%mm3, %%mm4               \n\t"\
     442             :         "packssdw %%mm4,%%mm4           \n\t" /* saturating pack: a non-zero dword stays non-zero */\
     443             :         "movd %%mm4, %%eax              \n\t" /* low 32 bits into eax (eax is overwritten) */\
     444             :         "orl %%eax, %%eax               \n\t" /* ZF set only when all 16 input words were zero */\
     445             :         "jz " #bt "                     \n\t" /* all-zero group: skip the transform and the stores */\
     446             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     447             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     448             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     449             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     450             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     451             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     452             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     453             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     454             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     455             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     456             :         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 before A0/A3 are formed */\
     457             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     458             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     459             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     460             :         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
     461             :         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     462             :         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 before A1/A2 are formed */\
     463             :         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
     464             :         "paddd %%mm0, %%mm0             \n\t" /* doubled so that subtracting A1 leaves A2 without a spare register */ \
     465             :         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
     466             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     467             :         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
     468             :         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
     469             :         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     470             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     471             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     472             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     473             :         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
     474             :         "psrad $" #shift ", %%mm7       \n\t"\
     475             :         "psrad $" #shift ", %%mm4       \n\t"\
     476             :         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
     477             :         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
     478             :         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     479             :         "psrad $" #shift ", %%mm1       \n\t"\
     480             :         "psrad $" #shift ", %%mm2       \n\t"\
     481             :         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
     482             :         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
     483             :         "movq %%mm7, " #dst "           \n\t" /* first output quadword */\
     484             :         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
     485             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     486             :         "movq %%mm2, 24+" #dst "        \n\t" /* fourth output quadword */\
     487             :         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     488             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     489             :         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     490             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     491             :         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
     492             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     493             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     494             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     495             :         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
     496             :         "psrad $" #shift ", %%mm2       \n\t"\
     497             :         "psrad $" #shift ", %%mm0       \n\t"\
     498             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     499             :         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
     500             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     501             :         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     502             :         "psrad $" #shift ", %%mm6       \n\t"\
     503             :         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
     504             :         "movq %%mm2, 8+" #dst "         \n\t" /* second output quadword */\
     505             :         "psrad $" #shift ", %%mm4       \n\t"\
     506             :         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
     507             :         "movq %%mm4, 16+" #dst "        \n\t" /* third output quadword */\
     508             : 
     509             : #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) /* Unconditional row pass over the four quadwords src0/src4/src1/src5 (no zero test, no branch): stores four quadwords at dst, 8+dst, 16+dst, 24+dst. rounder is instruction text applied to both C4 products, shift is the final psrad count. Lane comments list words most-significant first. */ \
     510             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     511             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     512             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     513             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     514             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     515             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     516             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     517             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     518             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     519             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     520             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     521             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     522             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     523             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     524             :         #rounder ", %%mm4               \n\t" /* rounder applied to C4R4+C4R0 before A0/A3 are formed */\
     525             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     526             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     527             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     528             :         "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
     529             :         "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     530             :         #rounder ", %%mm0               \n\t" /* rounder applied to -C4R4+C4R0 before A1/A2 are formed */\
     531             :         "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
     532             :         "paddd %%mm0, %%mm0             \n\t" /* doubled so that subtracting A1 leaves A2 without a spare register */ \
     533             :         "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
     534             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     535             :         "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
     536             :         "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
     537             :         "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     538             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     539             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     540             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     541             :         "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
     542             :         "psrad $" #shift ", %%mm7       \n\t"\
     543             :         "psrad $" #shift ", %%mm4       \n\t"\
     544             :         "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
     545             :         "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
     546             :         "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     547             :         "psrad $" #shift ", %%mm1       \n\t"\
     548             :         "psrad $" #shift ", %%mm2       \n\t"\
     549             :         "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
     550             :         "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
     551             :         "movq %%mm7, " #dst "           \n\t" /* first output quadword */\
     552             :         "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
     553             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     554             :         "movq %%mm2, 24+" #dst "        \n\t" /* fourth output quadword */\
     555             :         "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     556             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     557             :         "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     558             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     559             :         "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
     560             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     561             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     562             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     563             :         "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
     564             :         "psrad $" #shift ", %%mm2       \n\t"\
     565             :         "psrad $" #shift ", %%mm0       \n\t"\
     566             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     567             :         "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
     568             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     569             :         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     570             :         "psrad $" #shift ", %%mm6       \n\t"\
     571             :         "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
     572             :         "movq %%mm2, 8+" #dst "         \n\t" /* second output quadword */\
     573             :         "psrad $" #shift ", %%mm4       \n\t"\
     574             :         "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
     575             :         "movq %%mm4, 16+" #dst "        \n\t" /* third output quadword */\
     576             : 
     577             : //xx_COND_IDCT( src0,   src4,   src1,   src5,    dst,   rounder, shift [, bt])  -- bt (Z_COND_IDCT only): label taken when the whole group is zero
     578             : DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11) /* bytes 0..31 of %0 -> 0(%1); the only call whose rounder comes from 8(%2) */
     579             : Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) /* bytes 32..63; all zero -> label 4 */
     580             : Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) /* bytes 64..95; all zero -> next label 2 */
     581             : Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) /* bytes 96..127; all zero -> next label 1 */
     582             : 
     583             : #undef IDCT
     584             : #define IDCT(src0, src4, src1, src5, dst, shift) /* Second-pass transform using all four inputs src0/src4/src1/src5. No rounder is added here. After psrad by shift, each result is packed to two words and written with a 32-bit movd at dst + 16*k: k=0..3 get A0+B0, A1+B1, A2+B2, A3+B3 and k=4..7 get A3-B3, A2-B2, A1-B1, A0-B0. Lane comments list words most-significant first. */ \
     585             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     586             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     587             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     588             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     589             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     590             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     591             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     592             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     593             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     594             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     595             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     596             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     597             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     598             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     599             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     600             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     601             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     602             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     603             :         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
     604             :         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
     605             :         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
     606             :         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     607             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     608             :         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
     609             :         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
     610             :         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     611             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     612             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     613             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     614             :         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
     615             :         "psrad $" #shift ", %%mm7       \n\t"\
     616             :         "psrad $" #shift ", %%mm4       \n\t"\
     617             :         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
     618             :         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
     619             :         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     620             :         "psrad $" #shift ", %%mm0       \n\t"\
     621             :         "psrad $" #shift ", %%mm2       \n\t"\
     622             :         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
     623             :         "movd %%mm7, " #dst "           \n\t" /* k=0 */\
     624             :         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
     625             :         "movd %%mm0, 16+" #dst "        \n\t" /* k=1 */\
     626             :         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
     627             :         "movd %%mm2, 96+" #dst "        \n\t" /* k=6 */\
     628             :         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
     629             :         "movd %%mm4, 112+" #dst "       \n\t" /* k=7 */\
     630             :         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
     631             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     632             :         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     633             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     634             :         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     635             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     636             :         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
     637             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     638             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     639             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     640             :         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
     641             :         "psrad $" #shift ", %%mm2       \n\t"\
     642             :         "psrad $" #shift ", %%mm5       \n\t"\
     643             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     644             :         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
     645             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     646             :         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     647             :         "psrad $" #shift ", %%mm6       \n\t"\
     648             :         "psrad $" #shift ", %%mm4       \n\t"\
     649             :         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
     650             :         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
     651             :         "movd %%mm2, 32+" #dst "        \n\t" /* k=2 */\
     652             :         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
     653             :         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
     654             :         "movd %%mm6, 48+" #dst "        \n\t" /* k=3 */\
     655             :         "movd %%mm4, 64+" #dst "        \n\t" /* k=4 */\
     656             :         "movd %%mm5, 80+" #dst "        \n\t" /* k=5 */
     657             : 
     658             : 
     659             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)   -- second pass: reads %1, writes %0
     660             : IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20) /* each call: sources advance 8 bytes, dst advances 4 bytes */
     661             : IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
     662             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
     663             : IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
     664             :         "jmp 9f                         \n\t" /* this variant is finished: jump forward to label 9 */
     665             : 
     666             :         "# .p2align 4                   \n\t"\
     667             :         "4:                             \n\t" /* reached via the jz of the 32(%0) group; nothing was stored at 32(%1) on this path */
     668             : Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) /* bytes 64..95; all zero -> label 6 */
     669             : Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) /* bytes 96..127; all zero -> next label 5 */
     670             : 
     671             : #undef IDCT
     672             : #define IDCT(src0, src4, src1, src5, dst, shift) /* Second-pass variant that never reads src1 (the parameter is accepted but unused), so B0..B3 come from src5 alone. It is invoked on the path after label 4. No rounder is added. Results are written with 32-bit movd at dst + 16*k: k=0..3 get A0+B0, A1+B1, A2+B2, A3+B3 and k=4..7 get A3-B3, A2-B2, A1-B1, A0-B0. */ \
     673             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     674             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     675             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     676             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     677             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     678             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     679             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     680             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     681             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     682             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     683             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     684             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     685             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     686             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     687             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     688             :         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
     689             :         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
     690             :         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
     691             :         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5  (this is B0 here) */\
     692             :         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
     693             :         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5  (this is B1 here) */\
     694             :         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
     695             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     696             :         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     697             :         "psrad $" #shift ", %%mm1       \n\t"\
     698             :         "psrad $" #shift ", %%mm4       \n\t"\
     699             :         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
     700             :         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
     701             :         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     702             :         "psrad $" #shift ", %%mm0       \n\t"\
     703             :         "psrad $" #shift ", %%mm2       \n\t"\
     704             :         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
     705             :         "movd %%mm1, " #dst "           \n\t" /* k=0 */\
     706             :         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
     707             :         "movd %%mm0, 16+" #dst "        \n\t" /* k=1 */\
     708             :         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
     709             :         "movd %%mm2, 96+" #dst "        \n\t" /* k=6 */\
     710             :         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
     711             :         "movd %%mm4, 112+" #dst "       \n\t" /* k=7 */\
     712             :         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
     713             :         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5  (this is B2 here) */\
     714             :         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
     715             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5  (this is B3 here) */\
     716             :         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     717             :         "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
     718             :         "psrad $" #shift ", %%mm2       \n\t"\
     719             :         "psrad $" #shift ", %%mm5       \n\t"\
     720             :         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
     721             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     722             :         "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
     723             :         "psrad $" #shift ", %%mm6       \n\t"\
     724             :         "psrad $" #shift ", %%mm1       \n\t"\
     725             :         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
     726             :         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
     727             :         "movd %%mm2, 32+" #dst "        \n\t" /* k=2 */\
     728             :         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
     729             :         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
     730             :         "movd %%mm6, 48+" #dst "        \n\t" /* k=3 */\
     731             :         "movd %%mm1, 64+" #dst "        \n\t" /* k=4 */\
     732             :         "movd %%mm5, 80+" #dst "        \n\t" /* k=5 */
     733             : 
     734             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)   -- src1 is passed but not read by the IDCT definition in effect here
     735             : IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20) /* each call: sources advance 8 bytes, dst advances 4 bytes */
     736             : IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
     737             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
     738             : IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
     739             :         "jmp 9f                         \n\t" /* this variant is finished: jump forward to label 9 */
     740             : 
     741             :         "# .p2align 4                   \n\t"\
     742             :         "6:                             \n\t" /* reached via the jz of the 64(%0) group tested after label 4 */
     743             : Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) /* bytes 96..127; all zero -> next label 7 */
     744             : 
     745             : #undef IDCT
     746             : #define IDCT(src0, src4, src1, src5, dst, shift) \
     747             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     748             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     749             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     750             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     751             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     752             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     753             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     754             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     755             :         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
     756             :         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     757             :         "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
     758             :         "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     759             :         "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
     760             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     761             :         "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     762             :         "psrad $" #shift ", %%mm1       \n\t"\
     763             :         "psrad $" #shift ", %%mm4       \n\t"\
     764             :         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
     765             :         "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
     766             :         "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     767             :         "psrad $" #shift ", %%mm0       \n\t"\
     768             :         "psrad $" #shift ", %%mm2       \n\t"\
     769             :         "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
     770             :         "movd %%mm1, " #dst "           \n\t"\
     771             :         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
     772             :         "movd %%mm0, 16+" #dst "        \n\t"\
     773             :         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
     774             :         "movd %%mm2, 96+" #dst "        \n\t"\
     775             :         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
     776             :         "movd %%mm4, 112+" #dst "       \n\t"\
     777             :         "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
     778             :         "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     779             :         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
     780             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     781             :         "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     782             :         "psubd %%mm1, %%mm5             \n\t" /* a2-B2          a2-b2 */\
     783             :         "psrad $" #shift ", %%mm2       \n\t"\
     784             :         "psrad $" #shift ", %%mm5       \n\t"\
     785             :         "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
     786             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     787             :         "psubd %%mm3, %%mm1             \n\t" /* a3-B3          a3-b3 */\
     788             :         "psrad $" #shift ", %%mm6       \n\t"\
     789             :         "psrad $" #shift ", %%mm1       \n\t"\
     790             :         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
     791             :         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
     792             :         "movd %%mm2, 32+" #dst "        \n\t"\
     793             :         "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
     794             :         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
     795             :         "movd %%mm6, 48+" #dst "        \n\t"\
     796             :         "movd %%mm1, 64+" #dst "        \n\t"\
     797             :         "movd %%mm5, 80+" #dst "        \n\t"
     798             : 
     799             : 
     800             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
     801             : IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
     802             : IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
     803             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
     804             : IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
     805             :         "jmp 9f                         \n\t"
     806             : 
     807             :         "# .p2align 4                   \n\t"\
     808             :         "2:                             \n\t"
     809             : Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
     810             : 
     811             : #undef IDCT
     812             : #define IDCT(src0, src4, src1, src5, dst, shift) /* variant: reads src0, src1, src5 (src4 never referenced); stores one dword (two int16) at dst+0,16,...,112 */ \
     813             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     814             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     815             :         "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
     816             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     817             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     818             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     819             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     820             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     821             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     822             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     823             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     824             :         "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
     825             :         "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
     826             :         "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     827             :         "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
     828             :         "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
     829             :         "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
     830             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     831             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     832             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     833             :         "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
     834             :         "psrad $" #shift ", %%mm7       \n\t"\
     835             :         "psrad $" #shift ", %%mm4       \n\t"\
     836             :         "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
     837             :         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
     838             :         "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
     839             :         "psrad $" #shift ", %%mm0       \n\t"\
     840             :         "psrad $" #shift ", %%mm2       \n\t"\
     841             :         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
     842             :         "movd %%mm7, " #dst "           \n\t"\
     843             :         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
     844             :         "movd %%mm0, 16+" #dst "        \n\t"\
     845             :         "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
     846             :         "movd %%mm2, 96+" #dst "        \n\t"\
     847             :         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
     848             :         "movd %%mm4, 112+" #dst "       \n\t"\
     849             :         "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
     850             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     851             :         "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     852             :         "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
     853             :         "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     854             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
     855             :         "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
     856             :         "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
     857             :         "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
     858             :         "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
     859             :         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
     860             :         "psrad $" #shift ", %%mm2       \n\t"\
     861             :         "psrad $" #shift ", %%mm5       \n\t"\
     862             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     863             :         "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
     864             :         "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     865             :         "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     866             :         "psrad $" #shift ", %%mm6       \n\t"\
     867             :         "psrad $" #shift ", %%mm4       \n\t"\
     868             :         "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
     869             :         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
     870             :         "movd %%mm2, 32+" #dst "        \n\t"\
     871             :         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
     872             :         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
     873             :         "movd %%mm6, 48+" #dst "        \n\t"\
     874             :         "movd %%mm4, 64+" #dst "        \n\t"\
     875             :         "movd %%mm5, 80+" #dst "        \n\t"
     876             : 
     877             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
     878             : IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
     879             : IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
     880             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
     881             : IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
     882             :         "jmp 9f                         \n\t"
     883             : 
     884             :         "# .p2align 4                   \n\t"\
     885             :         "3:                             \n\t"
     886             : #undef IDCT
     887             : #define IDCT(src0, src4, src1, src5, dst, shift) /* variant: reads src0 and src1 only (src4, src5 never referenced); stores one dword (two int16) at dst+0,16,...,112 */ \
     888             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     889             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
     890             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     891             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     892             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     893             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     894             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     895             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
     896             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
     897             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     898             :         "movq 64(%2), %%mm3             \n\t" /* -C7    C3      -C7     C3 (per the product comment on the next line) */\
     899             :         "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
     900             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
     901             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
     902             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
     903             :         "psrad $" #shift ", %%mm7       \n\t"\
     904             :         "psrad $" #shift ", %%mm4       \n\t"\
     905             :         "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
     906             :         "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
     907             :         "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
     908             :         "psrad $" #shift ", %%mm0       \n\t"\
     909             :         "psrad $" #shift ", %%mm1       \n\t"\
     910             :         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
     911             :         "movd %%mm7, " #dst "           \n\t"\
     912             :         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
     913             :         "movd %%mm0, 16+" #dst "        \n\t"\
     914             :         "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
     915             :         "movd %%mm1, 96+" #dst "        \n\t"\
     916             :         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
     917             :         "movd %%mm4, 112+" #dst "       \n\t"\
     918             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
     919             :         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
     920             :         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
     921             :         "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
     922             :         "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
     923             :         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
     924             :         "psrad $" #shift ", %%mm1       \n\t"\
     925             :         "psrad $" #shift ", %%mm5       \n\t"\
     926             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
     927             :         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
     928             :         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
     929             :         "psrad $" #shift ", %%mm6       \n\t"\
     930             :         "psrad $" #shift ", %%mm4       \n\t"\
     931             :         "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
     932             :         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
     933             :         "movd %%mm1, 32+" #dst "        \n\t"\
     934             :         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
     935             :         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
     936             :         "movd %%mm6, 48+" #dst "        \n\t"\
     937             :         "movd %%mm4, 64+" #dst "        \n\t"\
     938             :         "movd %%mm5, 80+" #dst "        \n\t"
     939             : 
     940             : 
     941             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
     942             : IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
     943             : IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
     944             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
     945             : IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
     946             :         "jmp 9f                         \n\t"
     947             : 
     948             :         "# .p2align 4                   \n\t"\
     949             :         "5:                             \n\t"
     950             : #undef IDCT
     951             : #define IDCT(src0, src4, src1, src5, dst, shift) /* variant: reads src0, src4 and the quadwords 8 bytes after each (src1, src5 never referenced); stores quadwords, each result at two offsets (0/112, 16/96, 32/80, 48/64) */ \
     952             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
     953             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
     954             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
     955             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     956             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
     957             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     958             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
     959             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     960             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
     961             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     962             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     963             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
     964             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
     965             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     966             :         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
     967             :         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
     968             :         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
     969             :         "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2 */\
     970             :         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
     971             :         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
     972             :         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
     973             :         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
     974             :         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
     975             :         "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
     976             :         "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
     977             :         "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
     978             :         "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
     979             :         "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
     980             :         "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
     981             :         "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
     982             :         "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
     983             :         "psrad $" #shift ", %%mm4       \n\t"\
     984             :         "psrad $" #shift ", %%mm7       \n\t"\
     985             :         "psrad $" #shift ", %%mm3       \n\t"\
     986             :         "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
     987             :         "movq %%mm4, " #dst "           \n\t"\
     988             :         "psrad $" #shift ", %%mm0       \n\t"\
     989             :         "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
     990             :         "movq %%mm0, 16+" #dst "        \n\t"\
     991             :         "movq %%mm0, 96+" #dst "        \n\t"\
     992             :         "movq %%mm4, 112+" #dst "       \n\t"\
     993             :         "psrad $" #shift ", %%mm5       \n\t"\
     994             :         "psrad $" #shift ", %%mm6       \n\t"\
     995             :         "psrad $" #shift ", %%mm2       \n\t"\
     996             :         "packssdw %%mm2, %%mm5          \n\t" /* A2     a2 (no B terms are computed in this variant) */\
     997             :         "movq %%mm5, 32+" #dst "        \n\t"\
     998             :         "psrad $" #shift ", %%mm1       \n\t"\
     999             :         "packssdw %%mm1, %%mm6          \n\t" /* A3     a3 (no B terms are computed in this variant) */\
    1000             :         "movq %%mm6, 48+" #dst "        \n\t"\
    1001             :         "movq %%mm6, 64+" #dst "        \n\t"\
    1002             :         "movq %%mm5, 80+" #dst "        \n\t"
    1003             : 
    1004             : 
    1005             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
    1006             : IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
    1007             : //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
    1008             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
    1009             : //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    1010             :         "jmp 9f                         \n\t"
    1011             : 
    1012             : 
    1013             :         "# .p2align 4                   \n\t"\
    1014             :         "1:                             \n\t"
    1015             : #undef IDCT
    1016             : #define IDCT(src0, src4, src1, src5, dst, shift) /* variant: reads src0, src4, src1 (src5 never referenced); stores one dword (two int16) at dst+0,16,...,112 */ \
    1017             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
    1018             :         "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
    1019             :         "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
    1020             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
    1021             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
    1022             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
    1023             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
    1024             :         "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
    1025             :         "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
    1026             :         "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
    1027             :         "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
    1028             :         "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
    1029             :         "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
    1030             :         "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
    1031             :         "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
    1032             :         "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
    1033             :         "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
    1034             :         "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
    1035             :         "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
    1036             :         "movq 64(%2), %%mm1             \n\t" /* -C7    C3      -C7     C3 (per the product comment on the next line) */\
    1037             :         "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
    1038             :         "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
    1039             :         "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
    1040             :         "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
    1041             :         "psrad $" #shift ", %%mm7       \n\t"\
    1042             :         "psrad $" #shift ", %%mm4       \n\t"\
    1043             :         "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
    1044             :         "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
    1045             :         "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
    1046             :         "psrad $" #shift ", %%mm0       \n\t"\
    1047             :         "psrad $" #shift ", %%mm3       \n\t"\
    1048             :         "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
    1049             :         "movd %%mm7, " #dst "           \n\t"\
    1050             :         "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
    1051             :         "movd %%mm0, 16+" #dst "        \n\t"\
    1052             :         "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
    1053             :         "movd %%mm3, 96+" #dst "        \n\t"\
    1054             :         "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
    1055             :         "movd %%mm4, 112+" #dst "       \n\t"\
    1056             :         "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
    1057             :         "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
    1058             :         "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
    1059             :         "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
    1060             :         "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
    1061             :         "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
    1062             :         "psrad $" #shift ", %%mm3       \n\t"\
    1063             :         "psrad $" #shift ", %%mm5       \n\t"\
    1064             :         "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
    1065             :         "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
    1066             :         "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
    1067             :         "psrad $" #shift ", %%mm6       \n\t"\
    1068             :         "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
    1069             :         "movd %%mm3, 32+" #dst "        \n\t"\
    1070             :         "psrad $" #shift ", %%mm4       \n\t"\
    1071             :         "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
    1072             :         "movd %%mm6, 48+" #dst "        \n\t"\
    1073             :         "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
    1074             :         "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
    1075             :         "movd %%mm4, 64+" #dst "        \n\t"\
    1076             :         "movd %%mm5, 80+" #dst "        \n\t"
    1077             : 
    1078             : 
    1079             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
    1080             : IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
    1081             : IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
    1082             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
    1083             : IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    1084             :         "jmp 9f                         \n\t"
    1085             : 
    1086             : 
    1087             :         "# .p2align 4                   \n\t"
    1088             :         "7:                             \n\t"
    1089             : #undef IDCT
    1090             : #define IDCT(src0, src4, src1, src5, dst, shift) /* variant: reads only src0 and 8+src0 (src4, src1, src5 never referenced); stores quadwords, mm4 at dst+0,48,64,112 and mm0 at dst+16,32,80,96 */ \
    1091             :         "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
    1092             :         "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
    1093             :         "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
    1094             :         "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
    1095             :         "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
    1096             :         "psrad $" #shift ", %%mm4       \n\t"\
    1097             :         "psrad $" #shift ", %%mm0       \n\t"\
    1098             :         "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
    1099             :         "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
    1100             :         "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
    1101             :         "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
    1102             :         "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
    1103             :         "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2; NOTE(review): mm7 is not read again in this macro */\
    1104             :         "psrad $" #shift ", %%mm1       \n\t"\
    1105             :         "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
    1106             :         "movq %%mm4, " #dst "           \n\t"\
    1107             :         "psrad $" #shift ", %%mm2       \n\t"\
    1108             :         "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
    1109             :         "movq %%mm0, 16+" #dst "        \n\t"\
    1110             :         "movq %%mm0, 96+" #dst "        \n\t"\
    1111             :         "movq %%mm4, 112+" #dst "       \n\t"\
    1112             :         "movq %%mm0, 32+" #dst "        \n\t"\
    1113             :         "movq %%mm4, 48+" #dst "        \n\t"\
    1114             :         "movq %%mm4, 64+" #dst "        \n\t"\
    1115             :         "movq %%mm0, 80+" #dst "        \n\t"
    1116             : 
    1117             : //IDCT(  src0,   src4,   src1,    src5,    dst, shift)
    1118             : IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
    1119             : //IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
    1120             : IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
    1121             : //IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
    1122             : 
    1123             : 
    1124             : #endif
    1125             : 
    1126             : /*
    1127             : Input
    1128             :  00 40 04 44 20 60 24 64
    1129             :  10 30 14 34 50 70 54 74
    1130             :  01 41 03 43 21 61 23 63
    1131             :  11 31 13 33 51 71 53 73
    1132             :  02 42 06 46 22 62 26 66
    1133             :  12 32 16 36 52 72 56 76
    1134             :  05 45 07 47 25 65 27 67
    1135             :  15 35 17 37 55 75 57 77
    1136             : 
    1137             : Temp
    1138             :  00 04 10 14 20 24 30 34
    1139             :  40 44 50 54 60 64 70 74
    1140             :  01 03 11 13 21 23 31 33
    1141             :  41 43 51 53 61 63 71 73
    1142             :  02 06 12 16 22 26 32 36
    1143             :  42 46 52 56 62 66 72 76
    1144             :  05 07 15 17 25 27 35 37
    1145             :  45 47 55 57 65 67 75 77
    1146             : */
    1147             : 
    1148             : "9: \n\t"
    1149             :                 :: "r" (block), "r" (temp), "r" (coeffs)
    1150             :                    NAMED_CONSTRAINTS_ADD(wm1010,d40000)
    1151             :                 : "%eax"
    1152             :         );
    1153      949565 : }
    1154             : 
    1155       20000 : void ff_simple_idct_mmx(int16_t *block) /* Runs the inline-asm idct() from this file on block; nothing else is done here. */
    1156             : {
    1157       20000 :     idct(block); /* the asm's last pass stores through operand %0, which is bound to block */
    1158       20000 : }
    1159             : 
    1160             : //FIXME merge add/put into the idct
    1161             : 
    1162      708505 : void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block) /* Runs idct() on block, then passes block, dest and line_size to ff_put_pixels_clamped(). */
    1163             : {
    1164      708505 :     idct(block); /* runs first: the next call is handed the same block */
    1165      708505 :     ff_put_pixels_clamped(block, dest, line_size); /* NOTE(review): presumably stores the clamped block into dest with stride line_size -- confirm in idctdsp */
    1166      708505 : }
    1167      221060 : void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block) /* Runs idct() on block, then passes block, dest and line_size to ff_add_pixels_clamped(). */
    1168             : {
    1169      221060 :     idct(block); /* runs first: the next call is handed the same block */
    1170      221060 :     ff_add_pixels_clamped(block, dest, line_size); /* NOTE(review): presumably adds the block to the pixels already in dest and clamps -- confirm in idctdsp */
    1171      221060 : }
    1172             : 
    1173             : #endif /* HAVE_INLINE_ASM */

Generated by: LCOV version 1.12