LCOV - code coverage report
Current view: top level - src/libswscale/x86 - swscale_template.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 10 243 4.1 %
Date: 2017-03-16 17:17:40 Functions: 2 47 4.3 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
       3             :  *
       4             :  * This file is part of FFmpeg.
       5             :  *
       6             :  * FFmpeg is free software; you can redistribute it and/or
       7             :  * modify it under the terms of the GNU Lesser General Public
       8             :  * License as published by the Free Software Foundation; either
       9             :  * version 2.1 of the License, or (at your option) any later version.
      10             :  *
      11             :  * FFmpeg is distributed in the hope that it will be useful,
      12             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      13             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
      14             :  * Lesser General Public License for more details.
      15             :  *
      16             :  * You should have received a copy of the GNU Lesser General Public
      17             :  * License along with FFmpeg; if not, write to the Free Software
      18             :  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
      19             :  */
      20             : 
      21             : #include <stdint.h>
      22             : 
      23             : #include "libavutil/x86/asm.h"
      24             : #include "libswscale/swscale_internal.h"
      25             : 
      26             : #undef REAL_MOVNTQ /* clear any earlier definitions before they are redefined below (presumably this template is included more than once - confirm) */
      27             : #undef MOVNTQ
      28             : #undef MOVNTQ2
      29             : #undef PREFETCH
      30             : 
      31             : 
      32             : #if COMPILE_TEMPLATE_MMXEXT /* MMXEXT variant: the 8-byte store macros emit movntq */
      33             : #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" /* complete instruction: stringified source a, destination b */
      34             : #define MOVNTQ2 "movntq " /* mnemonic only; the user appends the operand string (see yuv2yuvX) */
      35             : #else /* otherwise the same macros emit a plain movq */
      36             : #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
      37             : #define MOVNTQ2 "movq "
      38             : #endif
      39             : #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b) /* extra macro level: a and b are fully macro-expanded before REAL_MOVNTQ stringifies them */
      40             : 
      41             : #if !COMPILE_TEMPLATE_MMXEXT /* helper is only compiled in the non-MMXEXT instantiation of this template */
      42             : static av_always_inline void /* returns nothing in C terms: the result is left in MMX registers mm3 and mm4 */
      43           0 : dither_8to16(const uint8_t *srcDither, int rot) /* srcDither: 8 bytes are read; rot: non-zero selects the rotated variant */
      44             : { /* Loads 8 dither bytes and zero-extends them to 16-bit words: bytes 0-3 -> mm3, bytes 4-7 -> mm4 (mm0 is zeroed as a side effect). The asm declares no outputs or clobbers; yuv2yuvX reads mm3/mm4 in its following asm statements. */
      45           0 :     if (rot) {
      46           0 :         __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* mm0 = 0, the zero source for the unpacks */
      47             :                          "movq       (%0), %%mm3\n\t" /* mm3 = the 8 dither bytes */
      48             :                          "movq      %%mm3, %%mm4\n\t"
      49             :                          "psrlq       $24, %%mm3\n\t" /* the shift pair plus por rotates the quadword right by 24 bits: */
      50             :                          "psllq       $40, %%mm4\n\t" /* result byte i = source byte (i + 3) mod 8 */
      51             :                          "por       %%mm4, %%mm3\n\t"
      52             :                          "movq      %%mm3, %%mm4\n\t"
      53             :                          "punpcklbw %%mm0, %%mm3\n\t" /* low four bytes -> four words in mm3 */
      54             :                          "punpckhbw %%mm0, %%mm4\n\t" /* high four bytes -> four words in mm4 */
      55             :                          :: "r"(srcDither)
      56             :                          );
      57             :     } else {
      58           0 :         __asm__ volatile("pxor      %%mm0, %%mm0\n\t" /* same as above without the rotation */
      59             :                          "movq       (%0), %%mm3\n\t"
      60             :                          "movq      %%mm3, %%mm4\n\t"
      61             :                          "punpcklbw %%mm0, %%mm3\n\t"
      62             :                          "punpckhbw %%mm0, %%mm4\n\t"
      63             :                          :: "r"(srcDither)
      64             :                          );
      65             :     }
      66           0 : }
      67             : #endif
      68             : 
      69           0 : static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, /* filter is walked below as 16-byte entries: source pointer at +0, coefficient quadword at +8, ended by a null source pointer */
      70             :                            const int16_t **src, uint8_t *dest, int dstW, /* src is not referenced in the body; the source pointers are read out of filter */
      71             :                            const uint8_t *dither, int offset) /* offset doubles as the rot flag of dither_8to16 and as the start index of the loop */
      72             : { /* Vertical filter to 8-bit output: for each group of 8 samples, starts from a dither-derived bias, adds pmulhw(coefficient, sample) for every filter entry, shifts right by 3 and stores 8 saturated unsigned bytes. The index runs from offset up to dstW+offset in steps of 8 and addresses dest-offset. */
      73           0 :     dither_8to16(dither, offset); /* leaves the dither words in mm3/mm4 for the asm below */
      74           0 :     filterSize--;
      75           0 :     __asm__ volatile( /* bias = (dither + 8 * (filterSize - 1)) >> 4, per 16-bit lane */
      76             :         "movd %0, %%mm1\n\t"
      77             :         "punpcklwd %%mm1, %%mm1\n\t" /* these two unpacks broadcast the low word */
      78             :         "punpckldq %%mm1, %%mm1\n\t" /* of the decremented filterSize to all four lanes */
      79             :         "psllw        $3, %%mm1\n\t"
      80             :         "paddw     %%mm1, %%mm3\n\t"
      81             :         "paddw     %%mm1, %%mm4\n\t"
      82             :         "psraw        $4, %%mm3\n\t"
      83             :         "psraw        $4, %%mm4\n\t"
      84             :         ::"m"(filterSize)
      85             :      );
      86             : 
      87           0 :     __asm__ volatile( /* main loop; label 1 serves both the per-entry loop (jnz) and the per-8-samples loop (jb) */ \
      88             :         "movq    %%mm3, %%mm6\n\t" /* keep the bias in mm6/mm7 so the accumulators can be reset for each group */
      89             :         "movq    %%mm4, %%mm7\n\t"
      90             :         "movl %3, %%ecx\n\t" /* index = offset */
      91             :         "mov                                 %0, %%"FF_REG_d"       \n\t" /* REG_d = filter */ \
      92             :         "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t" /* REG_S = first source pointer */ \
      93             :         ".p2align                             4                     \n\t" /* FIXME Unroll? */\
      94             :         "1:                                                         \n\t"\
      95             :         "movq                      8(%%"FF_REG_d"), %%mm0           \n\t" /* filterCoeff */\
      96             :         "movq                (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\
      97             :         "movq               8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\
      98             :         "add                                $16, %%"FF_REG_d"       \n\t" /* advance to the next filter entry */ \
      99             :         "mov                        (%%"FF_REG_d"), %%"FF_REG_S"    \n\t" /* its source pointer */ \
     100             :         "test                         %%"FF_REG_S", %%"FF_REG_S"    \n\t" /* null pointer ends the list; the MMX instructions below do not modify the flags */ \
     101             :         "pmulhw                           %%mm0, %%mm2      \n\t"\
     102             :         "pmulhw                           %%mm0, %%mm5      \n\t"\
     103             :         "paddw                            %%mm2, %%mm3      \n\t"\
     104             :         "paddw                            %%mm5, %%mm4      \n\t"\
     105             :         " jnz                                1b             \n\t"\
     106             :         "psraw                               $3, %%mm3      \n\t"\
     107             :         "psraw                               $3, %%mm4      \n\t"\
     108             :         "packuswb                         %%mm4, %%mm3      \n\t" /* eight words -> eight unsigned-saturated bytes */
     109             :         MOVNTQ2 "                         %%mm3, (%1, %%"FF_REG_c")\n\t" /* store at (dest - offset) + index */
     110             :         "add                          $8, %%"FF_REG_c"      \n\t"\
     111             :         "cmp                          %2, %%"FF_REG_c"      \n\t" /* compare the index with dstW + offset; the movs below leave the flags intact for jb */ \
     112             :         "movq    %%mm6, %%mm3\n\t" /* reset the accumulators to the bias */
     113             :         "movq    %%mm7, %%mm4\n\t"
     114             :         "mov                                 %0, %%"FF_REG_d"     \n\t" /* rewind to the first filter entry */ \
     115             :         "mov                        (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\
     116             :         "jb                                  1b                   \n\t"\
     117             :         :: "g" (filter), /* %0 */
     118           0 :            "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) /* %1, %2, %3 */
     119             :         : "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c /* NOTE(review): neither "memory" nor the MMX registers are declared although the asm stores through %1 and depends on mm3/mm4 surviving between the asm statements - confirm this is intended */
     120             :     );
     121           0 : }
     122             : 
     123             : #define YSCALEYUV2PACKEDX_UV /* opens an asm statement and emits the chroma half of the 16-bit vertical filter; leaves the U sums in mm3 and the V sums in mm4 */ \
     124             :     __asm__ volatile( /* left open: operands and clobbers are supplied by YSCALEYUV2PACKEDX_END */ \
     125             :         "xor                %%"FF_REG_a", %%"FF_REG_a"  \n\t" /* REG_a = 0: running index, advanced by the WRITE* store macros */ \
     126             :         ".p2align                      4                \n\t"\
     127             :         "nop                                            \n\t"\
     128             :         "1:                                             \n\t" /* outer loop head; the store macros branch back here with "jb 1b" */ \
     129             :         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t" /* REG_d = chroma filter list, addressed relative to %0 */ \
     130             :         "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t" /* REG_S = first source pointer */ \
     131             :         "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t" /* both accumulators start from the value at VROUNDER_OFFSET */ \
     132             :         "movq                      %%mm3, %%mm4         \n\t"\
     133             :         ".p2align                      4                \n\t"\
     134             :         "2:                                             \n\t" /* inner loop: one 16-byte filter entry per pass */ \
     135             :         "movq            8(%%"FF_REG_d"), %%mm0         \n\t" /* filterCoeff */\
     136             :         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" /* UsrcData */\
     137             :         "add                          %6, %%"FF_REG_S"  \n\t" /* %6 (uv_off) steps from the U samples to the V samples */ \
     138             :         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm5      \n\t" /* VsrcData */\
     139             :         "add                         $16, %%"FF_REG_d"  \n\t" /* next filter entry */ \
     140             :         "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t" /* its source pointer; null ends the list */ \
     141             :         "pmulhw                    %%mm0, %%mm2         \n\t"\
     142             :         "pmulhw                    %%mm0, %%mm5         \n\t"\
     143             :         "paddw                     %%mm2, %%mm3         \n\t"\
     144             :         "paddw                     %%mm5, %%mm4         \n\t"\
     145             :         "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
     146             :         " jnz                         2b                \n\t"\
     147             : 
     148             : #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) /* 16-bit vertical filter over the list at offset(%0) (used for luma and alpha); eight samples per pass, sums left in dst1 (samples 0-3) and dst2 (samples 4-7); coeff, src1 and src2 are scratch MMX registers */ \
     149             :     "lea                "offset"(%0), %%"FF_REG_d"  \n\t" /* REG_d = filter list */ \
     150             :     "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t" /* REG_S = first source pointer */ \
     151             :     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t" /* both accumulators start from the value at VROUNDER_OFFSET */ \
     152             :     "movq                    "#dst1", "#dst2"       \n\t"\
     153             :     ".p2align                      4                \n\t"\
     154             :     "2:                                             \n\t" /* one 16-byte filter entry per pass */ \
     155             :     "movq            8(%%"FF_REG_d"), "#coeff"      \n\t" /* filterCoeff */\
     156             :     "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
     157             :     "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
     158             :     "add                         $16, %%"FF_REG_d"  \n\t" /* next filter entry */ \
     159             :     "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t" /* its source pointer; null ends the list */ \
     160             :     "pmulhw                 "#coeff", "#src1"       \n\t"\
     161             :     "pmulhw                 "#coeff", "#src2"       \n\t"\
     162             :     "paddw                   "#src1", "#dst1"       \n\t"\
     163             :     "paddw                   "#src2", "#dst2"       \n\t"\
     164             :     "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\
     165             :     " jnz                         2b                \n\t"\
     166             : 
     167             : #define YSCALEYUV2PACKEDX /* chroma pass followed by the luma pass of the 16-bit filter; afterwards mm3 = U, mm4 = V, mm1 = Y samples 0-3, mm7 = Y samples 4-7 */ \
     168             :     YSCALEYUV2PACKEDX_UV \
     169             :     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
     170             : 
     171             : #define YSCALEYUV2PACKEDX_END /* closes the asm statement opened by the *_UV macros; expects c, dummy, dest, dstW_reg and uv_off to be in scope at the point of use */                     \
     172             :         :: "r" (&c->redDither), /* %0: base address for every NAME_OFFSET(%0) access */                   \
     173             :             "m" (dummy), "m" (dummy), "m" (dummy), /* %1-%3: placeholders that keep the numbering of the operands below */ \
     174             :             "r" (dest), "m" (dstW_reg), "m"(uv_off) /* %4 = output pointer, %5 = loop bound, %6 = U-to-V offset */ \
     175             :             NAMED_CONSTRAINTS_ADD(bF8,bFC) /* bF8/bFC are the constants REAL_WRITERGB16 references through MANGLE() */ \
     176             :         : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S /* NOTE(review): neither "memory" nor the MMX registers are listed although the code stores through %4 (and, in the ACCURATE variants, through %0) - confirm this is intended */            \
     177             :     );
     178             : 
     179             : #define YSCALEYUV2PACKEDX_ACCURATE_UV /* opens an asm statement; chroma half of the vertical filter with 32-bit accumulation (pmaddwd), two source lines per inner pass; stores the U sums at U_TEMP(%0) and the V sums at V_TEMP(%0) */ \
     180             :     __asm__ volatile( /* left open: operands and clobbers are supplied by YSCALEYUV2PACKEDX_END */ \
     181             :         "xor %%"FF_REG_a", %%"FF_REG_a"                 \n\t" /* REG_a = 0: running index, advanced by the WRITE* store macros */ \
     182             :         ".p2align                      4                \n\t"\
     183             :         "nop                                            \n\t"\
     184             :         "1:                                             \n\t" /* outer loop head; the store macros branch back here with "jb 1b" */ \
     185             :         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t" /* REG_d = chroma filter list */ \
     186             :         "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t" /* REG_S = first source pointer of the first entry */ \
     187             :         "pxor                      %%mm4, %%mm4         \n\t" /* mm4/mm5 = 32-bit U accumulators (samples 0-1 / 2-3) */ \
     188             :         "pxor                      %%mm5, %%mm5         \n\t"\
     189             :         "pxor                      %%mm6, %%mm6         \n\t" /* mm6/mm7 = 32-bit V accumulators (samples 0-1 / 2-3) */ \
     190             :         "pxor                      %%mm7, %%mm7         \n\t"\
     191             :         ".p2align                      4                \n\t"\
     192             :         "2:                                             \n\t" /* one filter entry (two source lines) per pass */ \
     193             :         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm0      \n\t" /* UsrcData */\
     194             :         "add                          %6, %%"FF_REG_S"  \n\t" /* %6 (uv_off) steps from U to V */ \
     195             :         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" /* VsrcData */\
     196             :         "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t" /* REG_S = second source pointer of this entry */ \
     197             :         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm1      \n\t" /* UsrcData */\
     198             :         "movq                      %%mm0, %%mm3         \n\t"\
     199             :         "punpcklwd                 %%mm1, %%mm0         \n\t" /* interleave the samples of both lines so that */ \
     200             :         "punpckhwd                 %%mm1, %%mm3         \n\t" /* pmaddwd can multiply each pair and add the two products */ \
     201             :         "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1      \n\t" /* filterCoeff */\
     202             :         "pmaddwd                   %%mm1, %%mm0         \n\t"\
     203             :         "pmaddwd                   %%mm1, %%mm3         \n\t"\
     204             :         "paddd                     %%mm0, %%mm4         \n\t"\
     205             :         "paddd                     %%mm3, %%mm5         \n\t"\
     206             :         "add                          %6, %%"FF_REG_S"  \n\t" \
     207             :         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm3      \n\t" /* VsrcData */\
     208             :         "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t" /* REG_S = first source pointer of the next entry */ \
     209             :         "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t" /* advance to the next entry */ \
     210             :         "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t" /* null pointer ends the list; the MMX instructions before jnz do not modify the flags */ \
     211             :         "movq                      %%mm2, %%mm0         \n\t"\
     212             :         "punpcklwd                 %%mm3, %%mm2         \n\t"\
     213             :         "punpckhwd                 %%mm3, %%mm0         \n\t"\
     214             :         "pmaddwd                   %%mm1, %%mm2         \n\t"\
     215             :         "pmaddwd                   %%mm1, %%mm0         \n\t"\
     216             :         "paddd                     %%mm2, %%mm6         \n\t"\
     217             :         "paddd                     %%mm0, %%mm7         \n\t"\
     218             :         " jnz                         2b                \n\t"\
     219             :         "psrad                       $16, %%mm4         \n\t" /* scale the 32-bit sums down before packing */ \
     220             :         "psrad                       $16, %%mm5         \n\t"\
     221             :         "psrad                       $16, %%mm6         \n\t"\
     222             :         "psrad                       $16, %%mm7         \n\t"\
     223             :         "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
     224             :         "packssdw                  %%mm5, %%mm4         \n\t" /* mm4 = four U words */ \
     225             :         "packssdw                  %%mm7, %%mm6         \n\t" /* mm6 = four V words */ \
     226             :         "paddw                     %%mm0, %%mm4         \n\t" /* add the value loaded from VROUNDER_OFFSET */ \
     227             :         "paddw                     %%mm0, %%mm6         \n\t"\
     228             :         "movq                      %%mm4, "U_TEMP"(%0)  \n\t" /* spill U and V; YSCALEYUV2PACKEDX_ACCURATE_YA reloads them at its end */ \
     229             :         "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
     230             : 
     231             : #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) /* 32-bit-accumulating vertical filter over the list at offset(%0) (used for luma and alpha), two source lines per pass; leaves samples 0-3 in mm1 and samples 4-7 in mm7, then reloads mm3/mm4 from U_TEMP/V_TEMP; overwrites mm0-mm7 */ \
     232             :     "lea                "offset"(%0), %%"FF_REG_d"      \n\t" /* REG_d = filter list */ \
     233             :     "mov                 (%%"FF_REG_d"), %%"FF_REG_S"   \n\t" /* REG_S = first source pointer of the first entry */ \
     234             :     "pxor                      %%mm1, %%mm1         \n\t" /* mm1/mm5 = 32-bit accumulators for samples 0-1 / 2-3 */ \
     235             :     "pxor                      %%mm5, %%mm5         \n\t"\
     236             :     "pxor                      %%mm7, %%mm7         \n\t" /* mm7/mm6 = 32-bit accumulators for samples 4-5 / 6-7 */ \
     237             :     "pxor                      %%mm6, %%mm6         \n\t"\
     238             :     ".p2align                      4                \n\t"\
     239             :     "2:                                             \n\t" /* one filter entry (two source lines) per pass */ \
     240             :     "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0       \n\t" /* Y1srcData */\
     241             :     "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2       \n\t" /* Y2srcData */\
     242             :     "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S"   \n\t" /* REG_S = second source pointer of this entry */ \
     243             :     "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4       \n\t" /* Y1srcData */\
     244             :     "movq                      %%mm0, %%mm3         \n\t"\
     245             :     "punpcklwd                 %%mm4, %%mm0         \n\t" /* interleave the samples of both lines so that */ \
     246             :     "punpckhwd                 %%mm4, %%mm3         \n\t" /* pmaddwd can multiply each pair and add the two products */ \
     247             :     "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4     \n\t" /* filterCoeff */\
     248             :     "pmaddwd                   %%mm4, %%mm0         \n\t"\
     249             :     "pmaddwd                   %%mm4, %%mm3         \n\t"\
     250             :     "paddd                     %%mm0, %%mm1         \n\t"\
     251             :     "paddd                     %%mm3, %%mm5         \n\t"\
     252             :     "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3   \n\t" /* Y2srcData */\
     253             :     "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t" /* REG_S = first source pointer of the next entry */ \
     254             :     "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t" /* advance to the next entry */ \
     255             :     "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t" /* null pointer ends the list; the MMX instructions before jnz do not modify the flags */ \
     256             :     "movq                      %%mm2, %%mm0         \n\t"\
     257             :     "punpcklwd                 %%mm3, %%mm2         \n\t"\
     258             :     "punpckhwd                 %%mm3, %%mm0         \n\t"\
     259             :     "pmaddwd                   %%mm4, %%mm2         \n\t"\
     260             :     "pmaddwd                   %%mm4, %%mm0         \n\t"\
     261             :     "paddd                     %%mm2, %%mm7         \n\t"\
     262             :     "paddd                     %%mm0, %%mm6         \n\t"\
     263             :     " jnz                         2b                \n\t"\
     264             :     "psrad                       $16, %%mm1         \n\t" /* scale the 32-bit sums down before packing */ \
     265             :     "psrad                       $16, %%mm5         \n\t"\
     266             :     "psrad                       $16, %%mm7         \n\t"\
     267             :     "psrad                       $16, %%mm6         \n\t"\
     268             :     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
     269             :     "packssdw                  %%mm5, %%mm1         \n\t" /* mm1 = words for samples 0-3 */ \
     270             :     "packssdw                  %%mm6, %%mm7         \n\t" /* mm7 = words for samples 4-7 */ \
     271             :     "paddw                     %%mm0, %%mm1         \n\t" /* add the value loaded from VROUNDER_OFFSET */ \
     272             :     "paddw                     %%mm0, %%mm7         \n\t"\
     273             :     "movq               "U_TEMP"(%0), %%mm3         \n\t" /* reload whatever the caller last stored in the U_TEMP / V_TEMP slots */ \
     274             :     "movq               "V_TEMP"(%0), %%mm4         \n\t"\
     275             : 
     276             : #define YSCALEYUV2PACKEDX_ACCURATE /* chroma pass followed by the luma pass of the 32-bit-accumulating filter; afterwards mm3 = U, mm4 = V, mm1 = Y samples 0-3, mm7 = Y samples 4-7 */ \
     277             :     YSCALEYUV2PACKEDX_ACCURATE_UV \
     278             :     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
     279             : 
     280             : #define YSCALEYUV2RGBX /* converts the filtered words (mm1 = Y samples 0-3, mm7 = Y samples 4-7, mm3 = U, mm4 = V) into eight bytes per channel: mm2 = B, mm4 = G, mm5 = R; coefficients and offsets are read relative to %0; overwrites mm0-mm7 except mm1/mm7, which keep the scaled Y */ \
     281             :     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
     282             :     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
     283             :     "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
     284             :     "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
     285             :     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
     286             :     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
     287             :     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
     288             :     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
     289             :     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
     290             :     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
     291             :     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
     292             :     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
     293             :     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
     294             :     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
     295             :     "paddw           %%mm3, %%mm4       \n\t" /* mm4 = ug + vg, the combined green chroma term */ \
     296             :     "movq            %%mm2, %%mm0       \n\t" /* keep copies of the chroma terms for the high halves */ \
     297             :     "movq            %%mm5, %%mm6       \n\t"\
     298             :     "movq            %%mm4, %%mm3       \n\t"\
     299             :     "punpcklwd       %%mm2, %%mm2       \n\t" /* duplicate each of the two low chroma words: one chroma sample serves two adjacent pixels */ \
     300             :     "punpcklwd       %%mm5, %%mm5       \n\t"\
     301             :     "punpcklwd       %%mm4, %%mm4       \n\t"\
     302             :     "paddw           %%mm1, %%mm2       \n\t" /* add Y1 to get B, R and G of pixels 0-3 */ \
     303             :     "paddw           %%mm1, %%mm5       \n\t"\
     304             :     "paddw           %%mm1, %%mm4       \n\t"\
     305             :     "punpckhwd       %%mm0, %%mm0       \n\t" /* same for the two high chroma words */ \
     306             :     "punpckhwd       %%mm6, %%mm6       \n\t"\
     307             :     "punpckhwd       %%mm3, %%mm3       \n\t"\
     308             :     "paddw           %%mm7, %%mm0       \n\t" /* add Y2 to get B, R and G of pixels 4-7 */ \
     309             :     "paddw           %%mm7, %%mm6       \n\t"\
     310             :     "paddw           %%mm7, %%mm3       \n\t"\
     311             :     /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = pixels 0-3, 2 = pixels 4-7) */\
     312             :     "packuswb        %%mm0, %%mm2       \n\t" /* mm2 = eight B bytes */ \
     313             :     "packuswb        %%mm6, %%mm5       \n\t" /* mm5 = eight R bytes */ \
     314             :     "packuswb        %%mm3, %%mm4       \n\t" /* mm4 = eight G bytes */ \
     315             : 
     316             : #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) /* interleaves four registers of eight bytes each into eight 32-bit pixels (memory byte order b, g, r, a), stores them at dst + index*4, adds 8 to index and loops to label 1 while index is below dstw; b and r are overwritten, q0/q2/q3/t are scratch registers */ \
     317             :     "movq       "#b", "#q2"     \n\t" /* B */\
     318             :     "movq       "#r", "#t"      \n\t" /* R */\
     319             :     "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
     320             :     "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
     321             :     "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
     322             :     "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
     323             :     "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
     324             :     "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
     325             :     "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
     326             :     "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
     327             :     "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
     328             :     "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
     329             : \
     330             :     MOVNTQ(   q0,   (dst, index, 4)) /* pixels 0-1 */ \
     331             :     MOVNTQ(    b,  8(dst, index, 4)) /* pixels 2-3 */ \
     332             :     MOVNTQ(   q2, 16(dst, index, 4)) /* pixels 4-5 */ \
     333             :     MOVNTQ(   q3, 24(dst, index, 4)) /* pixels 6-7 */ \
     334             : \
     335             :     "add      $8, "#index"      \n\t"\
     336             :     "cmp  "dstw", "#index"      \n\t" /* dstw is pasted as a string literal, the other arguments are stringified */ \
     337             :     " jb      1b                \n\t"
     338             : #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) /* extra macro level: arguments are macro-expanded before REAL_WRITEBGR32 stringifies them (presumably needed for the FF_REGa argument the callers pass - confirm) */
     339             : 
     340           0 : static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, /* of the parameters, only c, dest and dstW are referenced in the body */
     341             :                                    const int16_t **lumSrc, int lumFilterSize,
     342             :                                    const int16_t *chrFilter, const int16_t **chrUSrc,
     343             :                                    const int16_t **chrVSrc,
     344             :                                    int chrFilterSize, const int16_t **alpSrc,
     345             :                                    uint8_t *dest, int dstW, int dstY)
     346             : { /* Vertical filter plus YUV->RGB conversion to 32-bit pixels (memory byte order B, G, R, A), using the 32-bit-accumulating ("accurate") filter macros. All filter data is addressed relative to &c->redDither inside the asm. */
     347           0 :     x86_reg dummy=0; /* fills the unused asm operands %1-%3 */
     348           0 :     x86_reg dstW_reg = dstW; /* asm operand %5: loop bound */
     349           0 :     x86_reg uv_off = c->uv_offx2; /* asm operand %6: added to the U source pointer to reach V */
     350             : 
     351           0 :     if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { /* alpha is filtered from the alpha filter list */
     352           0 :         YSCALEYUV2PACKEDX_ACCURATE
     353             :         YSCALEYUV2RGBX /* mm2 = B, mm4 = G, mm5 = R */
     354             :         "movq                      %%mm2, "U_TEMP"(%0)  \n\t" /* spill B, G and R: the alpha pass below overwrites mm0-mm7 */
     355             :         "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
     356             :         "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
     357             :         YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET) /* alpha words in mm1/mm7; its tail reloads mm3 = B and mm4 = G from U_TEMP/V_TEMP */
     358             :         "movq               "Y_TEMP"(%0), %%mm5         \n\t" /* mm5 = R again */
     359             :         "psraw                        $3, %%mm1         \n\t"
     360             :         "psraw                        $3, %%mm7         \n\t"
     361             :         "packuswb                  %%mm7, %%mm1         \n\t" /* mm1 = eight alpha bytes */
     362             :         WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
     363           0 :         YSCALEYUV2PACKEDX_END
     364             :     } else { /* no alpha plane: every alpha byte is 0xFF */
     365           0 :         YSCALEYUV2PACKEDX_ACCURATE
     366             :         YSCALEYUV2RGBX
     367             :         "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
     368             :         WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
     369           0 :         YSCALEYUV2PACKEDX_END
     370             :     }
     371           0 : }
     372             : 
     373           0 : static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, /* of the parameters, only c, dest and dstW are referenced in the body */
     374             :                                 const int16_t **lumSrc, int lumFilterSize,
     375             :                                 const int16_t *chrFilter, const int16_t **chrUSrc,
     376             :                                 const int16_t **chrVSrc,
     377             :                                 int chrFilterSize, const int16_t **alpSrc,
     378             :                                 uint8_t *dest, int dstW, int dstY)
     379             : { /* Vertical filter plus YUV->RGB conversion to 32-bit pixels (memory byte order B, G, R, A), using the 16-bit pmulhw filter macros. All filter data is addressed relative to &c->redDither inside the asm. */
     380           0 :     x86_reg dummy=0; /* fills the unused asm operands %1-%3 */
     381           0 :     x86_reg dstW_reg = dstW; /* asm operand %5: loop bound */
     382           0 :     x86_reg uv_off = c->uv_offx2; /* asm operand %6: added to the U source pointer to reach V */
     383             : 
     384           0 :     if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { /* alpha is filtered from the alpha filter list */
     385           0 :         YSCALEYUV2PACKEDX
     386             :         YSCALEYUV2RGBX /* mm2 = B, mm4 = G, mm5 = R */
     387             :         YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) /* alpha sums in mm1/mm7; the scratch registers chosen leave B, G and R untouched */
     388             :         "psraw                        $3, %%mm1         \n\t"
     389             :         "psraw                        $3, %%mm7         \n\t"
     390             :         "packuswb                  %%mm7, %%mm1         \n\t" /* mm1 = eight alpha bytes */
     391             :         WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
     392           0 :         YSCALEYUV2PACKEDX_END
     393             :     } else { /* no alpha plane: every alpha byte is 0xFF */
     394           0 :         YSCALEYUV2PACKEDX
     395             :         YSCALEYUV2RGBX
     396             :         "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
     397             :         WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
     398           0 :         YSCALEYUV2PACKEDX_END
     399             :     }
     400           0 : }
     401             : 
     402           0 : static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter, /* of the parameters, only c, dest and dstW are referenced in the body */
     403             :                                 const int16_t **lumSrc, int lumFilterSize,
     404             :                                 const int16_t *chrFilter, const int16_t **chrUSrc,
     405             :                                 const int16_t **chrVSrc,
     406             :                                 int chrFilterSize, const int16_t **alpSrc,
     407             :                                 uint8_t *dest, int dstW, int dstY)
     408             : { /* Same as yuv2rgb32_X except that the R (mm5) and B (mm2) registers swap places in the WRITEBGR32 call, giving memory byte order R, G, B, A. */
     409           0 :     x86_reg dummy=0; /* fills the unused asm operands %1-%3 */
     410           0 :     x86_reg dstW_reg = dstW; /* asm operand %5: loop bound */
     411           0 :     x86_reg uv_off = c->uv_offx2; /* asm operand %6: added to the U source pointer to reach V */
     412             : 
     413           0 :     if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { /* alpha is filtered from the alpha filter list */
     414           0 :         YSCALEYUV2PACKEDX
     415             :         YSCALEYUV2RGBX /* mm2 = B, mm4 = G, mm5 = R */
     416             :         YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7) /* alpha sums in mm1/mm7; the scratch registers chosen leave B, G and R untouched */
     417             :         "psraw                        $3, %%mm1         \n\t"
     418             :         "psraw                        $3, %%mm7         \n\t"
     419             :         "packuswb                  %%mm7, %%mm1         \n\t" /* mm1 = eight alpha bytes */
     420             :         WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
     421           0 :         YSCALEYUV2PACKEDX_END
     422             :     } else { /* no alpha plane: every alpha byte is 0xFF */
     423           0 :         YSCALEYUV2PACKEDX
     424             :         YSCALEYUV2RGBX
     425             :         "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones */
     426             :         WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
     427           0 :         YSCALEYUV2PACKEDX_END
     428             :     }
     429           0 : }
     430             : 
     431             : #define REAL_WRITERGB16(dst, dstw, index) /* packs eight pixels from byte registers mm2 = B, mm4 = G, mm5 = R into 16-bit words (R in the top bits, G in the middle, B in the low bits), stores them at dst + index*2, adds 8 to index and loops to label 1 while index is below dstw; expects mm7 = 0; overwrites mm1-mm4 */ \
     432             :     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
     433             :     "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
     434             :     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
     435             :     "psrlq           $3, %%mm2  \n\t" /* move B down by 3 bits (assumes the bF8 mask cleared the low 3 bits of every byte, so nothing leaks into the neighbouring byte - confirm against the constant) */ \
     436             : \
     437             :     "movq         %%mm2, %%mm1  \n\t"\
     438             :     "movq         %%mm4, %%mm3  \n\t"\
     439             : \
     440             :     "punpcklbw    %%mm7, %%mm3  \n\t" /* G of pixels 0-3 zero-extended to words */ \
     441             :     "punpcklbw    %%mm5, %%mm2  \n\t" /* pixels 0-3: low byte = B >> 3, high byte = masked R */ \
     442             :     "punpckhbw    %%mm7, %%mm4  \n\t" /* G of pixels 4-7 zero-extended to words */ \
     443             :     "punpckhbw    %%mm5, %%mm1  \n\t" /* pixels 4-7: low byte = B >> 3, high byte = masked R */ \
     444             : \
     445             :     "psllq           $3, %%mm3  \n\t" /* shift the masked G up by 3 into the middle of each word */ \
     446             :     "psllq           $3, %%mm4  \n\t"\
     447             : \
     448             :     "por          %%mm3, %%mm2  \n\t" /* merge G into the R/B words */ \
     449             :     "por          %%mm4, %%mm1  \n\t"\
     450             : \
     451             :     MOVNTQ(%%mm2,  (dst, index, 2)) /* pixels 0-3 */ \
     452             :     MOVNTQ(%%mm1, 8(dst, index, 2)) /* pixels 4-7 */ \
     453             : \
     454             :     "add             $8, "#index"   \n\t"\
     455             :     "cmp         "dstw", "#index"   \n\t" /* dstw is pasted as a string literal, index is stringified */ \
     456             :     " jb             1b             \n\t"
     457             : #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index) /* extra macro level: arguments are macro-expanded before REAL_WRITERGB16 stringifies them */
     458             : /* Convert to 16-bit RGB (WRITERGB16 layout) and store to dest, using the _ACCURATE variant of the scaling front end; the YSCALEYUV2* macros used here are defined outside this view. */
     459           0 : static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
     460             :                                     const int16_t **lumSrc, int lumFilterSize,
     461             :                                     const int16_t *chrFilter, const int16_t **chrUSrc,
     462             :                                     const int16_t **chrVSrc,
     463             :                                     int chrFilterSize, const int16_t **alpSrc,
     464             :                                     uint8_t *dest, int dstW, int dstY)
     465             : {
     466           0 :     x86_reg dummy=0;
     467           0 :     x86_reg dstW_reg = dstW;
     468           0 :     x86_reg uv_off = c->uv_offx2;
     469             :     /* NOTE(review): the asm statement is presumably opened by the first macro and closed, with its operand list, by YSCALEYUV2PACKEDX_END; the locals above are not referenced anywhere else in this body -- confirm against the macro definitions */
     470           0 :     YSCALEYUV2PACKEDX_ACCURATE
     471             :     YSCALEYUV2RGBX
     472             :     "pxor %%mm7, %%mm7 \n\t"
     473             :     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
     474             : #ifdef DITHER1XBPP /* optional: saturating per-byte add of the dither values addressed relative to operand %0 */
     475             :     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
     476             :     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
     477             :     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
     478             : #endif
     479             :     WRITERGB16(%4, "%5", %%FF_REGa)
     480           0 :     YSCALEYUV2PACKEDX_END
     481           0 : }
     482             : /* Same steps as yuv2rgb565_X_ar but with the plain YSCALEYUV2PACKEDX front end instead of the _ACCURATE one; the YSCALEYUV2* macros are defined outside this view. */
     483           0 : static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
     484             :                                  const int16_t **lumSrc, int lumFilterSize,
     485             :                                  const int16_t *chrFilter, const int16_t **chrUSrc,
     486             :                                  const int16_t **chrVSrc,
     487             :                                  int chrFilterSize, const int16_t **alpSrc,
     488             :                                  uint8_t *dest, int dstW, int dstY)
     489             : {
     490           0 :     x86_reg dummy=0;
     491           0 :     x86_reg dstW_reg = dstW;
     492           0 :     x86_reg uv_off = c->uv_offx2;
     493             :     /* NOTE(review): the locals above are presumably consumed as asm operands by YSCALEYUV2PACKEDX_END -- confirm against the macro definition */
     494           0 :     YSCALEYUV2PACKEDX
     495             :     YSCALEYUV2RGBX
     496             :     "pxor %%mm7, %%mm7 \n\t"
     497             :     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
     498             : #ifdef DITHER1XBPP /* optional: saturating per-byte add of the dither values addressed relative to operand %0 */
     499             :     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
     500             :     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
     501             :     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
     502             : #endif
     503             :     WRITERGB16(%4, "%5", %%FF_REGa)
     504           0 :     YSCALEYUV2PACKEDX_END
     505           0 : }
     506             : /* Like REAL_WRITERGB16 but all three channels use the bF8 mask, R is shifted down by one extra bit and G is shifted left by 2 instead of 3, giving 15 significant bits per 16-bit word. Expects mm2=B, mm4=G, mm5=R bytes and mm7=0; stores 8 pixels at dst + 2*index and loops to label 1 while index < dstw. */
     507             : #define REAL_WRITERGB15(dst, dstw, index) \
     508             :     "pand "MANGLE(bF8)", %%mm2  \n\t" /* B: mask with bF8 (presumably keeps the top 5 bits of each byte -- confirm) */\
     509             :     "pand "MANGLE(bF8)", %%mm4  \n\t" /* G: mask with bF8 */\
     510             :     "pand "MANGLE(bF8)", %%mm5  \n\t" /* R: mask with bF8 */\
     511             :     "psrlq           $3, %%mm2  \n\t" /* shift B down by 3 bits */\
     512             :     "psrlq           $1, %%mm5  \n\t" /* shift R down by 1 bit */\
     513             : /* copy B and G so the low and the high four pixels can be unpacked separately */\
     514             :     "movq         %%mm2, %%mm1  \n\t"\
     515             :     "movq         %%mm4, %%mm3  \n\t"\
     516             : /* G is zero-extended to words (mm7=0); B and R are interleaved into one word: low byte B, high byte R */\
     517             :     "punpcklbw    %%mm7, %%mm3  \n\t"\
     518             :     "punpcklbw    %%mm5, %%mm2  \n\t"\
     519             :     "punpckhbw    %%mm7, %%mm4  \n\t"\
     520             :     "punpckhbw    %%mm5, %%mm1  \n\t"\
     521             : /* shift the G words left by 2 so G lands between B (low bits) and R (high bits) */\
     522             :     "psllq           $2, %%mm3  \n\t"\
     523             :     "psllq           $2, %%mm4  \n\t"\
     524             : /* merge G into the R:B words */\
     525             :     "por          %%mm3, %%mm2  \n\t"\
     526             :     "por          %%mm4, %%mm1  \n\t"\
     527             : /* store 16 bytes = 8 pixels */\
     528             :     MOVNTQ(%%mm2,  (dst, index, 2))\
     529             :     MOVNTQ(%%mm1, 8(dst, index, 2))\
     530             : /* advance by 8 pixels and loop */\
     531             :     "add             $8, "#index"   \n\t"\
     532             :     "cmp         "dstw", "#index"   \n\t"\
     533             :     " jb             1b             \n\t"
     534             : #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
     535             : /* Convert to 15-bit RGB (WRITERGB15 layout) and store to dest, using the _ACCURATE variant of the scaling front end; the YSCALEYUV2* macros used here are defined outside this view. */
     536           0 : static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
     537             :                                     const int16_t **lumSrc, int lumFilterSize,
     538             :                                     const int16_t *chrFilter, const int16_t **chrUSrc,
     539             :                                     const int16_t **chrVSrc,
     540             :                                     int chrFilterSize, const int16_t **alpSrc,
     541             :                                     uint8_t *dest, int dstW, int dstY)
     542             : {
     543           0 :     x86_reg dummy=0;
     544           0 :     x86_reg dstW_reg = dstW;
     545           0 :     x86_reg uv_off = c->uv_offx2;
     546             :     /* NOTE(review): the locals above are presumably consumed as asm operands by YSCALEYUV2PACKEDX_END -- confirm against the macro definition */
     547           0 :     YSCALEYUV2PACKEDX_ACCURATE
     548             :     YSCALEYUV2RGBX
     549             :     "pxor %%mm7, %%mm7 \n\t"
     550             :     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
     551             : #ifdef DITHER1XBPP /* optional: saturating per-byte add of the dither values addressed relative to operand %0 */
     552             :     "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
     553             :     "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
     554             :     "paddusb "RED_DITHER"(%0), %%mm5\n\t"
     555             : #endif
     556             :     WRITERGB15(%4, "%5", %%FF_REGa)
     557           0 :     YSCALEYUV2PACKEDX_END
     558           0 : }
     559             : /* Same steps as yuv2rgb555_X_ar but with the plain YSCALEYUV2PACKEDX front end instead of the _ACCURATE one; the YSCALEYUV2* macros are defined outside this view. */
     560           0 : static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
     561             :                                  const int16_t **lumSrc, int lumFilterSize,
     562             :                                  const int16_t *chrFilter, const int16_t **chrUSrc,
     563             :                                  const int16_t **chrVSrc,
     564             :                                  int chrFilterSize, const int16_t **alpSrc,
     565             :                                  uint8_t *dest, int dstW, int dstY)
     566             : {
     567           0 :     x86_reg dummy=0;
     568           0 :     x86_reg dstW_reg = dstW;
     569           0 :     x86_reg uv_off = c->uv_offx2;
     570             :     /* NOTE(review): the locals above are presumably consumed as asm operands by YSCALEYUV2PACKEDX_END -- confirm against the macro definition */
     571           0 :     YSCALEYUV2PACKEDX
     572             :     YSCALEYUV2RGBX
     573             :     "pxor %%mm7, %%mm7 \n\t"
     574             :     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
     575             : #ifdef DITHER1XBPP /* optional: saturating per-byte add of the dither values addressed relative to operand %0 */
     576             :     "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
     577             :     "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
     578             :     "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
     579             : #endif
     580             :     WRITERGB15(%4, "%5", %%FF_REGa)
     581           0 :     YSCALEYUV2PACKEDX_END
     582           0 : }
     583             : /* Write 8 pixels as 24 packed bytes using only shifts, unpacks and ORs: expects mm2=B, mm4=G, mm5=R bytes and mm7=0, stores three quadwords at dst, then adds 24 to dst and 8 to index and loops to label 1 while index < dstw. Overwrites mm0-mm7. */
     584             : #define WRITEBGR24MMX(dst, dstw, index) \
     585             :     /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
     586             :     "movq      %%mm2, %%mm1     \n\t" /* B */\
     587             :     "movq      %%mm5, %%mm6     \n\t" /* R */\
     588             :     "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
     589             :     "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
     590             :     "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
     591             :     "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
     592             :     "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
     593             :     "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
     594             :     "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
     595             :     "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
     596             :     "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
     597             :     "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
     598             : /* keep a copy of every two-pixel quadword before shifting it */\
     599             :     "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
     600             :     "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
     601             :     "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
     602             :     "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
     603             : /* move the low pixel's three bytes to the top of the quadword */\
     604             :     "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
     605             :     "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
     606             :     "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
     607             :     "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
     608             : /* join it with the high pixel from the copy: both pixels are now contiguous in bytes 1..6 */\
     609             :     "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
     610             :     "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
     611             :     "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
     612             :     "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
     613             : /* splice the four 6-byte pairs into three full quadwords */\
     614             :     "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
     615             :     "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
     616             :     "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
     617             :     "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
     618             :     MOVNTQ(%%mm0, (dst))\
     619             : \
     620             :     "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
     621             :     "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
     622             :     "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
     623             :     "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
     624             :     MOVNTQ(%%mm6, 8(dst))\
     625             : \
     626             :     "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
     627             :     "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
     628             :     "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
     629             :     MOVNTQ(%%mm5, 16(dst))\
     630             : /* 8 pixels * 3 bytes were written */\
     631             :     "add         $24, "#dst"    \n\t"\
     632             : \
     633             :     "add          $8, "#index"  \n\t"\
     634             :     "cmp      "dstw", "#index"  \n\t"\
     635             :     " jb          1b            \n\t"
     636             : /* pshufw-based 24-bit writer: expects mm2=B, mm4=G, mm5=R bytes, selects bytes with the ff_M24A/ff_M24B/ff_M24C masks and stores three quadwords at dst, then adds 24 to dst and 8 to index and loops to label 1 while index < dstw. Overwrites mm0, mm1, mm3, mm4, mm6 and mm7; mm2 and mm5 are only read. */
     637             : #define WRITEBGR24MMXEXT(dst, dstw, index) \
     638             :     /* mm2=B, %%mm4=G, %%mm5=R; mm0 and mm7 are overwritten with masks right below, so mm7 need not be 0 here */\
     639             :     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
     640             :     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
     641             :     "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
     642             :     "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
     643             :     "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
     644             : \
     645             :     "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
     646             :     "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
     647             :     "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
     648             : \
     649             :     "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
     650             :     "por    %%mm1, %%mm6        \n\t"\
     651             :     "por    %%mm3, %%mm6        \n\t"\
     652             :     MOVNTQ(%%mm6, (dst))\
     653             : \
     654             :     "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
     655             :     "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
     656             :     "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
     657             :     "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
     658             : \
     659             :     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
     660             :     "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
     661             :     "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
     662             : \
     663             :     "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
     664             :     "por    %%mm3, %%mm6        \n\t"\
     665             :     MOVNTQ(%%mm6, 8(dst))\
     666             : \
     667             :     "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
     668             :     "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
     669             :     "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
     670             : \
     671             :     "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
     672             :     "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
     673             :     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
     674             : \
     675             :     "por    %%mm1, %%mm3        \n\t"\
     676             :     "por    %%mm3, %%mm6        \n\t"\
     677             :     MOVNTQ(%%mm6, 16(dst))\
     678             : /* 8 pixels * 3 bytes were written */\
     679             :     "add      $24, "#dst"       \n\t"\
     680             : \
     681             :     "add       $8, "#index"     \n\t"\
     682             :     "cmp   "dstw", "#index"     \n\t"\
     683             :     " jb       1b               \n\t"
     684             : /* Pick the 24-bit writer for this template instantiation: the pshufw-based one when COMPILE_TEMPLATE_MMXEXT is set, the plain MMX one otherwise. */
     685             : #if COMPILE_TEMPLATE_MMXEXT
     686             : #undef WRITEBGR24
     687             : #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index)
     688             : #else
     689             : #undef WRITEBGR24
     690             : #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
     691             : #endif /* COMPILE_TEMPLATE_MMXEXT */
     692             : /* Convert to packed 24-bit output and store to dest, using the _ACCURATE variant of the scaling front end and the WRITEBGR24 writer; only built when HAVE_6REGS is set. The YSCALEYUV2* macros are defined outside this view. */
     693             : #if HAVE_6REGS
     694           0 : static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
     695             :                                    const int16_t **lumSrc, int lumFilterSize,
     696             :                                    const int16_t *chrFilter, const int16_t **chrUSrc,
     697             :                                    const int16_t **chrVSrc,
     698             :                                    int chrFilterSize, const int16_t **alpSrc,
     699             :                                    uint8_t *dest, int dstW, int dstY)
     700             : {
     701           0 :     x86_reg dummy=0;
     702           0 :     x86_reg dstW_reg = dstW;
     703           0 :     x86_reg uv_off = c->uv_offx2;
     704             :     /* Unlike the 16-bit writers this function spells out its own operand list because the output pointer lives in REG_c: REG_c = 3 * REG_a + dest (operand %4), i.e. 3 bytes per pixel index. */
     705           0 :     YSCALEYUV2PACKEDX_ACCURATE
     706             :     YSCALEYUV2RGBX
     707             :     "pxor %%mm7, %%mm7 \n\t"
     708             :     "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
     709             :     "add %4, %%"FF_REG_c"                        \n\t"
     710             :     WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
     711           0 :     :: "r" (&c->redDither), /* %0 */
     712             :        "m" (dummy), "m" (dummy), "m" (dummy), /* %1-%3: presumably placeholders that keep the operand numbering used by the shared macros -- confirm */
     713             :        "r" (dest), "m" (dstW_reg), "m"(uv_off) /* %4, %5, %6 */
     714             :        NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
     715             :     : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
     716             :     );
     717           0 : }
     718             : /* Same steps as yuv2bgr24_X_ar but with the plain YSCALEYUV2PACKEDX front end instead of the _ACCURATE one; the YSCALEYUV2* macros are defined outside this view. */
     719           0 : static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
     720             :                                 const int16_t **lumSrc, int lumFilterSize,
     721             :                                 const int16_t *chrFilter, const int16_t **chrUSrc,
     722             :                                 const int16_t **chrVSrc,
     723             :                                 int chrFilterSize, const int16_t **alpSrc,
     724             :                                 uint8_t *dest, int dstW, int dstY)
     725             : {
     726           0 :     x86_reg dummy=0;
     727           0 :     x86_reg dstW_reg = dstW;
     728           0 :     x86_reg uv_off = c->uv_offx2;
     729             :     /* The output pointer lives in REG_c: REG_c = 3 * REG_a + dest (operand %4), i.e. 3 bytes per pixel index. */
     730           0 :     YSCALEYUV2PACKEDX
     731             :     YSCALEYUV2RGBX
     732             :     "pxor                    %%mm7, %%mm7              \n\t"
     733             :     "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
     734             :     "add                        %4, %%"FF_REG_c"       \n\t"
     735             :     WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
     736           0 :     :: "r" (&c->redDither), /* %0 */
     737             :        "m" (dummy), "m" (dummy), "m" (dummy), /* %1-%3: presumably placeholders that keep the operand numbering used by the shared macros -- confirm */
     738             :        "r" (dest),  "m" (dstW_reg), "m"(uv_off) /* %4, %5, %6 */
     739             :        NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
     740             :     : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
     741             :     );
     742           0 : }
     743             : #endif /* HAVE_6REGS */
     744             : /* Pack four vectors of 16-bit values into 8 interleaved 4:2:2 pixels: bytes from mm1/mm7 land at the even output positions, bytes from mm3 and mm4 alternate at the odd positions (presumably mm1,mm7=Y, mm3=U, mm4=V given the YUY2 name -- confirm). Stores 16 bytes at dst + 2*index, then advances index by 8 and loops to label 1 while index < dstw. */
     745             : #define REAL_WRITEYUY2(dst, dstw, index) \
     746             :     "packuswb  %%mm3, %%mm3     \n\t" /* words -> unsigned saturated bytes */\
     747             :     "packuswb  %%mm4, %%mm4     \n\t"\
     748             :     "packuswb  %%mm7, %%mm1     \n\t" /* mm1 = 8 bytes: low half from mm1, high half from mm7 */\
     749             :     "punpcklbw %%mm4, %%mm3     \n\t" /* interleave mm3 and mm4 bytes */\
     750             :     "movq      %%mm1, %%mm7     \n\t"\
     751             :     "punpcklbw %%mm3, %%mm1     \n\t" /* first 4 pixels */\
     752             :     "punpckhbw %%mm3, %%mm7     \n\t" /* last 4 pixels */\
     753             : \
     754             :     MOVNTQ(%%mm1, (dst, index, 2))\
     755             :     MOVNTQ(%%mm7, 8(dst, index, 2))\
     756             : \
     757             :     "add          $8, "#index"  \n\t"\
     758             :     "cmp      "dstw", "#index"  \n\t"\
     759             :     " jb          1b            \n\t"
     760             : #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
     761             : /* Produce interleaved 4:2:2 output in dest via WRITEYUY2, using the _ACCURATE variant of the scaling front end; no RGB conversion is performed. The YSCALEYUV2* macros are defined outside this view. */
     762           0 : static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
     763             :                                      const int16_t **lumSrc, int lumFilterSize,
     764             :                                      const int16_t *chrFilter, const int16_t **chrUSrc,
     765             :                                      const int16_t **chrVSrc,
     766             :                                      int chrFilterSize, const int16_t **alpSrc,
     767             :                                      uint8_t *dest, int dstW, int dstY)
     768             : {
     769           0 :     x86_reg dummy=0;
     770           0 :     x86_reg dstW_reg = dstW;
     771           0 :     x86_reg uv_off = c->uv_offx2;
     772             :     /* NOTE(review): the locals above are presumably consumed as asm operands by YSCALEYUV2PACKEDX_END -- confirm against the macro definition */
     773           0 :     YSCALEYUV2PACKEDX_ACCURATE
     774             :     /* no RGB here: mm3, mm4, mm1 and mm7 hold the scaled 16-bit values (presumably U, V, Y low, Y high -- confirm); they are shifted down by 3 and then packed to bytes by WRITEYUY2 */
     775             :     "psraw $3, %%mm3    \n\t"
     776             :     "psraw $3, %%mm4    \n\t"
     777             :     "psraw $3, %%mm1    \n\t"
     778             :     "psraw $3, %%mm7    \n\t"
     779             :     WRITEYUY2(%4, "%5", %%FF_REGa)
     780           0 :     YSCALEYUV2PACKEDX_END
     781           0 : }
     782             : /* Same steps as yuv2yuyv422_X_ar but with the plain YSCALEYUV2PACKEDX front end instead of the _ACCURATE one; no RGB conversion is performed. */
     783           0 : static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
     784             :                                   const int16_t **lumSrc, int lumFilterSize,
     785             :                                   const int16_t *chrFilter, const int16_t **chrUSrc,
     786             :                                   const int16_t **chrVSrc,
     787             :                                   int chrFilterSize, const int16_t **alpSrc,
     788             :                                   uint8_t *dest, int dstW, int dstY)
     789             : {
     790           0 :     x86_reg dummy=0;
     791           0 :     x86_reg dstW_reg = dstW;
     792           0 :     x86_reg uv_off = c->uv_offx2;
     793             :     /* NOTE(review): the locals above are presumably consumed as asm operands by YSCALEYUV2PACKEDX_END -- confirm against the macro definition */
     794           0 :     YSCALEYUV2PACKEDX
     795             :     /* no RGB here: mm3, mm4, mm1 and mm7 hold the scaled 16-bit values (presumably U, V, Y low, Y high -- confirm); they are shifted down by 3 and then packed to bytes by WRITEYUY2 */
     796             :     "psraw $3, %%mm3    \n\t"
     797             :     "psraw $3, %%mm4    \n\t"
     798             :     "psraw $3, %%mm1    \n\t"
     799             :     "psraw $3, %%mm7    \n\t"
     800             :     WRITEYUY2(%4, "%5", %%FF_REGa)
     801           0 :     YSCALEYUV2PACKEDX_END
     802           0 : }
     803             : /* Loop head of the two-line converters: zeroes index, defines label 1, blends the chroma words read from the two buffers in operands %2 and %3 (first plane at index, second plane at index + the offset stored at UV_OFF_BYTE(c)), subtracts U_OFFSET/V_OFFSET and starts the green terms. All constants are read relative to register c. */
     804             : #define REAL_YSCALEYUV2RGB_UV(index, c) \
     805             :     "xor            "#index", "#index"  \n\t"\
     806             :     ".p2align              4            \n\t"\
     807             :     "1:                                 \n\t"\
     808             :     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
     809             :     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
     810             :     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* index += offset of the second chroma plane, read from the context */ \
     811             :     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+uvoff]*/\
     812             :     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+uvoff]*/\
     813             :     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* restore index */ \
     814             :     "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
     815             :     "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+uvoff] - uvbuf1[eax+uvoff]*/\
     816             :     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t" /* uvalpha1 weight */\
     817             :     "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
     818             :     "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+uvoff] - uvbuf1[eax+uvoff])uvalpha1>>16*/\
     819             :     "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
     820             :     "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+uvoff] >>4*/\
     821             :     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
     822             :     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+uvoff]uvalpha1 + uvbuf1[eax+uvoff](1-uvalpha1)*/\
     823             :     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
     824             :     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
     825             :     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
     826             :     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
     827             :     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
     828             :     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
     829             :     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
     830             : 
     831             : #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) /* blend 8 words from buffers b1 and b2 at element index with the weight at LUM_MMX_FILTER_OFFSET+8(c); result in mm1 (first 4) and mm7 (last 4), mm0/mm6 are scratch */ \
     832             :     "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
     833             :     "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
     834             :     "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax+4]*/\
     835             :     "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax+4]*/\
     836             :     "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
     837             :     "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
     838             :     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
     839             :     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
     840             :     "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
     841             :     "psraw                $4, %%mm7     \n\t" /* buf1[eax+4] >>4*/\
     842             :     "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
     843             :     "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
     844             : 
     845             : #define REAL_YSCALEYUV2RGB_COEFF(c) /* apply the colour coefficients read relative to c and combine luma with chroma: leaves 8 unsigned-saturated bytes each of B in mm2, R in mm5 and G in mm4; mm0, mm3 and mm6 are scratch */ \
     846             :     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
     847             :     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
     848             :     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
     849             :     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
     850             :     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
     851             :     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
     852             :     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
     853             :     "paddw             %%mm3, %%mm4     \n\t"\
     854             :     "movq              %%mm2, %%mm0     \n\t"\
     855             :     "movq              %%mm5, %%mm6     \n\t"\
     856             :     "movq              %%mm4, %%mm3     \n\t"\
     857             :     "punpcklwd         %%mm2, %%mm2     \n\t"\
     858             :     "punpcklwd         %%mm5, %%mm5     \n\t"\
     859             :     "punpcklwd         %%mm4, %%mm4     \n\t"\
     860             :     "paddw             %%mm1, %%mm2     \n\t"\
     861             :     "paddw             %%mm1, %%mm5     \n\t"\
     862             :     "paddw             %%mm1, %%mm4     \n\t"\
     863             :     "punpckhwd         %%mm0, %%mm0     \n\t"\
     864             :     "punpckhwd         %%mm6, %%mm6     \n\t"\
     865             :     "punpckhwd         %%mm3, %%mm3     \n\t"\
     866             :     "paddw             %%mm7, %%mm0     \n\t"\
     867             :     "paddw             %%mm7, %%mm6     \n\t"\
     868             :     "paddw             %%mm7, %%mm3     \n\t"\
     869             :     /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = added to Y1/mm1, 2 = added to Y2/mm7) */\
     870             :     "packuswb          %%mm0, %%mm2     \n\t"\
     871             :     "packuswb          %%mm6, %%mm5     \n\t"\
     872             :     "packuswb          %%mm3, %%mm4     \n\t"\
     873             : 
     874             : #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) /* wrapper: presumably exists so the arguments are macro-expanded before REAL_* stringifies them */
     875             : /* Full two-line conversion head: chroma blend, luma blend from operands %0/%1, then the colour-coefficient stage. */
     876             : #define YSCALEYUV2RGB(index, c) \
     877             :     REAL_YSCALEYUV2RGB_UV(index, c) \
     878             :     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
     879             :     REAL_YSCALEYUV2RGB_COEFF(c)
     880             : 
     881             : /**
     882             :  * vertical bilinear scale YV12 to RGB: blends the two lines in buf/ubuf (and abuf when alpha is needed) and writes 32-bit pixels to dest via WRITEBGR32. dstW, yalpha and uvalpha are not read directly here; the asm takes the width from DSTW_OFFSET(%5) and the blend weights from the context -- presumably stored there by the caller, confirm.
     883             :  */
     884           0 : static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
     885             :                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
     886             :                                 const int16_t *abuf[2], uint8_t *dest,
     887             :                                 int dstW, int yalpha, int uvalpha, int y)
     888             : {
     889           0 :     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
     890           0 :                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
     891             :     /* vbuf is not read: the second chroma plane is addressed as ubuf + the offset at UV_OFF_BYTE inside YSCALEYUV2RGB */
     892           0 :     if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
     893           0 :         const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
     894             : #if ARCH_X86_64
     895           0 :         __asm__ volatile(
     896             :             YSCALEYUV2RGB(%%r8, %5)
     897             :             YSCALEYUV2RGB_YA(%%r8, %5, %6, %7) /* reuse the luma blend for alpha: mm1/mm7 = blended abuf0/abuf1 */
     898             :             "psraw                  $3, %%mm1       \n\t" /* blended alpha >>3 */
     899             :             "psraw                  $3, %%mm7       \n\t" /* blended alpha >>3 */
     900             :             "packuswb            %%mm7, %%mm1       \n\t"
     901             :             WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
     902             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
     903           0 :                "a" (&c->redDither),
     904             :                "r" (abuf0), "r" (abuf1)
     905             :             : "%r8"
     906             :         );
     907             : #else
     908             :         c->u_temp=(intptr_t)abuf0; /* park the alpha pointers in the context; the asm reloads them from U_TEMP/V_TEMP(%5) into %0/%1 */
     909             :         c->v_temp=(intptr_t)abuf1;
     910             :         __asm__ volatile(
     911             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* save REG_b in the context */
     912             :             "mov        %4, %%"FF_REG_b"            \n\t" /* REG_b = dest */
     913             :             "push %%"FF_REG_BP"                     \n\t" /* REG_BP is used as the loop index */
     914             :             YSCALEYUV2RGB(%%FF_REGBP, %5)
     915             :             "push                   %0              \n\t"
     916             :             "push                   %1              \n\t"
     917             :             "mov          "U_TEMP"(%5), %0          \n\t"
     918             :             "mov          "V_TEMP"(%5), %1          \n\t"
     919             :             YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
     920             :             "psraw                  $3, %%mm1       \n\t" /* blended alpha >>3 */
     921             :             "psraw                  $3, %%mm7       \n\t" /* blended alpha >>3 */
     922             :             "packuswb            %%mm7, %%mm1       \n\t"
     923             :             "pop                    %1              \n\t"
     924             :             "pop                    %0              \n\t"
     925             :             WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
     926             :             "pop %%"FF_REG_BP"                      \n\t"
     927             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore REG_b */
     928             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
     929             :                "a" (&c->redDither)
     930             :         );
     931             : #endif
     932             :     } else {
     933           0 :         __asm__ volatile(
     934             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* save REG_b in the context */
     935             :             "mov        %4, %%"FF_REG_b"            \n\t" /* REG_b = dest */
     936             :             "push %%"FF_REG_BP"                     \n\t" /* REG_BP is used as the loop index */
     937             :             YSCALEYUV2RGB(%%FF_REGBP, %5)
     938             :             "pcmpeqd %%mm7, %%mm7                   \n\t" /* mm7 = all ones, passed to WRITEBGR32 in the alpha position */
     939             :             WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
     940             :             "pop %%"FF_REG_BP"                      \n\t"
     941             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore REG_b */
     942             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
     943           0 :                "a" (&c->redDither)
     944             :         );
     945             :     }
     946           0 : }
     947             : 
     948           0 : static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
     949             :                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
     950             :                                 const int16_t *abuf[2], uint8_t *dest,
     951             :                                 int dstW, int yalpha, int uvalpha, int y)
     952             : {
                           /*
                            * Hands two luma lines (buf[0], buf[1]) and two chroma lines (ubuf[0],
                            * ubuf[1]) to the YSCALEYUV2RGB kernel and stores its output to dest
                            * through WRITEBGR24. Both macros are defined outside this block; going
                            * by their names they blend the lines, convert to RGB and emit packed
                            * 24-bit BGR -- confirm against the definitions.
                            *
                            * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this
                            * body. The asm takes the width from DSTW_OFFSET relative to
                            * &c->redDither, so it is expected to be in the context already.
                            */
     953           0 :     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
     954           0 :                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
     955             : 
     956           0 :     __asm__ volatile(
     957             :         "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* FF_REG_b is not an operand: park its value in the context */
     958             :         "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest, fetched before the push below moves the stack pointer */
     959             :         "push %%"FF_REG_BP"                     \n\t" /* the base pointer is handed to the macros as their index register */
     960             :         YSCALEYUV2RGB(%%FF_REGBP, %5)
     961             :         "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
     962             :         WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
     963             :         "pop %%"FF_REG_BP"                      \n\t" /* undo the push */
     964             :         "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
     965             :         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %0..%4 */
     966           0 :            "a" (&c->redDither) /* %5: base address for all *_OFFSET(%5) accesses */
     967             :            NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) /* defined elsewhere; presumably makes these constants addressable from the asm */
     968             :     );
     969           0 : }
     970             : 
     971           0 : static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
     972             :                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
     973             :                                  const int16_t *abuf[2], uint8_t *dest,
     974             :                                  int dstW, int yalpha, int uvalpha, int y)
     975             : {
                           /*
                            * Runs the YSCALEYUV2RGB kernel on two luma lines (buf[0], buf[1]) and
                            * two chroma lines (ubuf[0], ubuf[1]), optionally adds the per-channel
                            * dither values stored in the context, and stores the pixels to dest
                            * through WRITERGB15. YSCALEYUV2RGB and WRITERGB15 are defined outside
                            * this block; the 15-bit output layout is inferred from the names --
                            * confirm against the definitions.
                            *
                            * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this
                            * body; the width is passed to the asm as DSTW_OFFSET(%5).
                            */
     976           0 :     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
     977           0 :                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
     978             : 
     979           0 :     __asm__ volatile(
     980             :         "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context; it is not an asm operand */
     981             :         "mov        %4, %%"FF_REG_b"            \n\t" /* FF_REG_b = dest, fetched before the push below */
     982             :         "push %%"FF_REG_BP"                     \n\t" /* the base pointer is handed to the macros as their index register */
     983             :         YSCALEYUV2RGB(%%FF_REGBP, %5)
     984             :         "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
     985             :         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
     986             : #ifdef DITHER1XBPP
     987             :         "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t" /* add dither bytes with unsigned saturation */
     988             :         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
     989             :         "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
     990             : #endif
     991             :         WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
     992             :         "pop %%"FF_REG_BP"                      \n\t" /* undo the push */
     993             :         "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
     994             :         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %0..%4 */
     995           0 :            "a" (&c->redDither) /* %5: base address for all *_OFFSET / *_DITHER accesses */
     996             :            NAMED_CONSTRAINTS_ADD(bF8) /* defined elsewhere; presumably makes the constant addressable from the asm */
     997             :     );
     998           0 : }
     999             : 
    1000           0 : static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
    1001             :                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
    1002             :                                  const int16_t *abuf[2], uint8_t *dest,
    1003             :                                  int dstW, int yalpha, int uvalpha, int y)
    1004             : {
                           /*
                            * Runs the YSCALEYUV2RGB kernel on two luma lines (buf[0], buf[1]) and
                            * two chroma lines (ubuf[0], ubuf[1]), optionally adds the per-channel
                            * dither values stored in the context, and stores the pixels to dest
                            * through WRITERGB16. YSCALEYUV2RGB and WRITERGB16 are defined outside
                            * this block; the 16-bit (5:6:5) output layout is inferred from the
                            * names -- confirm against the definitions.
                            *
                            * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this
                            * body; the width is passed to the asm as DSTW_OFFSET(%5).
                            */
    1005           0 :     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
    1006           0 :                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    1007             : 
    1008           0 :     __asm__ volatile(
    1009             :         "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context; it is not an asm operand */
    1010             :         "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest, fetched before the push below */
    1011             :         "push %%"FF_REG_BP"                     \n\t" /* the base pointer is handed to the macros as their index register */
    1012             :         YSCALEYUV2RGB(%%FF_REGBP, %5)
    1013             :         "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
    1014             :         /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    1015             : #ifdef DITHER1XBPP
    1016             :         "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t" /* add dither bytes with unsigned saturation */
    1017             :         "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
    1018             :         "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
    1019             : #endif
    1020             :         WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1021             :         "pop %%"FF_REG_BP"                      \n\t" /* undo the push */
    1022             :         "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1023             :         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %0..%4 */
    1024           0 :            "a" (&c->redDither) /* %5: base address for all *_OFFSET / *_DITHER accesses */
    1025             :            NAMED_CONSTRAINTS_ADD(bF8,bFC) /* defined elsewhere; presumably makes the constants addressable from the asm */
    1026             :     );
    1027           0 : }
    1028             : 
    1029             : #define REAL_YSCALEYUV2PACKED(index, c) /* blend two luma lines (%0, %1) and two chroma lines (%2, %3); leaves Y in mm1/mm7, U in mm3, V in mm4 as 16-bit words */\
    1030             :     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t" /* chroma blend weight */\
    1031             :     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t" /* luma blend weight */\
    1032             :     "psraw                $3, %%mm0                           \n\t"\
    1033             :     "psraw                $3, %%mm1                           \n\t"\
    1034             :     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t" /* NOTE: both weights are written back >>3, i.e. the context is modified */\
    1035             :     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t" /* every time this code runs; the loop body reloads the shifted values */\
    1036             :     "xor            "#index", "#index"                        \n\t" /* index = 0 */\
    1037             :     ".p2align              4            \n\t"\
    1038             :     "1:                                 \n\t" /* loop head; the branch back to it is not part of this macro */\
    1039             :     "movq     (%2, "#index"), %%mm2     \n\t" /* U0: 4 words of chroma line 0 at byte offset index */\
    1040             :     "movq     (%3, "#index"), %%mm3     \n\t" /* U1: 4 words of chroma line 1 at byte offset index */\
    1041             :     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* step index to the V samples */ \
    1042             :     "movq     (%2, "#index"), %%mm5     \n\t" /* V0: chroma line 0 at byte offset index + UV_OFF_BYTE */\
    1043             :     "movq     (%3, "#index"), %%mm4     \n\t" /* V1: chroma line 1 at byte offset index + UV_OFF_BYTE */\
    1044             :     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* index back to the U samples */ \
    1045             :     "psubw             %%mm3, %%mm2     \n\t" /* U0 - U1 */\
    1046             :     "psubw             %%mm4, %%mm5     \n\t" /* V0 - V1 */\
    1047             :     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t" /* chroma weight (uvalpha1), already shifted above */\
    1048             :     "pmulhw            %%mm0, %%mm2     \n\t" /* (U0 - U1) * uvalpha1 >> 16 */\
    1049             :     "pmulhw            %%mm0, %%mm5     \n\t" /* (V0 - V1) * uvalpha1 >> 16 */\
    1050             :     "psraw                $7, %%mm3     \n\t" /* U1 >> 7 */\
    1051             :     "psraw                $7, %%mm4     \n\t" /* V1 >> 7 */\
    1052             :     "paddw             %%mm2, %%mm3     \n\t" /* mm3 = (U1 >> 7) + ((U0 - U1) * uvalpha1 >> 16) */\
    1053             :     "paddw             %%mm5, %%mm4     \n\t" /* mm4 = (V1 >> 7) + ((V0 - V1) * uvalpha1 >> 16) */\
    1054             :     "movq  (%0, "#index", 2), %%mm0     \n\t" /* Y0: buf0[index .. index+3] */\
    1055             :     "movq  (%1, "#index", 2), %%mm1     \n\t" /* Y1: buf1[index .. index+3] */\
    1056             :     "movq 8(%0, "#index", 2), %%mm6     \n\t" /* Y0': buf0[index+4 .. index+7] */\
    1057             :     "movq 8(%1, "#index", 2), %%mm7     \n\t" /* Y1': buf1[index+4 .. index+7] */\
    1058             :     "psubw             %%mm1, %%mm0     \n\t" /* Y0 - Y1 */\
    1059             :     "psubw             %%mm7, %%mm6     \n\t" /* Y0' - Y1' */\
    1060             :     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (Y0 - Y1) * yalpha1 >> 16 */\
    1061             :     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (Y0' - Y1') * yalpha1 >> 16 */\
    1062             :     "psraw                $7, %%mm1     \n\t" /* Y1 >> 7 */\
    1063             :     "psraw                $7, %%mm7     \n\t" /* Y1' >> 7 */\
    1064             :     "paddw             %%mm0, %%mm1     \n\t" /* mm1 = (Y1 >> 7) + ((Y0 - Y1) * yalpha1 >> 16) */\
    1065             :     "paddw             %%mm6, %%mm7     \n\t" /* mm7 = (Y1' >> 7) + ((Y0' - Y1') * yalpha1 >> 16) */\
    1066             : 
    1067             : #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c) /* extra level so the arguments are macro-expanded before REAL_* stringizes them */
    1068             : 
    1069           0 : static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
    1070             :                                   const int16_t *ubuf[2], const int16_t *vbuf[2],
    1071             :                                   const int16_t *abuf[2], uint8_t *dest,
    1072             :                                   int dstW, int yalpha, int uvalpha, int y)
    1073             : {
                           /*
                            * Blends two luma lines (buf[0], buf[1]) and two chroma lines (ubuf[0],
                            * ubuf[1]) with YSCALEYUV2PACKED and stores the result to dest through
                            * WRITEYUY2. WRITEYUY2 is defined outside this block; the packed YUYV
                            * output layout is inferred from its name -- confirm there.
                            *
                            * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced in this
                            * body. The blend weights and the width are read from the context
                            * relative to &c->redDither. Note that YSCALEYUV2PACKED rewrites the
                            * stored weights (>>3) on each run, so they presumably have to be set
                            * up again before every call -- verify against the caller.
                            */
    1074           0 :     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
    1075           0 :                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
    1076             : 
    1077           0 :     __asm__ volatile(
    1078             :         "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context; it is not an asm operand */
    1079             :         "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest, fetched before the push below */
    1080             :         "push %%"FF_REG_BP"                     \n\t" /* the base pointer is handed to the macros as their index register */
    1081             :         YSCALEYUV2PACKED(%%FF_REGBP, %5)
    1082             :         WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1083             :         "pop %%"FF_REG_BP"                      \n\t" /* undo the push */
    1084             :         "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1085             :         :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %0..%4 */
    1086           0 :            "a" (&c->redDither) /* %5: base address for all *_OFFSET accesses */
    1087             :     );
    1088           0 : }
    1089             : 
    1090             : #define REAL_YSCALEYUV2RGB1(index, c) /* single-line YUV -> RGB kernel: Y from %0, U/V from %2 only; leaves 8 bytes each of B in mm2, G in mm4, R in mm5 */\
    1091             :     "xor            "#index", "#index"  \n\t" /* index = 0 */\
    1092             :     ".p2align              4            \n\t"\
    1093             :     "1:                                 \n\t" /* loop head; the branch back to it is not part of this macro */\
    1094             :     "movq     (%2, "#index"), %%mm3     \n\t" /* U: 4 words at byte offset index */\
    1095             :     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* step index to the V samples */ \
    1096             :     "movq     (%2, "#index"), %%mm4     \n\t" /* V: 4 words at byte offset index + UV_OFF_BYTE */\
    1097             :     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* index back to the U samples */ \
    1098             :     "psraw                $4, %%mm3     \n\t" /* U >> 4 */\
    1099             :     "psraw                $4, %%mm4     \n\t" /* V >> 4 */\
    1100             :     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    1101             :     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    1102             :     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    1103             :     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    1104             :     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    1105             :     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    1106             :     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    1107             :     "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index .. index+3] */\
    1108             :     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index+4 .. index+7] */\
    1109             :     "psraw                $4, %%mm1     \n\t" /* buf0[index .. index+3] >> 4 */\
    1110             :     "psraw                $4, %%mm7     \n\t" /* buf0[index+4 .. index+7] >> 4 */\
    1111             :     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    1112             :     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    1113             :     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    1114             :     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    1115             :     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    1116             :     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    1117             :     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    1118             :     "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
    1119             :     "movq              %%mm2, %%mm0     \n\t" /* keep copies for the upper four pixels */\
    1120             :     "movq              %%mm5, %%mm6     \n\t"\
    1121             :     "movq              %%mm4, %%mm3     \n\t"\
    1122             :     "punpcklwd         %%mm2, %%mm2     \n\t" /* duplicate each of the two low chroma terms: one per pixel pair */\
    1123             :     "punpcklwd         %%mm5, %%mm5     \n\t"\
    1124             :     "punpcklwd         %%mm4, %%mm4     \n\t"\
    1125             :     "paddw             %%mm1, %%mm2     \n\t" /* add Y1 (pixels 0..3) */\
    1126             :     "paddw             %%mm1, %%mm5     \n\t"\
    1127             :     "paddw             %%mm1, %%mm4     \n\t"\
    1128             :     "punpckhwd         %%mm0, %%mm0     \n\t" /* same for the two high chroma terms */\
    1129             :     "punpckhwd         %%mm6, %%mm6     \n\t"\
    1130             :     "punpckhwd         %%mm3, %%mm3     \n\t"\
    1131             :     "paddw             %%mm7, %%mm0     \n\t" /* add Y2 (pixels 4..7) */\
    1132             :     "paddw             %%mm7, %%mm6     \n\t"\
    1133             :     "paddw             %%mm7, %%mm3     \n\t"\
    1134             :     /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = pixels 0..3, 2 = pixels 4..7) */\
    1135             :     "packuswb          %%mm0, %%mm2     \n\t" /* mm2 = 8 B bytes, unsigned-saturated */\
    1136             :     "packuswb          %%mm6, %%mm5     \n\t" /* mm5 = 8 R bytes */\
    1137             :     "packuswb          %%mm3, %%mm4     \n\t" /* mm4 = 8 G bytes */\
    1138             : 
    1139             : #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c) /* extra level so the arguments are macro-expanded before REAL_* stringizes them */
    1140             : 
    1141             : // single-luma-line YUV -> RGB kernel that vertically interpolates chrominance: U/V are the sum of the lines at %2 and %3, shifted right by 5
    1142             : #define REAL_YSCALEYUV2RGB1b(index, c) /* Y from %0, U/V from %2 and %3; leaves 8 bytes each of B in mm2, G in mm4, R in mm5 */\
    1143             :     "xor            "#index", "#index"  \n\t" /* index = 0 */\
    1144             :     ".p2align              4            \n\t"\
    1145             :     "1:                                 \n\t" /* loop head; the branch back to it is not part of this macro */\
    1146             :     "movq     (%2, "#index"), %%mm2     \n\t" /* U0: chroma line 0, 4 words at byte offset index */\
    1147             :     "movq     (%3, "#index"), %%mm3     \n\t" /* U1: chroma line 1, 4 words at byte offset index */\
    1148             :     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* step index to the V samples */ \
    1149             :     "movq     (%2, "#index"), %%mm5     \n\t" /* V0: chroma line 0 at byte offset index + UV_OFF_BYTE */\
    1150             :     "movq     (%3, "#index"), %%mm4     \n\t" /* V1: chroma line 1 at byte offset index + UV_OFF_BYTE */\
    1151             :     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" /* index back to the U samples */ \
    1152             :     "paddw             %%mm2, %%mm3     \n\t" /* U0 + U1 */\
    1153             :     "paddw             %%mm5, %%mm4     \n\t" /* V0 + V1 */\
    1154             :     "psrlw                $5, %%mm3     \n\t" /* (U0 + U1) >> 5, logical shift. FIXME might overflow */\
    1155             :     "psrlw                $5, %%mm4     \n\t" /* (V0 + V1) >> 5, logical shift. FIXME might overflow */\
    1156             :     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    1157             :     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    1158             :     "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    1159             :     "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    1160             :     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    1161             :     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    1162             :     /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    1163             :     "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index .. index+3] */\
    1164             :     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index+4 .. index+7] */\
    1165             :     "psraw                $4, %%mm1     \n\t" /* buf0[index .. index+3] >> 4 */\
    1166             :     "psraw                $4, %%mm7     \n\t" /* buf0[index+4 .. index+7] >> 4 */\
    1167             :     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    1168             :     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    1169             :     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    1170             :     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    1171             :     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    1172             :     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    1173             :     /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    1174             :     "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
    1175             :     "movq              %%mm2, %%mm0     \n\t" /* keep copies for the upper four pixels */\
    1176             :     "movq              %%mm5, %%mm6     \n\t"\
    1177             :     "movq              %%mm4, %%mm3     \n\t"\
    1178             :     "punpcklwd         %%mm2, %%mm2     \n\t" /* duplicate each of the two low chroma terms: one per pixel pair */\
    1179             :     "punpcklwd         %%mm5, %%mm5     \n\t"\
    1180             :     "punpcklwd         %%mm4, %%mm4     \n\t"\
    1181             :     "paddw             %%mm1, %%mm2     \n\t" /* add Y1 (pixels 0..3) */\
    1182             :     "paddw             %%mm1, %%mm5     \n\t"\
    1183             :     "paddw             %%mm1, %%mm4     \n\t"\
    1184             :     "punpckhwd         %%mm0, %%mm0     \n\t" /* same for the two high chroma terms */\
    1185             :     "punpckhwd         %%mm6, %%mm6     \n\t"\
    1186             :     "punpckhwd         %%mm3, %%mm3     \n\t"\
    1187             :     "paddw             %%mm7, %%mm0     \n\t" /* add Y2 (pixels 4..7) */\
    1188             :     "paddw             %%mm7, %%mm6     \n\t"\
    1189             :     "paddw             %%mm7, %%mm3     \n\t"\
    1190             :     /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = pixels 0..3, 2 = pixels 4..7) */\
    1191             :     "packuswb          %%mm0, %%mm2     \n\t" /* mm2 = 8 B bytes, unsigned-saturated */\
    1192             :     "packuswb          %%mm6, %%mm5     \n\t" /* mm5 = 8 R bytes */\
    1193             :     "packuswb          %%mm3, %%mm4     \n\t" /* mm4 = 8 G bytes */\
    1194             : 
    1195             : #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c) /* extra level so the arguments are macro-expanded before REAL_* stringizes them */
    1196             : 
    1197             : #define REAL_YSCALEYUV2RGB1_ALPHA(index) /* load 8 alpha words from the line at %1, >>7, pack to bytes in mm7; overwrites mm1 */\
    1198             :     "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    1199             :     "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    1200             :     "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    1201             :     "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    1202             :     "packuswb          %%mm1, %%mm7     \n\t" /* mm7 = 8 alpha bytes, unsigned-saturated */
    1203             : #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) /* extra level so the argument is macro-expanded before REAL_* stringizes it */
    1204             : 
    1205             : /**
    1206             :  * YV12 to RGB without scaling or interpolating
                        *
                        * Converts one luma line (buf0) to 32-bit output via WRITEBGR32 (defined
                        * outside this block). Chroma comes from ubuf[0] alone when uvalpha < 2048,
                        * otherwise from ubuf[0] and ubuf[1] combined by YSCALEYUV2RGB1b.
                        * The fourth channel is taken from abuf0 when CONFIG_SWSCALE_ALPHA is set
                        * and c->needAlpha is non-zero; otherwise it is set to all-one bits.
                        *
                        * vbuf, dstW and y are not referenced in this body; the width is passed
                        * to the asm as DSTW_OFFSET relative to &c->redDither.
    1207             :  */
    1208           0 : static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
    1209             :                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
    1210             :                                 const int16_t *abuf0, uint8_t *dest,
    1211             :                                 int dstW, int uvalpha, int y)
    1212             : {
    1213           0 :     const int16_t *ubuf0 = ubuf[0];
    1214           0 :     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    1215             : 
    1216           0 :     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
    1217           0 :         const int16_t *ubuf1 = ubuf[0]; /* operand %3; YSCALEYUV2RGB1 itself only reads %2 */
    1218           0 :         if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
    1219           0 :             __asm__ volatile(
    1220             :                 "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context; it is not an asm operand */
    1221             :                 "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest, fetched before the push below */
    1222             :                 "push %%"FF_REG_BP"                     \n\t" /* the base pointer is handed to the macros as their index register */
    1223             :                 YSCALEYUV2RGB1(%%FF_REGBP, %5)
    1224             :                 YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) /* mm7 = alpha bytes read through %1 (abuf0) */
    1225             :                 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
    1226             :                 "pop %%"FF_REG_BP"                      \n\t" /* undo the push */
    1227             :                 "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1228             :                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %1 is the alpha line here */
    1229           0 :                    "a" (&c->redDither)
    1230             :             );
    1231             :         } else {
    1232           0 :             __asm__ volatile(
    1233             :                 "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context */
    1234             :                 "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest */
    1235             :                 "push %%"FF_REG_BP"                     \n\t"
    1236             :                 YSCALEYUV2RGB1(%%FF_REGBP, %5)
    1237             :                 "pcmpeqd %%mm7, %%mm7                   \n\t" /* mm7 = all ones: fourth channel 0xFF in every byte */
    1238             :                 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
    1239             :                 "pop %%"FF_REG_BP"                      \n\t"
    1240             :                 "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1241             :                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %1 = buf1, an alias of buf0 */
    1242           0 :                    "a" (&c->redDither)
    1243             :             );
    1244             :         }
    1245             :     } else {
    1246           0 :         const int16_t *ubuf1 = ubuf[1]; /* second chroma line, summed with ubuf0 by YSCALEYUV2RGB1b */
    1247           0 :         if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
    1248           0 :             __asm__ volatile(
    1249             :                 "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context */
    1250             :                 "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest */
    1251             :                 "push %%"FF_REG_BP"                     \n\t"
    1252             :                 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
    1253             :                 YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) /* mm7 = alpha bytes read through %1 (abuf0) */
    1254             :                 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
    1255             :                 "pop %%"FF_REG_BP"                      \n\t"
    1256             :                 "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1257             :                 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %1 is the alpha line here */
    1258           0 :                    "a" (&c->redDither)
    1259             :             );
    1260             :         } else {
    1261           0 :             __asm__ volatile(
    1262             :                 "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context */
    1263             :                 "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest */
    1264             :                 "push %%"FF_REG_BP"                     \n\t"
    1265             :                 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
    1266             :                 "pcmpeqd %%mm7, %%mm7                   \n\t" /* mm7 = all ones: fourth channel 0xFF in every byte */
    1267             :                 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
    1268             :                 "pop %%"FF_REG_BP"                      \n\t"
    1269             :                 "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1270             :                 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %1 = buf1, an alias of buf0 */
    1271           0 :                    "a" (&c->redDither)
    1272             :             );
    1273             :         }
    1274             :     }
    1275           0 : }
    1276             : 
    1277           0 : static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
    1278             :                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
    1279             :                                 const int16_t *abuf0, uint8_t *dest,
    1280             :                                 int dstW, int uvalpha, int y)
    1281             : {
                           /*
                            * Converts one luma line (buf0) and stores the result to dest through
                            * WRITEBGR24 (defined outside this block; packed 24-bit BGR going by
                            * its name -- confirm there). Chroma comes from ubuf[0] alone when
                            * uvalpha < 2048, otherwise from ubuf[0] and ubuf[1] combined by
                            * YSCALEYUV2RGB1b.
                            *
                            * vbuf, abuf0, dstW and y are not referenced in this body; the width
                            * is passed to the asm as DSTW_OFFSET relative to &c->redDither.
                            */
    1282           0 :     const int16_t *ubuf0 = ubuf[0];
    1283           0 :     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    1284             : 
    1285           0 :     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
    1286           0 :         const int16_t *ubuf1 = ubuf[0]; /* operand %3; YSCALEYUV2RGB1 itself only reads %2 */
    1287           0 :         __asm__ volatile(
    1288             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context; it is not an asm operand */
    1289             :             "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest, fetched before the push below */
    1290             :             "push %%"FF_REG_BP"                     \n\t" /* the base pointer is handed to the macros as their index register */
    1291             :             YSCALEYUV2RGB1(%%FF_REGBP, %5)
    1292             :             "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
    1293             :             WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1294             :             "pop %%"FF_REG_BP"                      \n\t" /* undo the push */
    1295             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1296             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %0..%4 */
    1297           0 :                "a" (&c->redDither) /* %5: base address for all *_OFFSET accesses */
    1298             :                NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) /* defined elsewhere; presumably makes these constants addressable from the asm */
    1299             :         );
    1300             :     } else {
    1301           0 :         const int16_t *ubuf1 = ubuf[1]; /* second chroma line, summed with ubuf0 by YSCALEYUV2RGB1b */
    1302           0 :         __asm__ volatile(
    1303             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context */
    1304             :             "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest */
    1305             :             "push %%"FF_REG_BP"                     \n\t"
    1306             :             YSCALEYUV2RGB1b(%%FF_REGBP, %5)
    1307             :             "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
    1308             :             WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1309             :             "pop %%"FF_REG_BP"                      \n\t"
    1310             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1311             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
    1312           0 :                "a" (&c->redDither)
    1313             :                NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    1314             :         );
    1315             :     }
    1316           0 : }
    1317             : 
    1318           0 : static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
    1319             :                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
    1320             :                                  const int16_t *abuf0, uint8_t *dest,
    1321             :                                  int dstW, int uvalpha, int y)
    1322             : {
                           /*
                            * Converts one luma line (buf0), optionally adds the per-channel dither
                            * values stored in the context, and stores the result to dest through
                            * WRITERGB15 (defined outside this block; 15-bit output going by its
                            * name -- confirm there). Chroma comes from ubuf[0] alone when
                            * uvalpha < 2048, otherwise from ubuf[0] and ubuf[1] combined by
                            * YSCALEYUV2RGB1b.
                            *
                            * vbuf, abuf0, dstW and y are not referenced in this body; the width
                            * is passed to the asm as DSTW_OFFSET relative to &c->redDither.
                            */
    1323           0 :     const int16_t *ubuf0 = ubuf[0];
    1324           0 :     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    1325             : 
    1326           0 :     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
    1327           0 :         const int16_t *ubuf1 = ubuf[0]; /* operand %3; YSCALEYUV2RGB1 itself only reads %2 */
    1328           0 :         __asm__ volatile(
    1329             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context; it is not an asm operand */
    1330             :             "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest, fetched before the push below */
    1331             :             "push %%"FF_REG_BP"                     \n\t" /* the base pointer is handed to the macros as their index register */
    1332             :             YSCALEYUV2RGB1(%%FF_REGBP, %5)
    1333             :             "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
    1334             :             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    1335             : #ifdef DITHER1XBPP
    1336             :             "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t" /* add dither bytes with unsigned saturation */
    1337             :             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
    1338             :             "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
    1339             : #endif
    1340             :             WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1341             :             "pop %%"FF_REG_BP"                      \n\t" /* undo the push */
    1342             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1343             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), /* %0..%4 */
    1344           0 :                "a" (&c->redDither) /* %5: base address for all *_OFFSET / *_DITHER accesses */
    1345             :                NAMED_CONSTRAINTS_ADD(bF8) /* defined elsewhere; presumably makes the constant addressable from the asm */
    1346             :         );
    1347             :     } else {
    1348           0 :         const int16_t *ubuf1 = ubuf[1]; /* second chroma line, summed with ubuf0 by YSCALEYUV2RGB1b */
    1349           0 :         __asm__ volatile(
    1350             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t" /* park FF_REG_b in the context */
    1351             :             "mov           %4, %%"FF_REG_b"         \n\t" /* FF_REG_b = dest */
    1352             :             "push %%"FF_REG_BP"                     \n\t"
    1353             :             YSCALEYUV2RGB1b(%%FF_REGBP, %5)
    1354             :             "pxor    %%mm7, %%mm7                   \n\t" /* mm7 = 0 */
    1355             :             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    1356             : #ifdef DITHER1XBPP
    1357             :             "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t" /* add dither bytes with unsigned saturation */
    1358             :             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
    1359             :             "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
    1360             : #endif
    1361             :             WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1362             :             "pop %%"FF_REG_BP"                      \n\t"
    1363             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t" /* restore the parked FF_REG_b */
    1364             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
    1365           0 :                "a" (&c->redDither)
    1366             :                NAMED_CONSTRAINTS_ADD(bF8)
    1367             :         );
    1368             :     }
    1369           0 : }
    1370             : 
    /*
     * Write one output line to dest from a single input line buf0 plus the
     * chroma line(s) in ubuf, using MMX inline assembly.
     *
     * c        context; &c->redDither is handed to the asm in the accumulator
     *          register (operand %5) and serves as the base address for
     *          ESP_OFFSET, DSTW_OFFSET and the *_DITHER displacements.
     * buf0     first input line (asm operand %0).
     * ubuf     ubuf[0] is always used; ubuf[1] is used only when
     *          uvalpha >= 2048.
     * vbuf, abuf0, y
     *          not referenced by this function.
     * dest     destination pointer (asm operand %4, loaded into REG_b).
     * dstW     not referenced by name; the asm reads DSTW_OFFSET(%5) instead,
     *          presumably the same width stored in the context - TODO confirm.
     * uvalpha  selects the chroma path: < 2048 uses YSCALEYUV2RGB1 with ubuf[0]
     *          only, otherwise YSCALEYUV2RGB1b with ubuf[0] and ubuf[1].
     *
     * YSCALEYUV2RGB1, YSCALEYUV2RGB1b and WRITERGB16 are defined outside this
     * chunk; WRITERGB16 presumably packs and stores 16-bit RGB565 pixels.
     */
    1371           0 : static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
    1372             :                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
    1373             :                                  const int16_t *abuf0, uint8_t *dest,
    1374             :                                  int dstW, int uvalpha, int y)
    1375             : {
    1376           0 :     const int16_t *ubuf0 = ubuf[0];
    1377           0 :     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    1378             : 
    1379           0 :     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
                           /* Single chroma line: operand %3 aliases ubuf[0]. */
    1380           0 :         const int16_t *ubuf1 = ubuf[0];
    1381           0 :         __asm__ volatile(
                               /* Save REG_b in the context slot at ESP_OFFSET, then reuse it as the dest pointer. */
    1382             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
    1383             :             "mov           %4, %%"FF_REG_b"         \n\t"
                               /* REG_BP is passed to the macros below as the loop index, so preserve it on the stack. */
    1384             :             "push %%"FF_REG_BP"                     \n\t"
    1385             :             YSCALEYUV2RGB1(%%FF_REGBP, %5)
    1386             :             "pxor    %%mm7, %%mm7                   \n\t"
    1387             :             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    1388             : #ifdef DITHER1XBPP
                               /* Add the per-channel dither values with unsigned byte saturation. */
    1389             :             "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
    1390             :             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
    1391             :             "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
    1392             : #endif
    1393             :             WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
                               /* Restore REG_BP and REG_b in reverse order of the saves above. */
    1394             :             "pop %%"FF_REG_BP"                      \n\t"
    1395             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
                               /* Inputs only: %0=buf0, %1=buf1, %2=ubuf0, %3=ubuf1, %4=dest (memory), %5=&c->redDither. */
    1396             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
    1397           0 :                "a" (&c->redDither)
    1398             :                NAMED_CONSTRAINTS_ADD(bF8,bFC)
    1399             :         );
    1400             :     } else {
                           /* Two chroma lines: operand %3 is ubuf[1]; same register protocol as the branch above. */
    1401           0 :         const int16_t *ubuf1 = ubuf[1];
    1402           0 :         __asm__ volatile(
    1403             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
    1404             :             "mov           %4, %%"FF_REG_b"         \n\t"
    1405             :             "push %%"FF_REG_BP"                     \n\t"
    1406             :             YSCALEYUV2RGB1b(%%FF_REGBP, %5)
    1407             :             "pxor    %%mm7, %%mm7                   \n\t"
    1408             :             /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    1409             : #ifdef DITHER1XBPP
    1410             :             "paddusb "BLUE_DITHER"(%5), %%mm2       \n\t"
    1411             :             "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
    1412             :             "paddusb "RED_DITHER"(%5), %%mm5        \n\t"
    1413             : #endif
    1414             :             WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1415             :             "pop %%"FF_REG_BP"                      \n\t"
    1416             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
    1417             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
    1418           0 :                "a" (&c->redDither)
    1419             :                NAMED_CONSTRAINTS_ADD(bF8,bFC)
    1420             :         );
    1421             :     }
    1422           0 : }
    1423             : 
    /*
     * Asm fragment: loop head for packed output from a single chroma line.
     *
     * index  register used as the loop counter; zeroed here.
     * c      register holding the base address for the UV_OFF_BYTE displacement.
     *
     * It zeroes index, opens the loop at local label "1:" and loads:
     *   mm3 = words at (%2 + index)                         >> 7 (arithmetic)
     *   mm4 = words at (%2 + index + UV_OFF_BYTE(c))        >> 7 (arithmetic)
     *   mm1 = words at (%0 + 2*index)                       >> 7 (arithmetic)
     *   mm7 = words at (%0 + 2*index + 8)                   >> 7 (arithmetic)
     * index is restored after the temporary UV_OFF_BYTE adjustment.
     * The loop is not closed here; presumably the WRITE* macro that follows
     * advances index and jumps back to "1:" - TODO confirm.
     *
     * YSCALEYUV2PACKED1 adds one level of macro expansion so that arguments are
     * expanded before being stringified with '#'.
     */
    1424             : #define REAL_YSCALEYUV2PACKED1(index, c) \
    1425             :     "xor            "#index", "#index"  \n\t"\
    1426             :     ".p2align              4            \n\t"\
    1427             :     "1:                                 \n\t"\
    1428             :     "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[index]*/\
    1429             :     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    1430             :     "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[index + UV_OFF_BYTE]*/\
    1431             :     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    1432             :     "psraw                $7, %%mm3     \n\t" \
    1433             :     "psraw                $7, %%mm4     \n\t" \
    1434             :     "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index], first 4 words */\
    1435             :     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index], next 4 words */\
    1436             :     "psraw                $7, %%mm1     \n\t" \
    1437             :     "psraw                $7, %%mm7     \n\t" \
    1438             : 
    1439             : #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
    1440             : 
    /*
     * Asm fragment: loop head for packed output that combines two chroma lines.
     *
     * index  register used as the loop counter; zeroed here.
     * c      register holding the base address for the UV_OFF_BYTE displacement.
     *
     * It zeroes index, opens the loop at local label "1:" and computes:
     *   mm3 = (words at (%2 + index) + words at (%3 + index))               >> 8
     *   mm4 = (words at (%2 + index + UV_OFF_BYTE(c))
     *          + words at (%3 + index + UV_OFF_BYTE(c)))                    >> 8
     *   mm1 = words at (%0 + 2*index)     >> 7 (arithmetic)
     *   mm7 = words at (%0 + 2*index + 8) >> 7 (arithmetic)
     * The chroma shifts use psrlw (logical), the others psraw (arithmetic).
     * mm2 and mm5 are used as temporaries. The loop is not closed here;
     * presumably the WRITE* macro that follows advances index and jumps back
     * to "1:" - TODO confirm.
     *
     * YSCALEYUV2PACKED1b adds one level of macro expansion so that arguments
     * are expanded before being stringified with '#'.
     */
    1441             : #define REAL_YSCALEYUV2PACKED1b(index, c) \
    1442             :     "xor "#index", "#index"             \n\t"\
    1443             :     ".p2align              4            \n\t"\
    1444             :     "1:                                 \n\t"\
    1445             :     "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
    1446             :     "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
    1447             :     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    1448             :     "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[index + UV_OFF_BYTE]*/\
    1449             :     "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[index + UV_OFF_BYTE]*/\
    1450             :     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    1451             :     "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index] + uvbuf1[index]*/\
    1452             :     "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index + UV_OFF_BYTE] + uvbuf1[index + UV_OFF_BYTE]*/\
    1453             :     "psrlw                $8, %%mm3     \n\t" \
    1454             :     "psrlw                $8, %%mm4     \n\t" \
    1455             :     "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index], first 4 words */\
    1456             :     "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index], next 4 words */\
    1457             :     "psraw                $7, %%mm1     \n\t" \
    1458             :     "psraw                $7, %%mm7     \n\t"
    1459             : #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
    1460             : 
    /*
     * Write one output line to dest from a single input line buf0 plus the
     * chroma line(s) in ubuf, using MMX inline assembly.
     *
     * c        context; &c->redDither is handed to the asm in the accumulator
     *          register (operand %5) and serves as the base address for
     *          ESP_OFFSET and DSTW_OFFSET (and UV_OFF_BYTE inside the
     *          YSCALEYUV2PACKED1* macros).
     * buf0     first input line (asm operand %0).
     * ubuf     ubuf[0] is always used; ubuf[1] is used only when
     *          uvalpha >= 2048.
     * vbuf, abuf0, y
     *          not referenced by this function.
     * dest     destination pointer (asm operand %4, loaded into REG_b).
     * dstW     not referenced by name; the asm reads DSTW_OFFSET(%5) instead,
     *          presumably the same width stored in the context - TODO confirm.
     * uvalpha  selects the chroma path: < 2048 uses YSCALEYUV2PACKED1 (one
     *          chroma line), otherwise YSCALEYUV2PACKED1b (sum of two lines).
     *
     * WRITEYUY2 is defined outside this chunk; presumably it interleaves and
     * stores the YUYV pixels and closes the loop opened by the macro before it.
     */
    1461           0 : static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
    1462             :                                   const int16_t *ubuf[2], const int16_t *vbuf[2],
    1463             :                                   const int16_t *abuf0, uint8_t *dest,
    1464             :                                   int dstW, int uvalpha, int y)
    1465             : {
    1466           0 :     const int16_t *ubuf0 = ubuf[0];
    1467           0 :     const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    1468             : 
    1469           0 :     if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
                           /* Single chroma line: operand %3 aliases ubuf[0]. */
    1470           0 :         const int16_t *ubuf1 = ubuf[0];
    1471           0 :         __asm__ volatile(
                               /* Save REG_b in the context slot at ESP_OFFSET, then reuse it as the dest pointer. */
    1472             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
    1473             :             "mov           %4, %%"FF_REG_b"         \n\t"
                               /* REG_BP is passed to the macros below as the loop index, so preserve it on the stack. */
    1474             :             "push %%"FF_REG_BP"                     \n\t"
    1475             :             YSCALEYUV2PACKED1(%%FF_REGBP, %5)
    1476             :             WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
                               /* Restore REG_BP and REG_b in reverse order of the saves above. */
    1477             :             "pop %%"FF_REG_BP"                      \n\t"
    1478             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
                               /* Inputs only: %0=buf0, %1=buf1, %2=ubuf0, %3=ubuf1, %4=dest (memory), %5=&c->redDither. */
    1479             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
    1480           0 :                "a" (&c->redDither)
    1481             :         );
    1482             :     } else {
                           /* Two chroma lines: operand %3 is ubuf[1]; same register protocol as the branch above. */
    1483           0 :         const int16_t *ubuf1 = ubuf[1];
    1484           0 :         __asm__ volatile(
    1485             :             "mov %%"FF_REG_b", "ESP_OFFSET"(%5)     \n\t"
    1486             :             "mov           %4, %%"FF_REG_b"         \n\t"
    1487             :             "push %%"FF_REG_BP"                     \n\t"
    1488             :             YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
    1489             :             WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
    1490             :             "pop %%"FF_REG_BP"                      \n\t"
    1491             :             "mov "ESP_OFFSET"(%5), %%"FF_REG_b"     \n\t"
    1492             :             :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
    1493           0 :                "a" (&c->redDither)
    1494             :         );
    1495             :     }
    1496           0 : }
    /*
     * Install this template's optimized routines into the context's function
     * pointers, based on c->dstFormat, c->flags, c->srcBpc and c->dstBpc.
     *
     * Fields written: use_mmx_vfilter, yuv2planeX, yuv2packedX, yuv2packed1,
     * yuv2packed2, hyscale_fast, hcscale_fast. Pointers for cases not listed
     * in the switches below are left as they were on entry.
     */
    1497           6 : static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
    1498             : {
    1499           6 :     enum AVPixelFormat dstFormat = c->dstFormat;
    1500             : 
    1501           6 :     c->use_mmx_vfilter= 0;
                       /* Vertical/output routines: skipped entirely for 16-bit and N-bit
                        * destinations, for NV12/NV21, and when SWS_BITEXACT is requested. */
    1502           6 :     if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
    1503           6 :         && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
    1504           0 :             if (c->flags & SWS_ACCURATE_RND) {
                               /* Accurate rounding: only the "_ar" packed multi-tap writers are
                                * installed; yuv2planeX and use_mmx_vfilter stay untouched.
                                * There is no BGR32 case in this switch. */
    1505           0 :                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
    1506           0 :                     switch (c->dstFormat) {
    1507           0 :                     case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
    1508             : #if HAVE_6REGS
    1509           0 :                     case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
    1510             : #endif
    1511           0 :                     case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
    1512           0 :                     case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
    1513           0 :                     case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
    1514           0 :                     default: break;
    1515             :                     }
    1516             :                 }
    1517             :             } else {
                               /* Default rounding: enable the MMX vertical filter and the planar
                                * writer, plus the packed multi-tap writers. */
    1518           0 :                 c->use_mmx_vfilter= 1;
    1519           0 :                 c->yuv2planeX = RENAME(yuv2yuvX    );
    1520           0 :                 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
    1521           0 :                     switch (c->dstFormat) {
    1522           0 :                     case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
    1523           0 :                     case AV_PIX_FMT_BGR32:   c->yuv2packedX = RENAME(yuv2bgr32_X);   break;
    1524             : #if HAVE_6REGS
    1525           0 :                     case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
    1526             : #endif
    1527           0 :                     case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
    1528           0 :                     case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
    1529           0 :                     case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
    1530           0 :                     default: break;
    1531             :                     }
    1532             :                 }
    1533             :             }
                       /* One- and two-line packed writers, installed regardless of
                        * SWS_ACCURATE_RND.
                        * NOTE(review): unlike the yuv2packedX switches above, the BGR24
                        * case here is not guarded by HAVE_6REGS - confirm this is intended. */
    1534           0 :         if (!(c->flags & SWS_FULL_CHR_H_INT)) {
    1535           0 :             switch (c->dstFormat) {
    1536             :             case AV_PIX_FMT_RGB32:
    1537           0 :                 c->yuv2packed1 = RENAME(yuv2rgb32_1);
    1538           0 :                 c->yuv2packed2 = RENAME(yuv2rgb32_2);
    1539           0 :                 break;
    1540             :             case AV_PIX_FMT_BGR24:
    1541           0 :                 c->yuv2packed1 = RENAME(yuv2bgr24_1);
    1542           0 :                 c->yuv2packed2 = RENAME(yuv2bgr24_2);
    1543           0 :                 break;
    1544             :             case AV_PIX_FMT_RGB555:
    1545           0 :                 c->yuv2packed1 = RENAME(yuv2rgb555_1);
    1546           0 :                 c->yuv2packed2 = RENAME(yuv2rgb555_2);
    1547           0 :                 break;
    1548             :             case AV_PIX_FMT_RGB565:
    1549           0 :                 c->yuv2packed1 = RENAME(yuv2rgb565_1);
    1550           0 :                 c->yuv2packed2 = RENAME(yuv2rgb565_2);
    1551           0 :                 break;
    1552             :             case AV_PIX_FMT_YUYV422:
    1553           0 :                 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
    1554           0 :                 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
    1555           0 :                 break;
    1556             :             default:
    1557           0 :                 break;
    1558             :             }
    1559             :         }
    1560             :     }
    1561             : 
                       /* Fast horizontal scalers: only considered for 8-bit sources with a
                        * destination depth of at most 14 bits. */
    1562           6 :     if (c->srcBpc == 8 && c->dstBpc <= 14) {
    1563             :     // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
                       /* When COMPILE_TEMPLATE_MMXEXT is off, the if/else below disappears and
                        * both pointers are unconditionally set to NULL. */
    1564             : #if COMPILE_TEMPLATE_MMXEXT
    1565           2 :     if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
    1566           0 :         c->hyscale_fast = ff_hyscale_fast_mmxext;
    1567           0 :         c->hcscale_fast = ff_hcscale_fast_mmxext;
    1568             :     } else {
    1569             : #endif /* COMPILE_TEMPLATE_MMXEXT */
    1570           4 :         c->hyscale_fast = NULL;
    1571           4 :         c->hcscale_fast = NULL;
    1572             : #if COMPILE_TEMPLATE_MMXEXT
    1573             :     }
    1574             : #endif /* COMPILE_TEMPLATE_MMXEXT */
    1575             :     }
    1576           6 : }

Generated by: LCOV version 1.12