FFmpeg coverage

Directory:	../../../ffmpeg/
File:	src/libavcodec/x86/fdct.c
Date:	2026-05-02 21:46:34

	Exec	Total	Coverage
Lines:	9	9	100.0%
Functions:	3	3	100.0%
Branches:	0	0	-%

  
      Line
      Branch
      Exec
      Source
    
      /*
    
       * SIMD-optimized forward DCT
    
       * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
    
       * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
    
       * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
    
       *
    
       * from  fdctam32.c - AP922 MMX(3D-Now) forward-DCT
    
       *
    
       *  Intel Application Note AP-922 - fast, precise implementation of DCT
    
       *        http://developer.intel.com/vtune/cbts/appnotes.htm
    
       *
    
       * Also of inspiration:
    
       * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
    
       * Skal's fdct at http://skal.planet-d.net/coding/dct.html
    
       *
    
       * This file is part of FFmpeg.
    
       *
    
       * FFmpeg is free software; you can redistribute it and/or
    
       * modify it under the terms of the GNU Lesser General Public
    
       * License as published by the Free Software Foundation; either
    
       * version 2.1 of the License, or (at your option) any later version.
    
       *
    
       * FFmpeg is distributed in the hope that it will be useful,
    
       * but WITHOUT ANY WARRANTY; without even the implied warranty of
    
       * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    
       * Lesser General Public License for more details.
    
       *
    
       * You should have received a copy of the GNU Lesser General Public
    
       * License along with FFmpeg; if not, write to the Free Software
    
       * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
    
       */
    
      #include "config.h"
    
      #include "libavutil/attributes.h"
    
      #include "libavutil/macros.h"
    
      #include "libavutil/mem_internal.h"
    
      #include "libavutil/x86/asm.h"
    
      #include "fdct.h"
    
      #if HAVE_SSE2_INLINE
    
      //////////////////////////////////////////////////////////////////////
    
      //
    
      // constants for the forward DCT
    
      // -----------------------------
    
      //
    
      //////////////////////////////////////////////////////////////////////
    
      #define BITS_FRW_ACC   3 //; 2 or 3 for accuracy
    
      #define SHIFT_FRW_COL  BITS_FRW_ACC
    
      #define SHIFT_FRW_ROW  (BITS_FRW_ACC + 17 - 3)
    
      #define RND_FRW_ROW    (1 << (SHIFT_FRW_ROW-1))
    
      //#define RND_FRW_COL    (1 << (SHIFT_FRW_COL-1))
    
      #define X8(x) x,x,x,x,x,x,x,x
    
      //concatenated table, for forward DCT transformation
    
      DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
    
          X8(13036),  // tg * (2<<16) + 0.5
    
          X8(27146),  // tg * (2<<16) + 0.5
    
          X8(-21746)  // tg * (2<<16) + 0.5
    
      };
    
      DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
    
          X8(23170)   //cos * (2<<15) + 0.5
    
      };
    
      DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
    
      static const struct
    
      {
    
       DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
    
      } fdct_r_row_sse2 =
    
      {{
    
       RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
    
      }};
    
      //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
    
      static const struct
    
      {
    
       DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
    
      } tab_frw_01234567_sse2 =
    
      {{
    
      //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = {  // forward_dct coeff table
    
      #define TABLE_SSE2 C4,  C4,  C1,  C3, -C6, -C2, -C1, -C5, \
    
                         C4,  C4,  C5,  C7,  C2,  C6,  C3, -C7, \
    
                        -C4,  C4,  C7,  C3,  C6, -C2,  C7, -C5, \
    
                         C4, -C4,  C5, -C1,  C2, -C6,  C3, -C1,
    
      // c1..c7 * cos(pi/4) * 2^15
    
      #define C1 22725
    
      #define C2 21407
    
      #define C3 19266
    
      #define C4 16384
    
      #define C5 12873
    
      #define C6 8867
    
      #define C7 4520
    
      TABLE_SSE2
    
      #undef C1
    
      #undef C2
    
      #undef C3
    
      #undef C4
    
      #undef C5
    
      #undef C6
    
      #undef C7
    
      #define C1 31521
    
      #define C2 29692
    
      #define C3 26722
    
      #define C4 22725
    
      #define C5 17855
    
      #define C6 12299
    
      #define C7 6270
    
      TABLE_SSE2
    
      #undef C1
    
      #undef C2
    
      #undef C3
    
      #undef C4
    
      #undef C5
    
      #undef C6
    
      #undef C7
    
      #define C1 29692
    
      #define C2 27969
    
      #define C3 25172
    
      #define C4 21407
    
      #define C5 16819
    
      #define C6 11585
    
      #define C7 5906
    
      TABLE_SSE2
    
      #undef C1
    
      #undef C2
    
      #undef C3
    
      #undef C4
    
      #undef C5
    
      #undef C6
    
      #undef C7
    
      #define C1 26722
    
      #define C2 25172
    
      #define C3 22654
    
      #define C4 19266
    
      #define C5 15137
    
      #define C6 10426
    
      #define C7 5315
    
      TABLE_SSE2
    
      #undef C1
    
      #undef C2
    
      #undef C3
    
      #undef C4
    
      #undef C5
    
      #undef C6
    
      #undef C7
    
      #define C1 22725
    
      #define C2 21407
    
      #define C3 19266
    
      #define C4 16384
    
      #define C5 12873
    
      #define C6 8867
    
      #define C7 4520
    
      TABLE_SSE2
    
      #undef C1
    
      #undef C2
    
      #undef C3
    
      #undef C4
    
      #undef C5
    
      #undef C6
    
      #undef C7
    
      #define C1 26722
    
      #define C2 25172
    
      #define C3 22654
    
      #define C4 19266
    
      #define C5 15137
    
      #define C6 10426
    
      #define C7 5315
    
      TABLE_SSE2
    
      #undef C1
    
      #undef C2
    
      #undef C3
    
      #undef C4
    
      #undef C5
    
      #undef C6
    
      #undef C7
    
      #define C1 29692
    
      #define C2 27969
    
      #define C3 25172
    
      #define C4 21407
    
      #define C5 16819
    
      #define C6 11585
    
      #define C7 5906
    
      TABLE_SSE2
    
      #undef C1
    
      #undef C2
    
      #undef C3
    
      #undef C4
    
      #undef C5
    
      #undef C6
    
      #undef C7
    
      #define C1 31521
    
      #define C2 29692
    
      #define C3 26722
    
      #define C4 22725
    
      #define C5 17855
    
      #define C6 12299
    
      #define C7 6270
    
      TABLE_SSE2
    
      }};
    
      #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
    
      #define FDCT_COL(cpu, mm, mov)\
    
      static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
    
      {\
    
          __asm__ volatile (\
    
              #mov"      16(%0),  %%"#mm"0 \n\t" \
    
              #mov"      96(%0),  %%"#mm"1 \n\t" \
    
              #mov"    %%"#mm"0,  %%"#mm"2 \n\t" \
    
              #mov"      32(%0),  %%"#mm"3 \n\t" \
    
              "paddsw  %%"#mm"1,  %%"#mm"0 \n\t" \
    
              #mov"      80(%0),  %%"#mm"4 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
    
              #mov"        (%0),  %%"#mm"5 \n\t" \
    
              "paddsw  %%"#mm"3,  %%"#mm"4 \n\t" \
    
              "paddsw   112(%0),  %%"#mm"5 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
    
              #mov"    %%"#mm"0,  %%"#mm"6 \n\t" \
    
              "psubsw  %%"#mm"1,  %%"#mm"2 \n\t" \
    
              #mov"      16(%1),  %%"#mm"1 \n\t" \
    
              "psubsw  %%"#mm"4,  %%"#mm"0 \n\t" \
    
              #mov"      48(%0),  %%"#mm"7 \n\t" \
    
              "pmulhw  %%"#mm"0,  %%"#mm"1 \n\t" \
    
              "paddsw    64(%0),  %%"#mm"7 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
    
              "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
    
              #mov"    %%"#mm"5,  %%"#mm"4 \n\t" \
    
              "psubsw  %%"#mm"7,  %%"#mm"5 \n\t" \
    
              "paddsw  %%"#mm"5,  %%"#mm"1 \n\t" \
    
              "paddsw  %%"#mm"7,  %%"#mm"4 \n\t" \
    
              "por         (%2),  %%"#mm"1 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
    
              "pmulhw    16(%1),  %%"#mm"5 \n\t" \
    
              #mov"    %%"#mm"4,  %%"#mm"7 \n\t" \
    
              "psubsw    80(%0),  %%"#mm"3 \n\t" \
    
              "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
    
              #mov"    %%"#mm"1,    32(%3) \n\t" \
    
              "paddsw  %%"#mm"6,  %%"#mm"7 \n\t" \
    
              #mov"      48(%0),  %%"#mm"1 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
    
              "psubsw    64(%0),  %%"#mm"1 \n\t" \
    
              #mov"    %%"#mm"2,  %%"#mm"6 \n\t" \
    
              #mov"    %%"#mm"4,    64(%3) \n\t" \
    
              "paddsw  %%"#mm"3,  %%"#mm"2 \n\t" \
    
              "pmulhw      (%4),  %%"#mm"2 \n\t" \
    
              "psubsw  %%"#mm"3,  %%"#mm"6 \n\t" \
    
              "pmulhw      (%4),  %%"#mm"6 \n\t" \
    
              "psubsw  %%"#mm"0,  %%"#mm"5 \n\t" \
    
              "por         (%2),  %%"#mm"5 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
    
              "por         (%2),  %%"#mm"2 \n\t" \
    
              #mov"    %%"#mm"1,  %%"#mm"4 \n\t" \
    
              #mov"        (%0),  %%"#mm"3 \n\t" \
    
              "paddsw  %%"#mm"6,  %%"#mm"1 \n\t" \
    
              "psubsw   112(%0),  %%"#mm"3 \n\t" \
    
              "psubsw  %%"#mm"6,  %%"#mm"4 \n\t" \
    
              #mov"        (%1),  %%"#mm"0 \n\t" \
    
              "psllw  $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
    
              #mov"      32(%1),  %%"#mm"6 \n\t" \
    
              "pmulhw  %%"#mm"1,  %%"#mm"0 \n\t" \
    
              #mov"    %%"#mm"7,      (%3) \n\t" \
    
              "pmulhw  %%"#mm"4,  %%"#mm"6 \n\t" \
    
              #mov"    %%"#mm"5,    96(%3) \n\t" \
    
              #mov"    %%"#mm"3,  %%"#mm"7 \n\t" \
    
              #mov"      32(%1),  %%"#mm"5 \n\t" \
    
              "psubsw  %%"#mm"2,  %%"#mm"7 \n\t" \
    
              "paddsw  %%"#mm"2,  %%"#mm"3 \n\t" \
    
              "pmulhw  %%"#mm"7,  %%"#mm"5 \n\t" \
    
              "paddsw  %%"#mm"3,  %%"#mm"0 \n\t" \
    
              "paddsw  %%"#mm"4,  %%"#mm"6 \n\t" \
    
              "pmulhw      (%1),  %%"#mm"3 \n\t" \
    
              "por         (%2),  %%"#mm"0 \n\t" \
    
              "paddsw  %%"#mm"7,  %%"#mm"5 \n\t" \
    
              "psubsw  %%"#mm"6,  %%"#mm"7 \n\t" \
    
              #mov"    %%"#mm"0,    16(%3) \n\t" \
    
              "paddsw  %%"#mm"4,  %%"#mm"5 \n\t" \
    
              #mov"    %%"#mm"7,    48(%3) \n\t" \
    
              "psubsw  %%"#mm"1,  %%"#mm"3 \n\t" \
    
              #mov"    %%"#mm"5,    80(%3) \n\t" \
    
              #mov"    %%"#mm"3,   112(%3) \n\t" \
    
              : \
    
              : "r" (in  + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
    
                "r" (out + offset), "r" (ocos_4_16)); \
    
      }
    
      27291
      FDCT_COL(sse2, xmm, movdqa)
    
      27291
      static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
    
      {
    
      27291
          __asm__ volatile(
    
      #define FDCT_ROW_SSE2_H1(i,t)                    \
    
              "movq      " #i "(%0), %%xmm2      \n\t" \
    
              "movq      " #i "+8(%0), %%xmm0    \n\t" \
    
              "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
    
              "movdqa    " #t "+48(%1), %%xmm7   \n\t" \
    
              "movdqa    " #t "(%1), %%xmm4      \n\t" \
    
              "movdqa    " #t "+16(%1), %%xmm5   \n\t"
    
      #define FDCT_ROW_SSE2_H2(i,t)                    \
    
              "movq      " #i "(%0), %%xmm2      \n\t" \
    
              "movq      " #i "+8(%0), %%xmm0    \n\t" \
    
              "movdqa    " #t "+32(%1), %%xmm3   \n\t" \
    
              "movdqa    " #t "+48(%1), %%xmm7   \n\t"
    
      #define FDCT_ROW_SSE2(i)                      \
    
              "movq      %%xmm2, %%xmm1       \n\t" \
    
              "pshuflw   $27, %%xmm0, %%xmm0  \n\t" \
    
              "paddsw    %%xmm0, %%xmm1       \n\t" \
    
              "psubsw    %%xmm0, %%xmm2       \n\t" \
    
              "punpckldq %%xmm2, %%xmm1       \n\t" \
    
              "pshufd    $78, %%xmm1, %%xmm2  \n\t" \
    
              "pmaddwd   %%xmm2, %%xmm3       \n\t" \
    
              "pmaddwd   %%xmm1, %%xmm7       \n\t" \
    
              "pmaddwd   %%xmm5, %%xmm2       \n\t" \
    
              "pmaddwd   %%xmm4, %%xmm1       \n\t" \
    
              "paddd     %%xmm7, %%xmm3       \n\t" \
    
              "paddd     %%xmm2, %%xmm1       \n\t" \
    
              "paddd     %%xmm6, %%xmm3       \n\t" \
    
              "paddd     %%xmm6, %%xmm1       \n\t" \
    
              "psrad     %3, %%xmm3           \n\t" \
    
              "psrad     %3, %%xmm1           \n\t" \
    
              "packssdw  %%xmm3, %%xmm1       \n\t" \
    
              "movdqa    %%xmm1, " #i "(%4)   \n\t"
    
              "movdqa    (%2), %%xmm6         \n\t"
    
              FDCT_ROW_SSE2_H1(0,0)
    
              FDCT_ROW_SSE2(0)
    
              FDCT_ROW_SSE2_H2(64,0)
    
              FDCT_ROW_SSE2(64)
    
              FDCT_ROW_SSE2_H1(16,64)
    
              FDCT_ROW_SSE2(16)
    
              FDCT_ROW_SSE2_H2(112,64)
    
              FDCT_ROW_SSE2(112)
    
              FDCT_ROW_SSE2_H1(32,128)
    
              FDCT_ROW_SSE2(32)
    
              FDCT_ROW_SSE2_H2(96,128)
    
              FDCT_ROW_SSE2(96)
    
              FDCT_ROW_SSE2_H1(48,192)
    
              FDCT_ROW_SSE2(48)
    
              FDCT_ROW_SSE2_H2(80,192)
    
              FDCT_ROW_SSE2(80)
    
              :
    
              : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
    
                "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
    
                XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
    
                                  "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    
          );
    
      27291
      }
    
      27291
      void ff_fdct_sse2(int16_t *block)
    
      {
    
          DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
    
      27291
          int16_t * const block1= (int16_t*)align_tmp;
    
      27291
          fdct_col_sse2(block, block1, 0);
    
      27291
          fdct_row_sse2(block1, block);
    
      27291
      }
    
      #endif /* HAVE_SSE2_INLINE */

Line	Exec	Source
1		/*
2		* SIMD-optimized forward DCT
3		* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
4		* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5		* SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
6		*
7		* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
8		*
9		* Intel Application Note AP-922 - fast, precise implementation of DCT
10		* http://developer.intel.com/vtune/cbts/appnotes.htm
11		*
12		* Also of inspiration:
13		* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
14		* Skal's fdct at http://skal.planet-d.net/coding/dct.html
15		*
16		* This file is part of FFmpeg.
17		*
18		* FFmpeg is free software; you can redistribute it and/or
19		* modify it under the terms of the GNU Lesser General Public
20		* License as published by the Free Software Foundation; either
21		* version 2.1 of the License, or (at your option) any later version.
22		*
23		* FFmpeg is distributed in the hope that it will be useful,
24		* but WITHOUT ANY WARRANTY; without even the implied warranty of
25		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26		* Lesser General Public License for more details.
27		*
28		* You should have received a copy of the GNU Lesser General Public
29		* License along with FFmpeg; if not, write to the Free Software
30		* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
31		*/
32
33		#include "config.h"
34		#include "libavutil/attributes.h"
35		#include "libavutil/macros.h"
36		#include "libavutil/mem_internal.h"
37		#include "libavutil/x86/asm.h"
38		#include "fdct.h"
39
40		#if HAVE_SSE2_INLINE
41
42		//////////////////////////////////////////////////////////////////////
43		//
44		// constants for the forward DCT
45		// -----------------------------
46		//
47		//////////////////////////////////////////////////////////////////////
48
49		#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
50		#define SHIFT_FRW_COL BITS_FRW_ACC
51		#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
52		#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
53		//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
54
55		#define X8(x) x,x,x,x,x,x,x,x
56
57		//concatenated table, for forward DCT transformation
58		DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
59		X8(13036), // tg * (2<<16) + 0.5
60		X8(27146), // tg * (2<<16) + 0.5
61		X8(-21746) // tg * (2<<16) + 0.5
62		};
63
64		DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
65		X8(23170) //cos * (2<<15) + 0.5
66		};
67
68		DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
69
70		static const struct
71		{
72		DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
73		} fdct_r_row_sse2 =
74		{{
75		RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
76		}};
77		//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
78
79		static const struct
80		{
81		DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
82		} tab_frw_01234567_sse2 =
83		{{
84		//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
85		#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
86		C4, C4, C5, C7, C2, C6, C3, -C7, \
87		-C4, C4, C7, C3, C6, -C2, C7, -C5, \
88		C4, -C4, C5, -C1, C2, -C6, C3, -C1,
89		// c1..c7 * cos(pi/4) * 2^15
90		#define C1 22725
91		#define C2 21407
92		#define C3 19266
93		#define C4 16384
94		#define C5 12873
95		#define C6 8867
96		#define C7 4520
97		TABLE_SSE2
98
99		#undef C1
100		#undef C2
101		#undef C3
102		#undef C4
103		#undef C5
104		#undef C6
105		#undef C7
106		#define C1 31521
107		#define C2 29692
108		#define C3 26722
109		#define C4 22725
110		#define C5 17855
111		#define C6 12299
112		#define C7 6270
113		TABLE_SSE2
114
115		#undef C1
116		#undef C2
117		#undef C3
118		#undef C4
119		#undef C5
120		#undef C6
121		#undef C7
122		#define C1 29692
123		#define C2 27969
124		#define C3 25172
125		#define C4 21407
126		#define C5 16819
127		#define C6 11585
128		#define C7 5906
129		TABLE_SSE2
130
131		#undef C1
132		#undef C2
133		#undef C3
134		#undef C4
135		#undef C5
136		#undef C6
137		#undef C7
138		#define C1 26722
139		#define C2 25172
140		#define C3 22654
141		#define C4 19266
142		#define C5 15137
143		#define C6 10426
144		#define C7 5315
145		TABLE_SSE2
146
147		#undef C1
148		#undef C2
149		#undef C3
150		#undef C4
151		#undef C5
152		#undef C6
153		#undef C7
154		#define C1 22725
155		#define C2 21407
156		#define C3 19266
157		#define C4 16384
158		#define C5 12873
159		#define C6 8867
160		#define C7 4520
161		TABLE_SSE2
162
163		#undef C1
164		#undef C2
165		#undef C3
166		#undef C4
167		#undef C5
168		#undef C6
169		#undef C7
170		#define C1 26722
171		#define C2 25172
172		#define C3 22654
173		#define C4 19266
174		#define C5 15137
175		#define C6 10426
176		#define C7 5315
177		TABLE_SSE2
178
179		#undef C1
180		#undef C2
181		#undef C3
182		#undef C4
183		#undef C5
184		#undef C6
185		#undef C7
186		#define C1 29692
187		#define C2 27969
188		#define C3 25172
189		#define C4 21407
190		#define C5 16819
191		#define C6 11585
192		#define C7 5906
193		TABLE_SSE2
194
195		#undef C1
196		#undef C2
197		#undef C3
198		#undef C4
199		#undef C5
200		#undef C6
201		#undef C7
202		#define C1 31521
203		#define C2 29692
204		#define C3 26722
205		#define C4 22725
206		#define C5 17855
207		#define C6 12299
208		#define C7 6270
209		TABLE_SSE2
210		}};
211
212		#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
213
214		#define FDCT_COL(cpu, mm, mov)\
215		static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
216		{\
217		__asm__ volatile (\
218		#mov" 16(%0), %%"#mm"0 \n\t" \
219		#mov" 96(%0), %%"#mm"1 \n\t" \
220		#mov" %%"#mm"0, %%"#mm"2 \n\t" \
221		#mov" 32(%0), %%"#mm"3 \n\t" \
222		"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
223		#mov" 80(%0), %%"#mm"4 \n\t" \
224		"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
225		#mov" (%0), %%"#mm"5 \n\t" \
226		"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
227		"paddsw 112(%0), %%"#mm"5 \n\t" \
228		"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
229		#mov" %%"#mm"0, %%"#mm"6 \n\t" \
230		"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
231		#mov" 16(%1), %%"#mm"1 \n\t" \
232		"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
233		#mov" 48(%0), %%"#mm"7 \n\t" \
234		"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
235		"paddsw 64(%0), %%"#mm"7 \n\t" \
236		"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
237		"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
238		"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
239		#mov" %%"#mm"5, %%"#mm"4 \n\t" \
240		"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
241		"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
242		"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
243		"por (%2), %%"#mm"1 \n\t" \
244		"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
245		"pmulhw 16(%1), %%"#mm"5 \n\t" \
246		#mov" %%"#mm"4, %%"#mm"7 \n\t" \
247		"psubsw 80(%0), %%"#mm"3 \n\t" \
248		"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
249		#mov" %%"#mm"1, 32(%3) \n\t" \
250		"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
251		#mov" 48(%0), %%"#mm"1 \n\t" \
252		"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
253		"psubsw 64(%0), %%"#mm"1 \n\t" \
254		#mov" %%"#mm"2, %%"#mm"6 \n\t" \
255		#mov" %%"#mm"4, 64(%3) \n\t" \
256		"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
257		"pmulhw (%4), %%"#mm"2 \n\t" \
258		"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
259		"pmulhw (%4), %%"#mm"6 \n\t" \
260		"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
261		"por (%2), %%"#mm"5 \n\t" \
262		"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
263		"por (%2), %%"#mm"2 \n\t" \
264		#mov" %%"#mm"1, %%"#mm"4 \n\t" \
265		#mov" (%0), %%"#mm"3 \n\t" \
266		"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
267		"psubsw 112(%0), %%"#mm"3 \n\t" \
268		"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
269		#mov" (%1), %%"#mm"0 \n\t" \
270		"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
271		#mov" 32(%1), %%"#mm"6 \n\t" \
272		"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
273		#mov" %%"#mm"7, (%3) \n\t" \
274		"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
275		#mov" %%"#mm"5, 96(%3) \n\t" \
276		#mov" %%"#mm"3, %%"#mm"7 \n\t" \
277		#mov" 32(%1), %%"#mm"5 \n\t" \
278		"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
279		"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
280		"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
281		"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
282		"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
283		"pmulhw (%1), %%"#mm"3 \n\t" \
284		"por (%2), %%"#mm"0 \n\t" \
285		"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
286		"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
287		#mov" %%"#mm"0, 16(%3) \n\t" \
288		"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
289		#mov" %%"#mm"7, 48(%3) \n\t" \
290		"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
291		#mov" %%"#mm"5, 80(%3) \n\t" \
292		#mov" %%"#mm"3, 112(%3) \n\t" \
293		: \
294		: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
295		"r" (out + offset), "r" (ocos_4_16)); \
296		}
297
298	27291	FDCT_COL(sse2, xmm, movdqa)
299
300	27291	static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
301		{
302	27291	__asm__ volatile(
303		#define FDCT_ROW_SSE2_H1(i,t) \
304		"movq " #i "(%0), %%xmm2 \n\t" \
305		"movq " #i "+8(%0), %%xmm0 \n\t" \
306		"movdqa " #t "+32(%1), %%xmm3 \n\t" \
307		"movdqa " #t "+48(%1), %%xmm7 \n\t" \
308		"movdqa " #t "(%1), %%xmm4 \n\t" \
309		"movdqa " #t "+16(%1), %%xmm5 \n\t"
310
311		#define FDCT_ROW_SSE2_H2(i,t) \
312		"movq " #i "(%0), %%xmm2 \n\t" \
313		"movq " #i "+8(%0), %%xmm0 \n\t" \
314		"movdqa " #t "+32(%1), %%xmm3 \n\t" \
315		"movdqa " #t "+48(%1), %%xmm7 \n\t"
316
317		#define FDCT_ROW_SSE2(i) \
318		"movq %%xmm2, %%xmm1 \n\t" \
319		"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
320		"paddsw %%xmm0, %%xmm1 \n\t" \
321		"psubsw %%xmm0, %%xmm2 \n\t" \
322		"punpckldq %%xmm2, %%xmm1 \n\t" \
323		"pshufd $78, %%xmm1, %%xmm2 \n\t" \
324		"pmaddwd %%xmm2, %%xmm3 \n\t" \
325		"pmaddwd %%xmm1, %%xmm7 \n\t" \
326		"pmaddwd %%xmm5, %%xmm2 \n\t" \
327		"pmaddwd %%xmm4, %%xmm1 \n\t" \
328		"paddd %%xmm7, %%xmm3 \n\t" \
329		"paddd %%xmm2, %%xmm1 \n\t" \
330		"paddd %%xmm6, %%xmm3 \n\t" \
331		"paddd %%xmm6, %%xmm1 \n\t" \
332		"psrad %3, %%xmm3 \n\t" \
333		"psrad %3, %%xmm1 \n\t" \
334		"packssdw %%xmm3, %%xmm1 \n\t" \
335		"movdqa %%xmm1, " #i "(%4) \n\t"
336
337		"movdqa (%2), %%xmm6 \n\t"
338		FDCT_ROW_SSE2_H1(0,0)
339		FDCT_ROW_SSE2(0)
340		FDCT_ROW_SSE2_H2(64,0)
341		FDCT_ROW_SSE2(64)
342
343		FDCT_ROW_SSE2_H1(16,64)
344		FDCT_ROW_SSE2(16)
345		FDCT_ROW_SSE2_H2(112,64)
346		FDCT_ROW_SSE2(112)
347
348		FDCT_ROW_SSE2_H1(32,128)
349		FDCT_ROW_SSE2(32)
350		FDCT_ROW_SSE2_H2(96,128)
351		FDCT_ROW_SSE2(96)
352
353		FDCT_ROW_SSE2_H1(48,192)
354		FDCT_ROW_SSE2(48)
355		FDCT_ROW_SSE2_H2(80,192)
356		FDCT_ROW_SSE2(80)
357		:
358		: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
359		"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
360		XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
361		"%xmm4", "%xmm5", "%xmm6", "%xmm7")
362		);
363	27291	}
364
365	27291	void ff_fdct_sse2(int16_t *block)
366		{
367		DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
368	27291	int16_t * const block1= (int16_t*)align_tmp;
369
370	27291	fdct_col_sse2(block, block1, 0);
371	27291	fdct_row_sse2(block1, block);
372	27291	}
373
374		#endif /* HAVE_SSE2_INLINE */
375

Function (Line)	Call count	Block coverage
fdct_col_sse2 (line 298)	called 27291 times, returned 27291 times	100.0%
fdct_row_sse2 (line 300)	called 27291 times, returned 27291 times	100.0%
ff_fdct_sse2 (line 365)	called 27291 times, returned 27291 times	100.0%