| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* | ||
| 2 | * SIMD-optimized forward DCT | ||
| 3 | * The gcc porting is Copyright (c) 2001 Fabrice Bellard. | ||
| 4 | * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | ||
| 5 | * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. | ||
| 6 | * | ||
| 7 | * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT | ||
| 8 | * | ||
| 9 | * Intel Application Note AP-922 - fast, precise implementation of DCT | ||
| 10 | * http://developer.intel.com/vtune/cbts/appnotes.htm | ||
| 11 | * | ||
| 12 | * Also of inspiration: | ||
| 13 | * a page about fdct at http://www.geocities.com/ssavekar/dct.htm | ||
| 14 | * Skal's fdct at http://skal.planet-d.net/coding/dct.html | ||
| 15 | * | ||
| 16 | * This file is part of FFmpeg. | ||
| 17 | * | ||
| 18 | * FFmpeg is free software; you can redistribute it and/or | ||
| 19 | * modify it under the terms of the GNU Lesser General Public | ||
| 20 | * License as published by the Free Software Foundation; either | ||
| 21 | * version 2.1 of the License, or (at your option) any later version. | ||
| 22 | * | ||
| 23 | * FFmpeg is distributed in the hope that it will be useful, | ||
| 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 26 | * Lesser General Public License for more details. | ||
| 27 | * | ||
| 28 | * You should have received a copy of the GNU Lesser General Public | ||
| 29 | * License along with FFmpeg; if not, write to the Free Software | ||
| 30 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 31 | */ | ||
| 32 | |||
| 33 | #include "config.h" | ||
| 34 | #include "libavutil/attributes.h" | ||
| 35 | #include "libavutil/macros.h" | ||
| 36 | #include "libavutil/mem_internal.h" | ||
| 37 | #include "libavutil/x86/asm.h" | ||
| 38 | #include "fdct.h" | ||
| 39 | |||
| 40 | #if HAVE_SSE2_INLINE | ||
| 41 | |||
| 42 | ////////////////////////////////////////////////////////////////////// | ||
| 43 | // | ||
| 44 | // constants for the forward DCT | ||
| 45 | // ----------------------------- | ||
| 46 | // | ||
| 47 | ////////////////////////////////////////////////////////////////////// | ||
| 48 | |||
| 49 | #define BITS_FRW_ACC 3 // extra precision bits carried from the column pass into the row pass; 2 or 3 for accuracy | ||
| 50 | #define SHIFT_FRW_COL BITS_FRW_ACC /* left shift (psllw) applied in the column pass */ | ||
| 51 | #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) /* right shift (psrad) applied to the 32-bit sums in the row pass */ | ||
| 52 | #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) /* rounding term: half of (1 << SHIFT_FRW_ROW), added before that shift */ | ||
| 53 | //#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) | ||
| 54 | |||
| 55 | #define X8(x) x,x,x,x,x,x,x,x /* x repeated eight times: one copy per 16-bit lane of a 128-bit vector */ | ||
| 56 | |||
| 57 | // Concatenated tangent constants for the column pass: three 8-lane vectors at byte offsets 0, 16 and 32. | ||
| 58 | DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { | ||
| 59 | X8(13036), // tan(1*pi/16) * (1<<16), rounded | ||
| 60 | X8(27146), // tan(2*pi/16) * (1<<16), rounded | ||
| 61 | X8(-21746) // (tan(3*pi/16) - 1) * (1<<16), rounded | ||
| 62 | }; | ||
| 63 | /* cos(pi/4) in every 16-bit lane; used as a pmulhw operand in the column pass. */ |||
| 64 | DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { | ||
| 65 | X8(23170) // cos(pi/4) * (1<<15), rounded | ||
| 66 | }; | ||
| 67 | /* The value 1 in every 16-bit lane; the column pass ORs it (por) into some of its results, which sets their lowest bit. */ |||
| 68 | DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; | ||
| 69 | /* Row-pass rounding vector: RND_FRW_ROW in each of the four 32-bit lanes, 16-byte aligned so it can be loaded with movdqa. */ |||
| 70 | static const struct | ||
| 71 | { | ||
| 72 | DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; | ||
| 73 | } fdct_r_row_sse2 = | ||
| 74 | {{ | ||
| 75 | RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW | ||
| 76 | }}; | ||
| 77 | //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; | ||
| 78 | /* Row-pass coefficient table: eight sub-tables of 32 int16_t (64 bytes) each. Every sub-table is the TABLE_SSE2 sign/order pattern expanded with its own C1..C7 values, where Ck = cos(k*pi/16) times a per-sub-table scale times 2^15. */ |||
| 79 | static const struct | ||
| 80 | { | ||
| 81 | DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; | ||
| 82 | } tab_frw_01234567_sse2 = | ||
| 83 | {{ | ||
| 84 | //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table | ||
| 85 | #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ | ||
| 86 | C4, C4, C5, C7, C2, C6, C3, -C7, \ | ||
| 87 | -C4, C4, C7, C3, C6, -C2, C7, -C5, \ | ||
| 88 | C4, -C4, C5, -C1, C2, -C6, C3, -C1, | ||
| 89 | // sub-table 0 (byte offset 0): c1..c7 * cos(pi/4) * 2^15; read by fdct_row_sse2 for the rows at byte offsets 0 and 64 | ||
| 90 | #define C1 22725 | ||
| 91 | #define C2 21407 | ||
| 92 | #define C3 19266 | ||
| 93 | #define C4 16384 | ||
| 94 | #define C5 12873 | ||
| 95 | #define C6 8867 | ||
| 96 | #define C7 4520 | ||
| 97 | TABLE_SSE2 | ||
| 98 | /* sub-table 1 (byte offset 64): c1..c7 * cos(1*pi/16) * 2^15; read by fdct_row_sse2 for the rows at byte offsets 16 and 112 */ |||
| 99 | #undef C1 | ||
| 100 | #undef C2 | ||
| 101 | #undef C3 | ||
| 102 | #undef C4 | ||
| 103 | #undef C5 | ||
| 104 | #undef C6 | ||
| 105 | #undef C7 | ||
| 106 | #define C1 31521 | ||
| 107 | #define C2 29692 | ||
| 108 | #define C3 26722 | ||
| 109 | #define C4 22725 | ||
| 110 | #define C5 17855 | ||
| 111 | #define C6 12299 | ||
| 112 | #define C7 6270 | ||
| 113 | TABLE_SSE2 | ||
| 114 | /* sub-table 2 (byte offset 128): c1..c7 * cos(2*pi/16) * 2^15; read by fdct_row_sse2 for the rows at byte offsets 32 and 96 */ |||
| 115 | #undef C1 | ||
| 116 | #undef C2 | ||
| 117 | #undef C3 | ||
| 118 | #undef C4 | ||
| 119 | #undef C5 | ||
| 120 | #undef C6 | ||
| 121 | #undef C7 | ||
| 122 | #define C1 29692 | ||
| 123 | #define C2 27969 | ||
| 124 | #define C3 25172 | ||
| 125 | #define C4 21407 | ||
| 126 | #define C5 16819 | ||
| 127 | #define C6 11585 | ||
| 128 | #define C7 5906 | ||
| 129 | TABLE_SSE2 | ||
| 130 | /* sub-table 3 (byte offset 192): c1..c7 * cos(3*pi/16) * 2^15; read by fdct_row_sse2 for the rows at byte offsets 48 and 80 */ |||
| 131 | #undef C1 | ||
| 132 | #undef C2 | ||
| 133 | #undef C3 | ||
| 134 | #undef C4 | ||
| 135 | #undef C5 | ||
| 136 | #undef C6 | ||
| 137 | #undef C7 | ||
| 138 | #define C1 26722 | ||
| 139 | #define C2 25172 | ||
| 140 | #define C3 22654 | ||
| 141 | #define C4 19266 | ||
| 142 | #define C5 15137 | ||
| 143 | #define C6 10426 | ||
| 144 | #define C7 5315 | ||
| 145 | TABLE_SSE2 | ||
| 146 | /* sub-table 4 (byte offset 256): same values as sub-table 0. fdct_row_sse2 in this file only addresses byte offsets 0..255 of the table, so sub-tables 4..7 are not read by it. */ |||
| 147 | #undef C1 | ||
| 148 | #undef C2 | ||
| 149 | #undef C3 | ||
| 150 | #undef C4 | ||
| 151 | #undef C5 | ||
| 152 | #undef C6 | ||
| 153 | #undef C7 | ||
| 154 | #define C1 22725 | ||
| 155 | #define C2 21407 | ||
| 156 | #define C3 19266 | ||
| 157 | #define C4 16384 | ||
| 158 | #define C5 12873 | ||
| 159 | #define C6 8867 | ||
| 160 | #define C7 4520 | ||
| 161 | TABLE_SSE2 | ||
| 162 | /* sub-table 5 (byte offset 320): same values as sub-table 3 */ |||
| 163 | #undef C1 | ||
| 164 | #undef C2 | ||
| 165 | #undef C3 | ||
| 166 | #undef C4 | ||
| 167 | #undef C5 | ||
| 168 | #undef C6 | ||
| 169 | #undef C7 | ||
| 170 | #define C1 26722 | ||
| 171 | #define C2 25172 | ||
| 172 | #define C3 22654 | ||
| 173 | #define C4 19266 | ||
| 174 | #define C5 15137 | ||
| 175 | #define C6 10426 | ||
| 176 | #define C7 5315 | ||
| 177 | TABLE_SSE2 | ||
| 178 | /* sub-table 6 (byte offset 384): same values as sub-table 2 */ |||
| 179 | #undef C1 | ||
| 180 | #undef C2 | ||
| 181 | #undef C3 | ||
| 182 | #undef C4 | ||
| 183 | #undef C5 | ||
| 184 | #undef C6 | ||
| 185 | #undef C7 | ||
| 186 | #define C1 29692 | ||
| 187 | #define C2 27969 | ||
| 188 | #define C3 25172 | ||
| 189 | #define C4 21407 | ||
| 190 | #define C5 16819 | ||
| 191 | #define C6 11585 | ||
| 192 | #define C7 5906 | ||
| 193 | TABLE_SSE2 | ||
| 194 | /* sub-table 7 (byte offset 448): same values as sub-table 1 */ |||
| 195 | #undef C1 | ||
| 196 | #undef C2 | ||
| 197 | #undef C3 | ||
| 198 | #undef C4 | ||
| 199 | #undef C5 | ||
| 200 | #undef C6 | ||
| 201 | #undef C7 | ||
| 202 | #define C1 31521 | ||
| 203 | #define C2 29692 | ||
| 204 | #define C3 26722 | ||
| 205 | #define C4 22725 | ||
| 206 | #define C5 17855 | ||
| 207 | #define C6 12299 | ||
| 208 | #define C7 6270 | ||
| 209 | TABLE_SSE2 | ||
| 210 | }}; | ||
| 211 | |||
| 212 | #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long | ||
| 213 | /* FDCT_COL(cpu, mm, mov) defines fdct_col_<cpu>(in, out, offset), the column pass. It reads the eight 16-byte row vectors at byte offsets 0..112 of in + offset, shifts them left by SHIFT_FRW_COL (or SHIFT_FRW_COL+1) bits, combines them with saturating adds/subtracts and pmulhw against fdct_tg_all_16 and ocos_4_16, ORs fdct_one_corr into some results, and stores eight vectors at the same byte offsets of out + offset. offset counts int16_t elements. mov is the 128-bit load/store mnemonic and mm the register-name prefix. The asm overwrites registers <mm>0..<mm>7 and writes memory through %3, so both are declared as clobbers. */ |||
| 214 | #define FDCT_COL(cpu, mm, mov)\ | ||
| 215 | static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ | ||
| 216 | {\ | ||
| 217 | __asm__ volatile (\ | ||
| 218 | #mov" 16(%0), %%"#mm"0 \n\t" \ | ||
| 219 | #mov" 96(%0), %%"#mm"1 \n\t" \ | ||
| 220 | #mov" %%"#mm"0, %%"#mm"2 \n\t" \ | ||
| 221 | #mov" 32(%0), %%"#mm"3 \n\t" \ | ||
| 222 | "paddsw %%"#mm"1, %%"#mm"0 \n\t" \ | ||
| 223 | #mov" 80(%0), %%"#mm"4 \n\t" \ | ||
| 224 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ | ||
| 225 | #mov" (%0), %%"#mm"5 \n\t" \ | ||
| 226 | "paddsw %%"#mm"3, %%"#mm"4 \n\t" \ | ||
| 227 | "paddsw 112(%0), %%"#mm"5 \n\t" \ | ||
| 228 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ | ||
| 229 | #mov" %%"#mm"0, %%"#mm"6 \n\t" \ | ||
| 230 | "psubsw %%"#mm"1, %%"#mm"2 \n\t" \ | ||
| 231 | #mov" 16(%1), %%"#mm"1 \n\t" \ | ||
| 232 | "psubsw %%"#mm"4, %%"#mm"0 \n\t" \ | ||
| 233 | #mov" 48(%0), %%"#mm"7 \n\t" \ | ||
| 234 | "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ | ||
| 235 | "paddsw 64(%0), %%"#mm"7 \n\t" \ | ||
| 236 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ | ||
| 237 | "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | ||
| 238 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ | ||
| 239 | #mov" %%"#mm"5, %%"#mm"4 \n\t" \ | ||
| 240 | "psubsw %%"#mm"7, %%"#mm"5 \n\t" \ | ||
| 241 | "paddsw %%"#mm"5, %%"#mm"1 \n\t" \ | ||
| 242 | "paddsw %%"#mm"7, %%"#mm"4 \n\t" \ | ||
| 243 | "por (%2), %%"#mm"1 \n\t" \ | ||
| 244 | "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ | ||
| 245 | "pmulhw 16(%1), %%"#mm"5 \n\t" \ | ||
| 246 | #mov" %%"#mm"4, %%"#mm"7 \n\t" \ | ||
| 247 | "psubsw 80(%0), %%"#mm"3 \n\t" \ | ||
| 248 | "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | ||
| 249 | #mov" %%"#mm"1, 32(%3) \n\t" \ | ||
| 250 | "paddsw %%"#mm"6, %%"#mm"7 \n\t" \ | ||
| 251 | #mov" 48(%0), %%"#mm"1 \n\t" \ | ||
| 252 | "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ | ||
| 253 | "psubsw 64(%0), %%"#mm"1 \n\t" \ | ||
| 254 | #mov" %%"#mm"2, %%"#mm"6 \n\t" \ | ||
| 255 | #mov" %%"#mm"4, 64(%3) \n\t" \ | ||
| 256 | "paddsw %%"#mm"3, %%"#mm"2 \n\t" \ | ||
| 257 | "pmulhw (%4), %%"#mm"2 \n\t" \ | ||
| 258 | "psubsw %%"#mm"3, %%"#mm"6 \n\t" \ | ||
| 259 | "pmulhw (%4), %%"#mm"6 \n\t" \ | ||
| 260 | "psubsw %%"#mm"0, %%"#mm"5 \n\t" \ | ||
| 261 | "por (%2), %%"#mm"5 \n\t" \ | ||
| 262 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ | ||
| 263 | "por (%2), %%"#mm"2 \n\t" \ | ||
| 264 | #mov" %%"#mm"1, %%"#mm"4 \n\t" \ | ||
| 265 | #mov" (%0), %%"#mm"3 \n\t" \ | ||
| 266 | "paddsw %%"#mm"6, %%"#mm"1 \n\t" \ | ||
| 267 | "psubsw 112(%0), %%"#mm"3 \n\t" \ | ||
| 268 | "psubsw %%"#mm"6, %%"#mm"4 \n\t" \ | ||
| 269 | #mov" (%1), %%"#mm"0 \n\t" \ | ||
| 270 | "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ | ||
| 271 | #mov" 32(%1), %%"#mm"6 \n\t" \ | ||
| 272 | "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ | ||
| 273 | #mov" %%"#mm"7, (%3) \n\t" \ | ||
| 274 | "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ | ||
| 275 | #mov" %%"#mm"5, 96(%3) \n\t" \ | ||
| 276 | #mov" %%"#mm"3, %%"#mm"7 \n\t" \ | ||
| 277 | #mov" 32(%1), %%"#mm"5 \n\t" \ | ||
| 278 | "psubsw %%"#mm"2, %%"#mm"7 \n\t" \ | ||
| 279 | "paddsw %%"#mm"2, %%"#mm"3 \n\t" \ | ||
| 280 | "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ | ||
| 281 | "paddsw %%"#mm"3, %%"#mm"0 \n\t" \ | ||
| 282 | "paddsw %%"#mm"4, %%"#mm"6 \n\t" \ | ||
| 283 | "pmulhw (%1), %%"#mm"3 \n\t" \ | ||
| 284 | "por (%2), %%"#mm"0 \n\t" \ | ||
| 285 | "paddsw %%"#mm"7, %%"#mm"5 \n\t" \ | ||
| 286 | "psubsw %%"#mm"6, %%"#mm"7 \n\t" \ | ||
| 287 | #mov" %%"#mm"0, 16(%3) \n\t" \ | ||
| 288 | "paddsw %%"#mm"4, %%"#mm"5 \n\t" \ | ||
| 289 | #mov" %%"#mm"7, 48(%3) \n\t" \ | ||
| 290 | "psubsw %%"#mm"1, %%"#mm"3 \n\t" \ | ||
| 291 | #mov" %%"#mm"5, 80(%3) \n\t" \ | ||
| 292 | #mov" %%"#mm"3, 112(%3) \n\t" \ | ||
| 293 | : \ | ||
| 294 | : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), "r" (out + offset), "r" (ocos_4_16) \ | ||
| 295 | : XMM_CLOBBERS("%"#mm"0", "%"#mm"1", "%"#mm"2", "%"#mm"3", "%"#mm"4", "%"#mm"5", "%"#mm"6", "%"#mm"7",) "memory"); \ | ||
| 296 | } | ||
| 297 | /* Instantiate fdct_col_sse2(): xmm registers, aligned 128-bit moves (movdqa), so in + offset and out + offset must be 16-byte aligned. */ |||
| 298 | 20001 | FDCT_COL(sse2, xmm, movdqa) | |
| 299 | /* Row pass: transforms the eight 16-byte rows of in and stores each result row at the same byte offset of out. Coefficients come from tab_frw_01234567_sse2 and the rounding term from fdct_r_row_sse2. Results are stored with movdqa, so out must be 16-byte aligned. The asm reads and writes memory only through pointer operands, so "memory" is listed as a clobber alongside the XMM registers it overwrites. */ |||
| 300 | 20001 | static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) | |
| 301 | { | ||
| 302 | 20001 | __asm__ volatile( | |
| 303 | #define FDCT_ROW_SSE2_H1(i,t) \ | ||
| 304 | "movq " #i "(%0), %%xmm2 \n\t" \ | ||
| 305 | "movq " #i "+8(%0), %%xmm0 \n\t" \ | ||
| 306 | "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | ||
| 307 | "movdqa " #t "+48(%1), %%xmm7 \n\t" \ | ||
| 308 | "movdqa " #t "(%1), %%xmm4 \n\t" \ | ||
| 309 | "movdqa " #t "+16(%1), %%xmm5 \n\t" | ||
| 310 | /* H1 (above) loads the row at byte offset i and all four coefficient vectors of the sub-table at byte offset t. H2 (below) is for the second row of a pair: it reloads only xmm3 and xmm7, which the row kernel overwrites, while xmm4 and xmm5 are only read by the kernel and still hold the H1 values. */ |||
| 311 | #define FDCT_ROW_SSE2_H2(i,t) \ | ||
| 312 | "movq " #i "(%0), %%xmm2 \n\t" \ | ||
| 313 | "movq " #i "+8(%0), %%xmm0 \n\t" \ | ||
| 314 | "movdqa " #t "+32(%1), %%xmm3 \n\t" \ | ||
| 315 | "movdqa " #t "+48(%1), %%xmm7 \n\t" | ||
| 316 | /* Row kernel: reverse the order of the upper four input words (pshuflw $27), form their sums and differences with the lower four, multiply-accumulate against the coefficients (pmaddwd), add the rounding term held in xmm6, shift right by SHIFT_FRW_ROW, pack to int16_t with signed saturation and store the row at byte offset i of out. */ |||
| 317 | #define FDCT_ROW_SSE2(i) \ | ||
| 318 | "movq %%xmm2, %%xmm1 \n\t" \ | ||
| 319 | "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ | ||
| 320 | "paddsw %%xmm0, %%xmm1 \n\t" \ | ||
| 321 | "psubsw %%xmm0, %%xmm2 \n\t" \ | ||
| 322 | "punpckldq %%xmm2, %%xmm1 \n\t" \ | ||
| 323 | "pshufd $78, %%xmm1, %%xmm2 \n\t" \ | ||
| 324 | "pmaddwd %%xmm2, %%xmm3 \n\t" \ | ||
| 325 | "pmaddwd %%xmm1, %%xmm7 \n\t" \ | ||
| 326 | "pmaddwd %%xmm5, %%xmm2 \n\t" \ | ||
| 327 | "pmaddwd %%xmm4, %%xmm1 \n\t" \ | ||
| 328 | "paddd %%xmm7, %%xmm3 \n\t" \ | ||
| 329 | "paddd %%xmm2, %%xmm1 \n\t" \ | ||
| 330 | "paddd %%xmm6, %%xmm3 \n\t" \ | ||
| 331 | "paddd %%xmm6, %%xmm1 \n\t" \ | ||
| 332 | "psrad %3, %%xmm3 \n\t" \ | ||
| 333 | "psrad %3, %%xmm1 \n\t" \ | ||
| 334 | "packssdw %%xmm3, %%xmm1 \n\t" \ | ||
| 335 | "movdqa %%xmm1, " #i "(%4) \n\t" | ||
| 336 | /* xmm6 holds the rounding vector for all eight rows. Rows are processed in pairs sharing one 64-byte coefficient sub-table; the first pair is the rows at byte offsets 0 and 64, using sub-table offset 0. */ |||
| 337 | "movdqa (%2), %%xmm6 \n\t" | ||
| 338 | FDCT_ROW_SSE2_H1(0,0) | ||
| 339 | FDCT_ROW_SSE2(0) | ||
| 340 | FDCT_ROW_SSE2_H2(64,0) | ||
| 341 | FDCT_ROW_SSE2(64) | ||
| 342 | /* rows at byte offsets 16 and 112, sub-table offset 64 */ |||
| 343 | FDCT_ROW_SSE2_H1(16,64) | ||
| 344 | FDCT_ROW_SSE2(16) | ||
| 345 | FDCT_ROW_SSE2_H2(112,64) | ||
| 346 | FDCT_ROW_SSE2(112) | ||
| 347 | /* rows at byte offsets 32 and 96, sub-table offset 128 */ |||
| 348 | FDCT_ROW_SSE2_H1(32,128) | ||
| 349 | FDCT_ROW_SSE2(32) | ||
| 350 | FDCT_ROW_SSE2_H2(96,128) | ||
| 351 | FDCT_ROW_SSE2(96) | ||
| 352 | /* rows at byte offsets 48 and 80, sub-table offset 192 */ |||
| 353 | FDCT_ROW_SSE2_H1(48,192) | ||
| 354 | FDCT_ROW_SSE2(48) | ||
| 355 | FDCT_ROW_SSE2_H2(80,192) | ||
| 356 | FDCT_ROW_SSE2(80) | ||
| 357 | : | ||
| 358 | : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), | ||
| 359 | "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) | ||
| 360 | : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", | ||
| 361 | "%xmm4", "%xmm5", "%xmm6", "%xmm7",) "memory" | ||
| 362 | ); | ||
| 363 | 20001 | } | |
| 364 | /* Forward DCT of a 64-coefficient block, in place: the column pass writes into a local scratch buffer and the row pass writes the final result back into block. */ |||
| 365 | 20001 | void ff_fdct_sse2(int16_t *block) | |
| 366 | { | ||
| 367 | DECLARE_ALIGNED(16, int16_t, scratch)[64]; | ||
| 368 | |||
| 369 | /* scratch is 128 bytes and 16-byte aligned, as the movdqa accesses in both passes require */ |||
| 370 | 20001 | fdct_col_sse2(block, scratch, 0); | |
| 371 | 20001 | fdct_row_sse2(scratch, block); | |
| 372 | 20001 | } | |
| 373 | |||
| 374 | #endif /* HAVE_SSE2_INLINE */ | ||
| 375 |