| Line | Branch | Exec | Source | 
|---|---|---|---|
| 1 | /* | ||
| 2 | * VP9 SIMD optimizations | ||
| 3 | * | ||
| 4 | * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> | ||
| 5 | * | ||
| 6 | * This file is part of FFmpeg. | ||
| 7 | * | ||
| 8 | * FFmpeg is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU Lesser General Public | ||
| 10 | * License as published by the Free Software Foundation; either | ||
| 11 | * version 2.1 of the License, or (at your option) any later version. | ||
| 12 | * | ||
| 13 | * FFmpeg is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 16 | * Lesser General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU Lesser General Public | ||
| 19 | * License along with FFmpeg; if not, write to the Free Software | ||
| 20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 21 | */ | ||
| 22 | |||
| 23 | #include "libavutil/attributes.h" | ||
| 24 | #include "libavutil/cpu.h" | ||
| 25 | #include "libavutil/x86/cpu.h" | ||
| 26 | #include "libavcodec/vp9dsp.h" | ||
| 27 | #include "libavcodec/x86/vp9dsp_init.h" | ||
| 28 | |||
| 29 | #if HAVE_X86ASM | ||
| 30 | |||
| 31 | /* Declarations produced by decl_* macros that are defined outside this listing (presumably in vp9dsp_init.h); the arguments pick an operation (put/avg), a size and an ISA suffix. */ decl_fpel_func(put, 4, , mmx); | ||
| 32 | decl_fpel_func(put, 8, , mmx); | ||
| 33 | decl_fpel_func(put, 16, , sse); | ||
| 34 | decl_fpel_func(put, 32, , sse); | ||
| 35 | decl_fpel_func(put, 64, , sse); | ||
| 36 | decl_fpel_func(avg, 4, _8, mmxext); | ||
| 37 | decl_fpel_func(avg, 8, _8, mmxext); | ||
| 38 | decl_fpel_func(avg, 16, _8, sse2); | ||
| 39 | decl_fpel_func(avg, 32, _8, sse2); | ||
| 40 | decl_fpel_func(avg, 64, _8, sse2); | ||
| 41 | decl_fpel_func(put, 32, , avx); | ||
| 42 | decl_fpel_func(put, 64, , avx); | ||
| 43 | decl_fpel_func(avg, 32, _8, avx2); | ||
| 44 | decl_fpel_func(avg, 64, _8, avx2); | ||
| 45 | |||
| 46 | /* The type and the number after it match the element type and innermost dimension of the ff_filters_sse2 / ff_filters_ssse3 tables declared further down; presumably they describe the filter argument - TODO confirm against the macro definition. */ decl_mc_funcs(4, mmxext, int16_t, 8, 8); | ||
| 47 | decl_mc_funcs(8, sse2, int16_t, 8, 8); | ||
| 48 | decl_mc_funcs(4, ssse3, int8_t, 32, 8); | ||
| 49 | decl_mc_funcs(8, ssse3, int8_t, 32, 8); | ||
| 50 | #if ARCH_X86_64 /* everything up to the matching #endif is declared only in 64-bit builds */ | ||
| 51 | decl_mc_funcs(16, ssse3, int8_t, 32, 8); | ||
| 52 | decl_mc_funcs(32, avx2, int8_t, 32, 8); | ||
| 53 | decl_subpel_asm( 4, 8, avx512icl); | ||
| 54 | decl_subpel_asm( 8, 8, avx512icl); | ||
| 55 | decl_subpel_asm(16, 8, avx512icl); | ||
| 56 | decl_subpel_asm(32, 8, avx512icl); | ||
| 57 | decl_subpel_asm(64, 8, avx512icl); | ||
| 58 | #endif | ||
| 59 | |||
| 60 | 672 | /* mc_rep_funcs is defined outside this listing and is invoked without a trailing semicolon; judging by its name and the (size, half-size, half-size) arguments it presumably builds each wider function by repeating a narrower one - TODO confirm. */ mc_rep_funcs(16, 8, 8, sse2, int16_t, 8, 8) | |
| 61 | #if ARCH_X86_32 /* 16-wide ssse3 is instantiated here only on 32-bit; 64-bit builds get decl_mc_funcs(16, ssse3, ...) above instead */ | ||
| 62 | mc_rep_funcs(16, 8, 8, ssse3, int8_t, 32, 8) | ||
| 63 | #endif | ||
| 64 | 288 | mc_rep_funcs(32, 16, 16, sse2, int16_t, 8, 8) | |
| 65 | 288 | mc_rep_funcs(32, 16, 16, ssse3, int8_t, 32, 8) | |
| 66 | 96 | mc_rep_funcs(64, 32, 32, sse2, int16_t, 8, 8) | |
| 67 | 96 | mc_rep_funcs(64, 32, 32, ssse3, int8_t, 32, 8) | |
| 68 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL /* the avx2 variant is instantiated only when both configuration macros are set */ | ||
| 69 | 52 | mc_rep_funcs(64, 32, 32, avx2, int8_t, 32, 8) | |
| 70 | #endif | ||
| 71 | |||
| 72 | /* Filter tables; declared extern here, so their definitions live in another translation unit. */ extern const int8_t ff_filters_ssse3[3][15][4][32]; | ||
| 73 | extern const int16_t ff_filters_sse2[3][15][8][8]; | ||
| 74 | |||
| 75 | 60 | /* Instantiations of the filters_8tap_2d_* helper macros (defined outside this listing), once for put and once for avg per ISA combination. */ filters_8tap_2d_fn2(put, 16, 8, 1, mmxext, sse2, sse2) | |
| 76 | 60 | filters_8tap_2d_fn2(avg, 16, 8, 1, mmxext, sse2, sse2) | |
| 77 | 894 | filters_8tap_2d_fn2(put, 16, 8, 1, ssse3, ssse3, ssse3) | |
| 78 | 42 | filters_8tap_2d_fn2(avg, 16, 8, 1, ssse3, ssse3, ssse3) | |
| 79 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL /* avx2 instantiations are compiled only when both configuration macros are set */ | ||
| 80 | 6 | filters_8tap_2d_fn(put, 64, 32, 8, 1, avx2, ssse3) | |
| 81 | 10 | filters_8tap_2d_fn(put, 32, 32, 8, 1, avx2, ssse3) | |
| 82 | 6 | filters_8tap_2d_fn(avg, 64, 32, 8, 1, avx2, ssse3) | |
| 83 | 6 | filters_8tap_2d_fn(avg, 32, 32, 8, 1, avx2, ssse3) | |
| 84 | #endif | ||
| 85 | |||
| 86 | 120 | /* Same pattern for the filters_8tap_1d_* helper macros. */ filters_8tap_1d_fn3(put, 8, mmxext, sse2, sse2) | |
| 87 | 120 | filters_8tap_1d_fn3(avg, 8, mmxext, sse2, sse2) | |
| 88 | 628 | filters_8tap_1d_fn3(put, 8, ssse3, ssse3, ssse3) | |
| 89 | 84 | filters_8tap_1d_fn3(avg, 8, ssse3, ssse3, ssse3) | |
| 90 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL /* same guard as the 2d avx2 instantiations */ | ||
| 91 | 16 | filters_8tap_1d_fn2(put, 64, 8, avx2, ssse3) | |
| 92 | 50 | filters_8tap_1d_fn2(put, 32, 8, avx2, ssse3) | |
| 93 | 12 | filters_8tap_1d_fn2(avg, 64, 8, avx2, ssse3) | |
| 94 | 12 | filters_8tap_1d_fn2(avg, 32, 8, avx2, ssse3) | |
| 95 | #endif | ||
| 96 | |||
| 97 | #define itxfm_func(typea, typeb, size, opt) /* prototype for one routine named ff_vp9_TYPEA_TYPEB_SIZExSIZE_add_OPT taking (dst, stride, block, eob) */ \ | ||
| 98 | void ff_vp9_##typea##_##typeb##_##size##x##size##_add_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
| 99 | int16_t *block, int eob) | ||
| 100 | #define itxfm_funcs(size, opt) /* the four idct/iadst combinations for one size and ISA suffix */ \ | ||
| 101 | itxfm_func(idct, idct, size, opt); \ | ||
| 102 | itxfm_func(iadst, idct, size, opt); \ | ||
| 103 | itxfm_func(idct, iadst, size, opt); \ | ||
| 104 | itxfm_func(iadst, iadst, size, opt) | ||
| 105 | |||
| 106 | /* 4x4: idct_idct is declared for mmxext and the three iadst combinations for sse2, so itxfm_func is used directly instead of itxfm_funcs. */ itxfm_func(idct, idct, 4, mmxext); | ||
| 107 | itxfm_func(idct, iadst, 4, sse2); | ||
| 108 | itxfm_func(iadst, idct, 4, sse2); | ||
| 109 | itxfm_func(iadst, iadst, 4, sse2); | ||
| 110 | itxfm_funcs(4, ssse3); | ||
| 111 | itxfm_funcs(4, avx2); | ||
| 112 | itxfm_funcs(8, sse2); | ||
| 113 | itxfm_funcs(8, ssse3); | ||
| 114 | itxfm_funcs(8, avx2); | ||
| 115 | itxfm_funcs(16, sse2); | ||
| 116 | itxfm_funcs(16, ssse3); | ||
| 117 | /* 32x32 is declared as idct_idct only, for every ISA suffix in this list. */ itxfm_func(idct, idct, 32, sse2); | ||
| 118 | itxfm_func(idct, idct, 32, ssse3); | ||
| 119 | /* iwht_iwht 4x4 is what the init function stores in the itxfm_add[4] (lossless) slots. */ itxfm_func(iwht, iwht, 4, mmx); | ||
| 120 | itxfm_func(iwht, iwht, 4, avx2); | ||
| 121 | itxfm_funcs(16, avx2); | ||
| 122 | itxfm_funcs(16, avx512icl); | ||
| 123 | itxfm_func(idct, idct, 32, avx2); | ||
| 124 | itxfm_func(idct, idct, 32, avx512icl); | ||
| 125 | |||
| 126 | #undef itxfm_func /* both helper macros are dropped once the prototypes are emitted */ | ||
| 127 | #undef itxfm_funcs | ||
| 128 | |||
| 129 | #define lpf_funcs(size1, size2, opt) /* declares the _v_ and _h_ loop-filter prototypes for one size1/size2/ISA combination; both take (dst, stride, E, I, H) */ \ | ||
| 130 | void ff_vp9_loop_filter_v_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
| 131 | int E, int I, int H); \ | ||
| 132 | void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
| 133 | int E, int I, int H) | ||
| 134 | |||
| 135 | /* The _8 variants exist for mmxext only; every _16 variant is declared for sse2, ssse3 and avx. */ lpf_funcs(4, 8, mmxext); | ||
| 136 | lpf_funcs(8, 8, mmxext); | ||
| 137 | lpf_funcs(16, 16, sse2); | ||
| 138 | lpf_funcs(16, 16, ssse3); | ||
| 139 | lpf_funcs(16, 16, avx); | ||
| 140 | /* 44/84/48/88 are the routines stored in the loop_filter_mix2 table by the init function (presumably two filter widths combined - confirm). */ lpf_funcs(44, 16, sse2); | ||
| 141 | lpf_funcs(44, 16, ssse3); | ||
| 142 | lpf_funcs(44, 16, avx); | ||
| 143 | lpf_funcs(84, 16, sse2); | ||
| 144 | lpf_funcs(84, 16, ssse3); | ||
| 145 | lpf_funcs(84, 16, avx); | ||
| 146 | lpf_funcs(48, 16, sse2); | ||
| 147 | lpf_funcs(48, 16, ssse3); | ||
| 148 | lpf_funcs(48, 16, avx); | ||
| 149 | lpf_funcs(88, 16, sse2); | ||
| 150 | lpf_funcs(88, 16, ssse3); | ||
| 151 | lpf_funcs(88, 16, avx); | ||
| 152 | |||
| 153 | #undef lpf_funcs /* helper macro is not needed past the prototypes */ | ||
| 154 | |||
| 155 | #define ipred_func(size, type, opt) /* prototype for ff_vp9_ipred_TYPE_SIZExSIZE_OPT taking (dst, stride, l, a); l and a are presumably the left and above edge pixels - confirm */ \ | ||
| 156 | void ff_vp9_ipred_##type##_##size##x##size##_##opt(uint8_t *dst, ptrdiff_t stride, \ | ||
| 157 | const uint8_t *l, const uint8_t *a) | ||
| 158 | |||
| 159 | ipred_func(8, v, mmx); | ||
| 160 | |||
| 161 | #define ipred_dc_funcs(size, opt) /* the three DC variants: dc, dc_left, dc_top */ \ | ||
| 162 | ipred_func(size, dc, opt); \ | ||
| 163 | ipred_func(size, dc_left, opt); \ | ||
| 164 | ipred_func(size, dc_top, opt) | ||
| 165 | |||
| 166 | ipred_dc_funcs(4, mmxext); | ||
| 167 | ipred_dc_funcs(8, mmxext); | ||
| 168 | |||
| 169 | #define ipred_dir_tm_funcs(size, opt) /* tm plus the six predictors dl, dr, hd, hu, vl, vr */ \ | ||
| 170 | ipred_func(size, tm, opt); \ | ||
| 171 | ipred_func(size, dl, opt); \ | ||
| 172 | ipred_func(size, dr, opt); \ | ||
| 173 | ipred_func(size, hd, opt); \ | ||
| 174 | ipred_func(size, hu, opt); \ | ||
| 175 | ipred_func(size, vl, opt); \ | ||
| 176 | ipred_func(size, vr, opt) | ||
| 177 | |||
| 178 | ipred_dir_tm_funcs(4, mmxext); | ||
| 179 | |||
| 180 | ipred_func(16, v, sse); | ||
| 181 | ipred_func(32, v, sse); | ||
| 182 | |||
| 183 | ipred_dc_funcs(16, sse2); | ||
| 184 | ipred_dc_funcs(32, sse2); | ||
| 185 | |||
| 186 | #define ipred_dir_tm_h_funcs(size, opt) /* ipred_dir_tm_funcs plus the h predictor */ \ | ||
| 187 | ipred_dir_tm_funcs(size, opt); \ | ||
| 188 | ipred_func(size, h, opt) | ||
| 189 | |||
| 190 | ipred_dir_tm_h_funcs(8, sse2); | ||
| 191 | ipred_dir_tm_h_funcs(16, sse2); | ||
| 192 | ipred_dir_tm_h_funcs(32, sse2); | ||
| 193 | |||
| 194 | ipred_func(4, h, sse2); | ||
| 195 | |||
| 196 | #define ipred_all_funcs(size, opt) /* DC variants plus tm, the six dl/dr/hd/hu/vl/vr predictors and h; v is not included */ \ | ||
| 197 | ipred_dc_funcs(size, opt); \ | ||
| 198 | ipred_dir_tm_h_funcs(size, opt) | ||
| 199 | |||
| 200 | // FIXME hd/vl_4x4_ssse3 does not exist | ||
| 201 | /* This still declares the hd and vl 4x4 ssse3 names; the init function redirects both names to the mmxext routines with #define before using them. */ ipred_all_funcs(4, ssse3); | ||
| 202 | ipred_all_funcs(8, ssse3); | ||
| 203 | ipred_all_funcs(16, ssse3); | ||
| 204 | ipred_all_funcs(32, ssse3); | ||
| 205 | |||
| 206 | ipred_dir_tm_h_funcs(8, avx); | ||
| 207 | ipred_dir_tm_h_funcs(16, avx); | ||
| 208 | ipred_dir_tm_h_funcs(32, avx); | ||
| 209 | |||
| 210 | ipred_all_funcs(32, avx2); | ||
| 211 | /* v is declared separately because ipred_all_funcs does not cover it. */ ipred_func(32, v, avx2); | ||
| 212 | |||
| 213 | #undef ipred_func /* NOTE(review): ipred_all_funcs is defined above but is not undefined along with the other helpers */ | ||
| 214 | #undef ipred_dir_tm_h_funcs | ||
| 215 | #undef ipred_dir_tm_funcs | ||
| 216 | #undef ipred_dc_funcs | ||
| 217 | |||
| 218 | #endif /* HAVE_X86ASM */ | ||
| 219 | |||
| 220 | 674 | /* Stores x86 SIMD routines into the function tables of dsp. The CPU-feature blocks run in a fixed order (MMX, MMXEXT, SSE, SSE2, SSSE3, AVX, AVX2, AVX512ICL), so when several flags are set a later block overwrites table entries written by an earlier one. With HAVE_X86ASM unset the body is empty. */ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact) | |
| 221 | { | ||
| 222 | #if HAVE_X86ASM | ||
| 223 | int cpu_flags; | ||
| 224 | |||
| 225 | 2/2✓ Branch 0 taken 81 times. ✓ Branch 1 taken 593 times. | 674 | /* bpp 10 and 12 are handed to their own init functions; nothing below runs for them */ if (bpp == 10) { | 
| 226 | 81 | ff_vp9dsp_init_10bpp_x86(dsp, bitexact); | |
| 227 | 81 | return; | |
| 228 | 2/2✓ Branch 0 taken 75 times. ✓ Branch 1 taken 518 times. | 593 | } else if (bpp == 12) { | 
| 229 | 75 | ff_vp9dsp_init_12bpp_x86(dsp, bitexact); | |
| 230 | 75 | return; | |
| 231 | } | ||
| 232 | |||
| 233 | 518 | /* every other bpp value reaches this point */ cpu_flags = av_get_cpu_flags(); | |
| 234 | |||
| 235 | #define init_lpf(opt) /* fills both loop_filter_16 entries and all eight loop_filter_mix2 entries with the OPT routines; last index 0 gets the _h_ routine, 1 the _v_ routine */ do { \ | ||
| 236 | dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_##opt; \ | ||
| 237 | dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \ | ||
| 238 | dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \ | ||
| 239 | dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \ | ||
| 240 | dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \ | ||
| 241 | dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \ | ||
| 242 | dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \ | ||
| 243 | dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \ | ||
| 244 | dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \ | ||
| 245 | dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \ | ||
| 246 | } while (0) | ||
| 247 | |||
| 248 | #define init_ipred(sz, opt, t, e) /* one intra_pred entry: row TX_SZxSZ, column E_PRED, routine ff_vp9_ipred_T_SZxSZ_OPT */ \ | ||
| 249 | dsp->intra_pred[TX_##sz##X##sz][e##_PRED] = ff_vp9_ipred_##t##_##sz##x##sz##_##opt | ||
| 250 | |||
| 251 | #define ff_vp9_ipred_hd_4x4_ssse3 ff_vp9_ipred_hd_4x4_mmxext /* the FIXME next to the prototypes says hd/vl 4x4 ssse3 do not exist, so both names are redirected to the mmxext routines for init_all_ipred(4, ssse3) */ | ||
| 252 | #define ff_vp9_ipred_vl_4x4_ssse3 ff_vp9_ipred_vl_4x4_mmxext | ||
| 253 | #define init_dir_tm_ipred(sz, opt) /* dl, dr, hd, vl, hu, tm and vr entries for one size */ do { \ | ||
| 254 | init_ipred(sz, opt, dl, DIAG_DOWN_LEFT); \ | ||
| 255 | init_ipred(sz, opt, dr, DIAG_DOWN_RIGHT); \ | ||
| 256 | init_ipred(sz, opt, hd, HOR_DOWN); \ | ||
| 257 | init_ipred(sz, opt, vl, VERT_LEFT); \ | ||
| 258 | init_ipred(sz, opt, hu, HOR_UP); \ | ||
| 259 | init_ipred(sz, opt, tm, TM_VP8); \ | ||
| 260 | init_ipred(sz, opt, vr, VERT_RIGHT); \ | ||
| 261 | } while (0) | ||
| 262 | #define init_dir_tm_h_ipred(sz, opt) /* init_dir_tm_ipred plus the h (HOR) entry */ do { \ | ||
| 263 | init_dir_tm_ipred(sz, opt); \ | ||
| 264 | init_ipred(sz, opt, h, HOR); \ | ||
| 265 | } while (0) | ||
| 266 | #define init_dc_ipred(sz, opt) /* dc, dc_left and dc_top entries for one size */ do { \ | ||
| 267 | init_ipred(sz, opt, dc, DC); \ | ||
| 268 | init_ipred(sz, opt, dc_left, LEFT_DC); \ | ||
| 269 | init_ipred(sz, opt, dc_top, TOP_DC); \ | ||
| 270 | } while (0) | ||
| 271 | #define init_all_ipred(sz, opt) /* every entry above; the v (VERT) entry is not part of it */ do { \ | ||
| 272 | init_dc_ipred(sz, opt); \ | ||
| 273 | init_dir_tm_h_ipred(sz, opt); \ | ||
| 274 | } while (0) | ||
| 275 | |||
| 276 | 2/2✓ Branch 0 taken 62 times. ✓ Branch 1 taken 456 times. | 518 | /* init_fpel_func, init_subpel* and the EXTERNAL_* checks are macros from headers not shown in this listing */ if (EXTERNAL_MMX(cpu_flags)) { | 
| 277 | 62 | init_fpel_func(4, 0, 4, put, , mmx); | |
| 278 | 62 | init_fpel_func(3, 0, 8, put, , mmx); | |
| 279 | 1/2✓ Branch 0 taken 62 times. ✗ Branch 1 not taken. | 62 | /* the mmx iwht routine is installed only when bit-exact output is not requested */ if (!bitexact) { | 
| 280 | 62 | dsp->itxfm_add[4 /* lossless */][DCT_DCT] = | |
| 281 | 62 | dsp->itxfm_add[4 /* lossless */][ADST_DCT] = | |
| 282 | 62 | dsp->itxfm_add[4 /* lossless */][DCT_ADST] = | |
| 283 | 62 | dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_mmx; | |
| 284 | } | ||
| 285 | 62 | init_ipred(8, mmx, v, VERT); | |
| 286 | } | ||
| 287 | |||
| 288 | 2/2✓ Branch 0 taken 57 times. ✓ Branch 1 taken 461 times. | 518 | if (EXTERNAL_MMXEXT(cpu_flags)) { | 
| 289 | 57 | dsp->loop_filter_8[0][0] = ff_vp9_loop_filter_h_4_8_mmxext; | |
| 290 | 57 | dsp->loop_filter_8[0][1] = ff_vp9_loop_filter_v_4_8_mmxext; | |
| 291 | 57 | dsp->loop_filter_8[1][0] = ff_vp9_loop_filter_h_8_8_mmxext; | |
| 292 | 57 | dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext; | |
| 293 | 57 | init_subpel2(4, 0, 4, put, 8, mmxext); | |
| 294 | 57 | init_subpel2(4, 1, 4, avg, 8, mmxext); | |
| 295 | 57 | init_fpel_func(4, 1, 4, avg, _8, mmxext); | |
| 296 | 57 | init_fpel_func(3, 1, 8, avg, _8, mmxext); | |
| 297 | 57 | dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext; | |
| 298 | 57 | init_dc_ipred(4, mmxext); | |
| 299 | 57 | init_dc_ipred(8, mmxext); | |
| 300 | 57 | init_dir_tm_ipred(4, mmxext); | |
| 301 | } | ||
| 302 | |||
| 303 | 2/2✓ Branch 0 taken 52 times. ✓ Branch 1 taken 466 times. | 518 | if (EXTERNAL_SSE(cpu_flags)) { | 
| 304 | 52 | init_fpel_func(2, 0, 16, put, , sse); | |
| 305 | 52 | init_fpel_func(1, 0, 32, put, , sse); | |
| 306 | 52 | init_fpel_func(0, 0, 64, put, , sse); | |
| 307 | 52 | init_ipred(16, sse, v, VERT); | |
| 308 | 52 | init_ipred(32, sse, v, VERT); | |
| 309 | } | ||
| 310 | |||
| 311 | 2/2✓ Branch 0 taken 47 times. ✓ Branch 1 taken 471 times. | 518 | if (EXTERNAL_SSE2(cpu_flags)) { | 
| 312 | 47 | init_subpel3_8to64(0, put, 8, sse2); | |
| 313 | 47 | init_subpel3_8to64(1, avg, 8, sse2); | |
| 314 | 47 | init_fpel_func(2, 1, 16, avg, _8, sse2); | |
| 315 | 47 | init_fpel_func(1, 1, 32, avg, _8, sse2); | |
| 316 | 47 | init_fpel_func(0, 1, 64, avg, _8, sse2); | |
| 317 | 47 | init_lpf(sse2); | |
| 318 | 47 | /* here and in the later blocks the routine name lists idct/iadst in the opposite order to the ADST_DCT / DCT_ADST index it is stored under */ dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_sse2; | |
| 319 | 47 | dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_sse2; | |
| 320 | 47 | dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_sse2; | |
| 321 | 47 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_sse2; | |
| 322 | 47 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_sse2; | |
| 323 | 47 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_sse2; | |
| 324 | 47 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_sse2; | |
| 325 | 47 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_sse2; | |
| 326 | 47 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_sse2; | |
| 327 | 47 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_sse2; | |
| 328 | 47 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_sse2; | |
| 329 | 47 | /* the single idct_idct 32x32 routine is stored in all four 32x32 slots */ dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
| 330 | 47 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
| 331 | 47 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
| 332 | 47 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_sse2; | |
| 333 | 47 | init_dc_ipred(16, sse2); | |
| 334 | 47 | init_dc_ipred(32, sse2); | |
| 335 | 47 | init_dir_tm_h_ipred(8, sse2); | |
| 336 | 47 | init_dir_tm_h_ipred(16, sse2); | |
| 337 | 47 | init_dir_tm_h_ipred(32, sse2); | |
| 338 | 47 | init_ipred(4, sse2, h, HOR); | |
| 339 | } | ||
| 340 | |||
| 341 | 2/2✓ Branch 0 taken 37 times. ✓ Branch 1 taken 481 times. | 518 | /* rewrites the itxfm_add, loop-filter and intra_pred entries set by the mmxext and sse2 blocks above */ if (EXTERNAL_SSSE3(cpu_flags)) { | 
| 342 | 37 | init_subpel3(0, put, 8, ssse3); | |
| 343 | 37 | init_subpel3(1, avg, 8, ssse3); | |
| 344 | 37 | dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3; | |
| 345 | 37 | dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_ssse3; | |
| 346 | 37 | dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_ssse3; | |
| 347 | 37 | dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_ssse3; | |
| 348 | 37 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3; | |
| 349 | 37 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_ssse3; | |
| 350 | 37 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_ssse3; | |
| 351 | 37 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_ssse3; | |
| 352 | 37 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3; | |
| 353 | 37 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_ssse3; | |
| 354 | 37 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_ssse3; | |
| 355 | 37 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_ssse3; | |
| 356 | 37 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
| 357 | 37 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
| 358 | 37 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
| 359 | 37 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3; | |
| 360 | 37 | init_lpf(ssse3); | |
| 361 | 37 | /* for size 4 the hd and vl entries resolve to the mmxext routines through the two #define aliases above */ init_all_ipred(4, ssse3); | |
| 362 | 37 | init_all_ipred(8, ssse3); | |
| 363 | 37 | init_all_ipred(16, ssse3); | |
| 364 | 37 | init_all_ipred(32, ssse3); | |
| 365 | } | ||
| 366 | |||
| 367 | 2/2✓ Branch 0 taken 17 times. ✓ Branch 1 taken 501 times. | 518 | if (EXTERNAL_AVX(cpu_flags)) { | 
| 368 | 17 | init_lpf(avx); | |
| 369 | 17 | init_dir_tm_h_ipred(8, avx); | |
| 370 | 17 | init_dir_tm_h_ipred(16, avx); | |
| 371 | 17 | init_dir_tm_h_ipred(32, avx); | |
| 372 | } | ||
| 373 | 3/4✓ Branch 0 taken 17 times. ✓ Branch 1 taken 501 times. ✓ Branch 2 taken 17 times. ✗ Branch 3 not taken. | 518 | /* the 32- and 64-wide avx put routines sit behind the separate AVX_FAST check rather than plain AVX */ if (EXTERNAL_AVX_FAST(cpu_flags)) { | 
| 374 | 17 | init_fpel_func(1, 0, 32, put, , avx); | |
| 375 | 17 | init_fpel_func(0, 0, 64, put, , avx); | |
| 376 | } | ||
| 377 | |||
| 378 | 3/4✓ Branch 0 taken 7 times. ✓ Branch 1 taken 511 times. ✓ Branch 2 taken 7 times. ✗ Branch 3 not taken. | 518 | if (EXTERNAL_AVX2_FAST(cpu_flags)) { | 
| 379 | 7 | init_fpel_func(1, 1, 32, avg, _8, avx2); | |
| 380 | 7 | init_fpel_func(0, 1, 64, avg, _8, avx2); | |
| 381 | /* the run-time style test is paired with a preprocessor guard that also requires HAVE_AVX2_EXTERNAL, so the guarded statements are not compiled at all otherwise */ if (ARCH_X86_64) { | ||
| 382 | #if ARCH_X86_64 && HAVE_AVX2_EXTERNAL | ||
| 383 | 7 | /* unlike the mmx block, the avx2 iwht routine is installed without checking bitexact */ dsp->itxfm_add[4 /* lossless */][DCT_DCT] = | |
| 384 | 7 | dsp->itxfm_add[4 /* lossless */][ADST_DCT] = | |
| 385 | 7 | dsp->itxfm_add[4 /* lossless */][DCT_ADST] = | |
| 386 | 7 | dsp->itxfm_add[4 /* lossless */][ADST_ADST] = ff_vp9_iwht_iwht_4x4_add_avx2; | |
| 387 | 7 | dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_avx2; | |
| 388 | 7 | dsp->itxfm_add[TX_4X4][ADST_DCT] = ff_vp9_idct_iadst_4x4_add_avx2; | |
| 389 | 7 | dsp->itxfm_add[TX_4X4][DCT_ADST] = ff_vp9_iadst_idct_4x4_add_avx2; | |
| 390 | 7 | dsp->itxfm_add[TX_4X4][ADST_ADST] = ff_vp9_iadst_iadst_4x4_add_avx2; | |
| 391 | 7 | dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_avx2; | |
| 392 | 7 | dsp->itxfm_add[TX_8X8][ADST_DCT] = ff_vp9_idct_iadst_8x8_add_avx2; | |
| 393 | 7 | dsp->itxfm_add[TX_8X8][DCT_ADST] = ff_vp9_iadst_idct_8x8_add_avx2; | |
| 394 | 7 | dsp->itxfm_add[TX_8X8][ADST_ADST] = ff_vp9_iadst_iadst_8x8_add_avx2; | |
| 395 | 7 | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx2; | |
| 396 | 7 | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx2; | |
| 397 | 7 | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx2; | |
| 398 | 7 | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx2; | |
| 399 | 7 | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
| 400 | 7 | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
| 401 | 7 | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
| 402 | 7 | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx2; | |
| 403 | 7 | init_subpel3_32_64(0, put, 8, avx2); | |
| 404 | 7 | init_subpel3_32_64(1, avg, 8, avx2); | |
| 405 | #endif | ||
| 406 | } | ||
| 407 | 7 | /* the 32x32 intra predictors are set outside the 64-bit-only section */ init_all_ipred(32, avx2); | |
| 408 | 7 | init_ipred(32, avx2, v, VERT); | |
| 409 | } | ||
| 410 | |||
| 411 | #if ARCH_X86_64 /* the AVX512ICL block is compiled only in 64-bit builds */ | ||
| 412 | 1/2✗ Branch 0 not taken. ✓ Branch 1 taken 518 times. | 518 | /* last block, so its 16x16 / 32x32 transform and subpel entries win over all earlier ones */ if (EXTERNAL_AVX512ICL(cpu_flags)) { | 
| 413 | ✗ | dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_avx512icl; | |
| 414 | ✗ | dsp->itxfm_add[TX_16X16][ADST_DCT] = ff_vp9_idct_iadst_16x16_add_avx512icl; | |
| 415 | ✗ | dsp->itxfm_add[TX_16X16][DCT_ADST] = ff_vp9_iadst_idct_16x16_add_avx512icl; | |
| 416 | ✗ | dsp->itxfm_add[TX_16X16][ADST_ADST] = ff_vp9_iadst_iadst_16x16_add_avx512icl; | |
| 417 | ✗ | dsp->itxfm_add[TX_32X32][ADST_ADST] = | |
| 418 | ✗ | dsp->itxfm_add[TX_32X32][ADST_DCT] = | |
| 419 | ✗ | dsp->itxfm_add[TX_32X32][DCT_ADST] = | |
| 420 | ✗ | dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_avx512icl; | |
| 421 | ✗ | init_subpel_asm(4, 4, 8, avx512icl); | |
| 422 | ✗ | init_subpel_asm(3, 8, 8, avx512icl); | |
| 423 | ✗ | init_subpel_asm(2, 16, 8, avx512icl); | |
| 424 | ✗ | init_subpel_asm(1, 32, 8, avx512icl); | |
| 425 | ✗ | init_subpel_asm(0, 64, 8, avx512icl); | |
| 426 | } | ||
| 427 | #endif | ||
| 428 | |||
| 429 | #undef init_fpel /* NOTE(review): this function only uses init_fpel_func, never init_fpel, so this #undef looks stale - confirm against the header that defines the macro */ | ||
| 430 | #undef init_subpel1 | ||
| 431 | #undef init_subpel2 | ||
| 432 | #undef init_subpel3 | ||
| 433 | |||
| 434 | #endif /* HAVE_X86ASM */ | ||
| 435 | } | ||
| 436 |