/*
 * VC-1 and WMV3 - DSP functions MMX-optimized
 * Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "libavutil/attributes.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "fpel.h"
#include "vc1dsp.h"
#if HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL

void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst,
                                   const uint8_t *src, x86_reg stride,
                                   int rnd, int64_t shift);
void ff_vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
                                   const int16_t *src, int rnd);
void ff_vc1_avg_hor_16b_shift2_mmxext(uint8_t *dst, x86_reg stride,
                                      const int16_t *src, int rnd);
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
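
/* OP_PUT is deliberately empty (the result in mm3 is stored as-is), while
 * OP_AVG first takes the rounding byte average of the result with the bytes
 * already at the destination via pavgb; together they produce the put_/avg_
 * variants instantiated below. */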

/** Add the rounder held in mm7 to mm3 and mm4, then shift to normalize
 *  (the packing and store happen in the TRANSFER_* macros below) */
#define NORMALIZE_MMX(SHIFT) \
     "paddw     %%mm7, %%mm3           \n\t" /* +bias-r */ \
     "paddw     %%mm7, %%mm4           \n\t" /* +bias-r */ \
     "psraw     "SHIFT", %%mm3         \n\t" \
     "psraw     "SHIFT", %%mm4         \n\t"

#define TRANSFER_DO_PACK(OP) \
     "packuswb  %%mm4, %%mm3           \n\t" \
     OP((%2), %%mm3) \
     "movq      %%mm3, (%2)            \n\t"

#define TRANSFER_DONT_PACK(OP) \
     OP(0(%2), %%mm3) \
     OP(8(%2), %%mm4) \
     "movq      %%mm3, 0(%2)           \n\t" \
     "movq      %%mm4, 8(%2)           \n\t"

/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg)   "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)

/** Load the rounder (32-r or 8-r) and broadcast it to all 4 words of mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
     "movd      "ROUND", %%mm7         \n\t" \
     "punpcklwd %%mm7, %%mm7           \n\t" \
     "punpckldq %%mm7, %%mm7           \n\t"

/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice mm6 for *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME) \
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src, \
                                     x86_reg stride, int rnd, x86_reg offset) \
{ \
    rnd = 8-rnd; \
    __asm__ volatile( \
        "mov       $8, %%"FF_REG_c"        \n\t" \
        LOAD_ROUNDER_MMX("%5") \
        "movq      "MANGLE(ff_pw_9)", %%mm6\n\t" \
        "1:                                \n\t" \
        "movd      0(%0   ), %%mm3         \n\t" \
        "movd      4(%0   ), %%mm4         \n\t" \
        "movd      0(%0,%2), %%mm1         \n\t" \
        "movd      4(%0,%2), %%mm2         \n\t" \
        "add       %2, %0                  \n\t" \
        "punpcklbw %%mm0, %%mm3            \n\t" \
        "punpcklbw %%mm0, %%mm4            \n\t" \
        "punpcklbw %%mm0, %%mm1            \n\t" \
        "punpcklbw %%mm0, %%mm2            \n\t" \
        "paddw     %%mm1, %%mm3            \n\t" \
        "paddw     %%mm2, %%mm4            \n\t" \
        "movd      0(%0,%3), %%mm1         \n\t" \
        "movd      4(%0,%3), %%mm2         \n\t" \
        "pmullw    %%mm6, %%mm3            \n\t" /* 0,9,9,0 */ \
        "pmullw    %%mm6, %%mm4            \n\t" /* 0,9,9,0 */ \
        "punpcklbw %%mm0, %%mm1            \n\t" \
        "punpcklbw %%mm0, %%mm2            \n\t" \
        "psubw     %%mm1, %%mm3            \n\t" /* -1,9,9,0 */ \
        "psubw     %%mm2, %%mm4            \n\t" /* -1,9,9,0 */ \
        "movd      0(%0,%2), %%mm1         \n\t" \
        "movd      4(%0,%2), %%mm2         \n\t" \
        "punpcklbw %%mm0, %%mm1            \n\t" \
        "punpcklbw %%mm0, %%mm2            \n\t" \
        "psubw     %%mm1, %%mm3            \n\t" /* -1,9,9,-1 */ \
        "psubw     %%mm2, %%mm4            \n\t" /* -1,9,9,-1 */ \
        NORMALIZE_MMX("$4") \
        "packuswb  %%mm4, %%mm3            \n\t" \
        OP((%1), %%mm3) \
        "movq      %%mm3, (%1)             \n\t" \
        "add       %6, %0                  \n\t" \
        "add       %4, %1                  \n\t" \
        "dec       %%"FF_REG_c"            \n\t" \
        "jnz 1b                            \n\t" \
        : "+r"(src), "+r"(dst) \
        : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd), \
          "g"(stride-offset) \
          NAMED_CONSTRAINTS_ADD(ff_pw_9) \
        : "%"FF_REG_c, "memory" \
    ); \
}

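/* Scalar sketch (illustration only, not compiled) of one output pixel of the
 * 1/2-shift filter above, where d is the sample step (1 or stride) and
 * r = 8 - rnd:
 *     dst[x] = av_clip_uint8((-src[x-d] + 9*src[x] + 9*src[x+d]
 *                             - src[x+2*d] + r) >> 4);
 */
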
VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK  Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param MOVQ    "movd 1" to load and unpack bytes, or "movq 2" if the data
 *                read is already unpacked (the number scales the tap offsets).
 * @param A1      Address of 1st tap (beware of unpacked/packed).
 * @param A2      Address of 2nd tap.
 * @param A3      Address of 3rd tap.
 * @param A4      Address of 4th tap.
 */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
     MOVQ "*0+"A1", %%mm1       \n\t" \
     MOVQ "*4+"A1", %%mm2       \n\t" \
     UNPACK("%%mm1") \
     UNPACK("%%mm2") \
     "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t" \
     "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t" \
     MOVQ "*0+"A2", %%mm3       \n\t" \
     MOVQ "*4+"A2", %%mm4       \n\t" \
     UNPACK("%%mm3") \
     UNPACK("%%mm4") \
     "pmullw    %%mm6, %%mm3    \n\t" /* *18 */ \
     "pmullw    %%mm6, %%mm4    \n\t" /* *18 */ \
     "psubw     %%mm1, %%mm3    \n\t" /* 18,-3 */ \
     "psubw     %%mm2, %%mm4    \n\t" /* 18,-3 */ \
     MOVQ "*0+"A4", %%mm1       \n\t" \
     MOVQ "*4+"A4", %%mm2       \n\t" \
     UNPACK("%%mm1") \
     UNPACK("%%mm2") \
     "psllw     $2, %%mm1       \n\t" /* 4* */ \
     "psllw     $2, %%mm2       \n\t" /* 4* */ \
     "psubw     %%mm1, %%mm3    \n\t" /* -4,18,-3 */ \
     "psubw     %%mm2, %%mm4    \n\t" /* -4,18,-3 */ \
     MOVQ "*0+"A3", %%mm1       \n\t" \
     MOVQ "*4+"A3", %%mm2       \n\t" \
     UNPACK("%%mm1") \
     UNPACK("%%mm2") \
     "pmullw    %%mm5, %%mm1    \n\t" /* *53 */ \
     "pmullw    %%mm5, %%mm2    \n\t" /* *53 */ \
     "paddw     %%mm1, %%mm3    \n\t" /* -4,53,18,-3 */ \
     "paddw     %%mm2, %%mm4    \n\t" /* -4,53,18,-3 */

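/* In scalar terms the core accumulates, in mm3/mm4 (the caller must preload
 * mm5 = 53 and mm6 = 18):
 *     acc = -3*A1 + 18*A2 + 53*A3 - 4*A4
 * which, with A1..A4 ordered as in the instantiations below, is the VC-1
 * bicubic tap set (-4, 53, 18, -3). */
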
/**
 * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (src_stride) and %4 (3*src_stride).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
                                 x86_reg src_stride, \
                                 int rnd, int64_t shift) \
{ \
    int h = 8; \
    src -= src_stride; \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%5") \
        "movq      "MANGLE(ff_pw_53)", %%mm5\n\t" \
        "movq      "MANGLE(ff_pw_18)", %%mm6\n\t" \
        ".p2align 3                \n\t" \
        "1:                        \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("%6") \
        TRANSFER_DONT_PACK(OP_PUT) \
        /* Last 3 (in fact 4) bytes on the line */ \
        "movd      8+"A1", %%mm1   \n\t" \
        DO_UNPACK("%%mm1") \
        "movq      %%mm1, %%mm3    \n\t" \
        "paddw     %%mm1, %%mm1    \n\t" \
        "paddw     %%mm3, %%mm1    \n\t" /* 3* */ \
        "movd      8+"A2", %%mm3   \n\t" \
        DO_UNPACK("%%mm3") \
        "pmullw    %%mm6, %%mm3    \n\t" /* *18 */ \
        "psubw     %%mm1, %%mm3    \n\t" /* 18,-3 */ \
        "movd      8+"A3", %%mm1   \n\t" \
        DO_UNPACK("%%mm1") \
        "pmullw    %%mm5, %%mm1    \n\t" /* *53 */ \
        "paddw     %%mm1, %%mm3    \n\t" /* 53,18,-3 */ \
        "movd      8+"A4", %%mm1   \n\t" \
        DO_UNPACK("%%mm1") \
        "psllw     $2, %%mm1       \n\t" /* 4* */ \
        "psubw     %%mm1, %%mm3    \n\t" \
        "paddw     %%mm7, %%mm3    \n\t" \
        "psraw     %6, %%mm3       \n\t" \
        "movq      %%mm3, 16(%2)   \n\t" \
        "add       %3, %1          \n\t" \
        "add       $24, %2         \n\t" \
        "decl      %0              \n\t" \
        "jnz 1b                    \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(src_stride), "r"(3*src_stride), \
          "m"(rnd), "m"(shift) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18) \
        : "memory" \
    ); \
}

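/* Note on the bias handling in the horizontal pass below: the tap sum is 64,
 * so 64*256 is pre-subtracted from the rounder ("Add -256 bias"); after the
 * >>7 that amounts to exactly -128, which the ff_pw_128 additions put back
 * before packing, keeping the 16-bit multiply-accumulate in signed range. */
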
/**
 * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
 * Here the offset is in 16-bit units, so the parameters passed as A1 to A4
 * can be simple displacements.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
                                       const int16_t *src, int rnd) \
{ \
    int h = 8; \
    src -= 1; \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
    __asm__ volatile( \
        LOAD_ROUNDER_MMX("%4") \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t" \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t" \
        ".p2align 3                \n\t" \
        "1:                        \n\t" \
        MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
        NORMALIZE_MMX("$7") \
        /* Remove bias */ \
        "paddw     "MANGLE(ff_pw_128)", %%mm3  \n\t" \
        "paddw     "MANGLE(ff_pw_128)", %%mm4  \n\t" \
        TRANSFER_DO_PACK(OP) \
        "add       $24, %1         \n\t" \
        "add       %3, %2          \n\t" \
        "decl      %0              \n\t" \
        "jnz 1b                    \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128) \
        : "memory" \
    ); \
}

/**
 * Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
 * Here, offset=src_stride. Parameters passed A1 to A4 must use
 * %3 (offset) and %4 (3*offset).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_ ## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
                               x86_reg stride, int rnd, x86_reg offset) \
{ \
    int h = 8; \
    src -= offset; \
    rnd = 32-rnd; \
    __asm__ volatile ( \
        LOAD_ROUNDER_MMX("%6") \
        "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t" \
        "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t" \
        ".p2align 3                \n\t" \
        "1:                        \n\t" \
        MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
        NORMALIZE_MMX("$6") \
        TRANSFER_DO_PACK(OP) \
        "add       %5, %1          \n\t" \
        "add       %5, %2          \n\t" \
        "decl      %0              \n\t" \
        "jnz 1b                    \n\t" \
        : "+r"(h), "+r" (src), "+r" (dst) \
        : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
          NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3) \
        : "memory" \
    ); \
}

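/* Scalar sketch (illustration only, not compiled) of one output pixel of the
 * 8-bit 1/4-shift path, with s the sample step and r = 32 - rnd as set above:
 *     dst[x] = av_clip_uint8((-4*src[x-s] + 53*src[x] + 18*src[x+s]
 *                             - 3*src[x+2*s] + r) >> 6);
 * The 3/4 shift uses the same taps mirrored. */
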
/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B     (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);

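/* The three pointer types above match the three filter stages dispatched
 * from the tables in VC1_MSPEL_MC below: vertical 8->16 bit, horizontal
 * 16->8 bit, and the single-pass 8-bit case. */
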
/**
 * Interpolate fractional pel values by applying proper vertical then
 * horizontal filter.
 *
 * @param dst    Destination buffer for interpolated pels.
 * @param src    Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode  Horizontal filter (expressed in quarter pixels shift).
 * @param vmode  Vertical filter.
 * @param rnd    Rounding bias.
 */
#define VC1_MSPEL_MC(OP, INSTR) \
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride, \
                               int hmode, int vmode, int rnd) \
{ \
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] = \
        { NULL, vc1_put_ver_16b_shift1_mmx, ff_vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx }; \
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] = \
        { NULL, OP ## vc1_hor_16b_shift1_mmx, ff_vc1_ ## OP ## hor_16b_shift2_ ## INSTR, OP ## vc1_hor_16b_shift3_mmx }; \
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] = \
        { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx }; \
\
    __asm__ volatile( \
        "pxor %%mm0, %%mm0         \n\t" \
        ::: "memory" \
    ); \
\
    if (vmode) { /* Vertical filter to apply */ \
        if (hmode) { /* Horizontal filter to apply, output to tmp */ \
            static const int shift_value[] = { 0, 5, 1, 5 }; \
            int shift = (shift_value[hmode]+shift_value[vmode])>>1; \
            int r; \
            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]); \
\
            r = (1<<(shift-1)) + rnd-1; \
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift); \
\
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd); \
            return; \
        } \
        else { /* No horizontal filter, output 8 lines to dst */ \
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride); \
            return; \
        } \
    } \
\
    /* Horizontal mode with no vertical mode */ \
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1); \
} \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
                                  int stride, int hmode, int vmode, int rnd) \
{ \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
    dst += 8*stride; src += 8*stride; \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
}

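/* Instantiate the dispatcher twice: put_ uses plain MMX stores, while avg_
 * needs pavgb and the external MMXEXT horizontal second pass. The *_mc_16
 * wrappers above tile a 16x16 block out of four 8x8 calls. */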
VC1_MSPEL_MC(put_, mmx)
VC1_MSPEL_MC(avg_, mmxext)

/** Macro to ease declaring the bicubic filter interpolation functions */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
                                               const uint8_t *src, \
                                               ptrdiff_t stride, \
                                               int rnd) \
{ \
     put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
} \
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
     avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
} \
static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst, \
                                                  const uint8_t *src, \
                                                  ptrdiff_t stride, \
                                                  int rnd) \
{ \
     put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
} \
static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst, \
                                                     const uint8_t *src, \
                                                     ptrdiff_t stride, \
                                                     int rnd) \
{ \
     avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd); \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

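/* FN_ASSIGN fills both block sizes of the dispatch table: index [0] gets the
 * 16x16 function and index [1] the 8x8 one, at slot X+4*Y for horizontal
 * shift X and vertical shift Y. */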
#define FN_ASSIGN(OP, X, Y, INSN) \
    dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
    dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN

av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
    FN_ASSIGN(put_, 0, 1, _mmx);
    FN_ASSIGN(put_, 0, 2, _mmx);
    FN_ASSIGN(put_, 0, 3, _mmx);

    FN_ASSIGN(put_, 1, 0, _mmx);
    FN_ASSIGN(put_, 1, 1, _mmx);
    FN_ASSIGN(put_, 1, 2, _mmx);
    FN_ASSIGN(put_, 1, 3, _mmx);

    FN_ASSIGN(put_, 2, 0, _mmx);
    FN_ASSIGN(put_, 2, 1, _mmx);
    FN_ASSIGN(put_, 2, 2, _mmx);
    FN_ASSIGN(put_, 2, 3, _mmx);

    FN_ASSIGN(put_, 3, 0, _mmx);
    FN_ASSIGN(put_, 3, 1, _mmx);
    FN_ASSIGN(put_, 3, 2, _mmx);
    FN_ASSIGN(put_, 3, 3, _mmx);
}

av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
    FN_ASSIGN(avg_, 0, 1, _mmxext);
    FN_ASSIGN(avg_, 0, 2, _mmxext);
    FN_ASSIGN(avg_, 0, 3, _mmxext);

    FN_ASSIGN(avg_, 1, 0, _mmxext);
    FN_ASSIGN(avg_, 1, 1, _mmxext);
    FN_ASSIGN(avg_, 1, 2, _mmxext);
    FN_ASSIGN(avg_, 1, 3, _mmxext);

    FN_ASSIGN(avg_, 2, 0, _mmxext);
    FN_ASSIGN(avg_, 2, 1, _mmxext);
    FN_ASSIGN(avg_, 2, 2, _mmxext);
    FN_ASSIGN(avg_, 2, 3, _mmxext);

    FN_ASSIGN(avg_, 3, 0, _mmxext);
    FN_ASSIGN(avg_, 3, 1, _mmxext);
    FN_ASSIGN(avg_, 3, 2, _mmxext);
    FN_ASSIGN(avg_, 3, 3, _mmxext);
}
#endif /* HAVE_6REGS && HAVE_INLINE_ASM && HAVE_MMX_EXTERNAL */