| Line | Branch | Exec | Source |
|---|---|---|---|
| 1 | /* | ||
| 2 | * This file is part of FFmpeg. | ||
| 3 | * | ||
| 4 | * FFmpeg is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU Lesser General Public | ||
| 6 | * License as published by the Free Software Foundation; either | ||
| 7 | * version 2.1 of the License, or (at your option) any later version. | ||
| 8 | * | ||
| 9 | * FFmpeg is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 12 | * Lesser General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU Lesser General Public | ||
| 15 | * License along with FFmpeg; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
| 17 | */ | ||
| 18 | |||
| 19 | #include "config.h" | ||
| 20 | #include "libavutil/attributes.h" | ||
| 21 | #include "libavutil/cpu.h" | ||
| 22 | #include "libavutil/mem_internal.h" | ||
| 23 | #include "libavutil/x86/asm.h" | ||
| 24 | #include "libavutil/x86/cpu.h" | ||
| 25 | #include "libavcodec/mpeg4videodsp.h" | ||
| 26 | #include "videodsp.h" | ||
| 27 | |||
| 28 | #if HAVE_SSSE3_INLINE | ||
| 29 | |||
| 30 | #define SPLATW(reg) "pshuflw $0, %%" #reg ", %%" #reg "\n\t" \ | ||
| 31 | "punpcklqdq %%" #reg ", %%" #reg "\n\t" | ||
| 32 | |||
| 33 | typedef struct { | ||
| 34 | DECLARE_ALIGNED_16(uint16_t, u16)[8]; | ||
| 35 | } xmm_u16; | ||
| 36 | |||
| 37 | DECLARE_ASM_CONST(16, xmm_u16, pw_0to7) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; | ||
| 38 | |||
| 39 | 2 | static void gmc_ssse3(uint8_t *dst, const uint8_t *src, | |
| 40 | int stride, int h, int ox, int oy, | ||
| 41 | int dxx, int dxy, int dyx, int dyy, | ||
| 42 | int shift, int r, int width, int height) | ||
| 43 | { | ||
| 44 | enum { | ||
| 45 | W = 8, | ||
| 46 | EDGE_EMU_STRIDE = 16, //< anything >= W+1 will do | ||
| 47 | MAX_H = 16, | ||
| 48 | }; | ||
| 49 | 2 | const int w = 8; | |
| 50 | 2 | const int ix = ox >> (16 + shift); | |
| 51 | 2 | const int iy = oy >> (16 + shift); | |
| 52 | 2 | const int ox2 = ox & (1 << (16 + shift)) - 1; | |
| 53 | 2 | const int oy2 = oy & (1 << (16 + shift)) - 1; | |
| 54 | 2 | const int oxs = ox2 >> 4; | |
| 55 | 2 | const int oys = oy2 >> 4; | |
| 56 | 2 | const int dxx2 = dxx - (1 << (16 + shift)); | |
| 57 | 2 | const int dyy2 = dyy - (1 << (16 + shift)); | |
| 58 | 2 | const int dxxs = dxx2 >> 4; | |
| 59 | 2 | const int dxys = dxy >> 4; | |
| 60 | 2 | const int dyxs = dyx >> 4; | |
| 61 | 2 | const int dyys = dyy2 >> 4; | |
| 62 | |||
| 63 | 2 | const int dxw = dxx2 * (w - 1); | |
| 64 | 2 | const int dyh = dyy2 * (h - 1); | |
| 65 | 2 | const int dxh = dxy * (h - 1); | |
| 66 | 2 | const int dyw = dyx * (w - 1); | |
| 67 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | int need_emu = (unsigned) ix >= width - w || width < w || |
| 68 |
4/6✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
✓ Branch 2 taken 1 times.
✓ Branch 3 taken 1 times.
✗ Branch 4 not taken.
✓ Branch 5 taken 1 times.
|
4 | (unsigned) iy >= height - h || height< h |
| 69 | ; | ||
| 70 | |||
| 71 | 2 | if ( // non-constant fullpel offset (3% of blocks) | |
| 72 | 2 | ((ox2 + dxw) | (ox2 + dxh) | (ox2 + dxw + dxh) | | |
| 73 |
1/2✓ Branch 0 taken 2 times.
✗ Branch 1 not taken.
|
2 | (oy2 + dyw) | (oy2 + dyh) | (oy2 + dyw + dyh)) >> (16 + shift) || |
| 74 | // uses more than 16 bits of subpel mv (only at huge resolution) | ||
| 75 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2 times.
|
2 | (dxx | dxy | dyx | dyy) & 15 || |
| 76 | (!HAVE_SSE2_EXTERNAL && need_emu)) { | ||
| 77 | ✗ | ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, | |
| 78 | shift, r, width, height); | ||
| 79 | ✗ | return; | |
| 80 | } | ||
| 81 | |||
| 82 | 2 | src += ix + iy * stride; | |
| 83 | 2 | const ptrdiff_t dst_stride = stride; | |
| 84 | 2 | ptrdiff_t src_stride = stride; | |
| 85 | #if HAVE_SSE2_EXTERNAL | ||
| 86 | uint8_t edge_buf[(MAX_H + 1) * EDGE_EMU_STRIDE]; | ||
| 87 |
2/2✓ Branch 0 taken 1 times.
✓ Branch 1 taken 1 times.
|
2 | if (need_emu) { |
| 88 | 1 | ff_emulated_edge_mc_sse2(edge_buf, src, EDGE_EMU_STRIDE, src_stride, | |
| 89 | w + 1, h + 1, ix, iy, width, height); | ||
| 90 | 1 | src = edge_buf; | |
| 91 | 1 | src_stride = EDGE_EMU_STRIDE; | |
| 92 | } | ||
| 93 | #endif | ||
| 94 | |||
| 95 | #if ARCH_X86_32 | ||
| 96 | xmm_u16 dxy8, dyy8, r8; | ||
| 97 | DECLARE_ALIGNED_16(uint64_t, shift2) = 2 * shift; | ||
| 98 | #endif | ||
| 99 | |||
| 100 | 2 | __asm__ volatile ( | |
| 101 | "movd %[dxxs], %%xmm2 \n\t" | ||
| 102 | "movd %[dyxs], %%xmm3 \n\t" | ||
| 103 | "movd %[oxs], %%xmm1 \n\t" | ||
| 104 | SPLATW(xmm2) | ||
| 105 | "movd %[oys], %%xmm7 \n\t" | ||
| 106 | SPLATW(xmm3) | ||
| 107 | "pmullw "MANGLE(pw_0to7)", %%xmm2 \n\t" | ||
| 108 | SPLATW(xmm1) | ||
| 109 | "movd %[s], %%xmm6 \n\t" | ||
| 110 | "pmullw "MANGLE(pw_0to7)", %%xmm3 \n\t" | ||
| 111 | "movq (%[src]), %%xmm5 \n\t" | ||
| 112 | SPLATW(xmm7) | ||
| 113 | #if ARCH_X86_32 | ||
| 114 | "movd %[dxys], %%xmm0 \n\t" | ||
| 115 | #else | ||
| 116 | "movd %[dxys], %%xmm11 \n\t" | ||
| 117 | #endif | ||
| 118 | "paddw %%xmm2, %%xmm1 \n\t" | ||
| 119 | "movq 1(%[src]), %%xmm2 \n\t" | ||
| 120 | SPLATW(xmm6) | ||
| 121 | #if ARCH_X86_32 | ||
| 122 | "movd %[dyys], %%xmm4 \n\t" | ||
| 123 | #else | ||
| 124 | "movd %[dyys], %%xmm9 \n\t" | ||
| 125 | #endif | ||
| 126 | "paddw %%xmm3, %%xmm7 \n\t" | ||
| 127 | "punpcklbw %%xmm2, %%xmm5 \n\t" | ||
| 128 | #if ARCH_X86_32 | ||
| 129 | SPLATW(xmm0) | ||
| 130 | "movd %[r], %%xmm2 \n\t" | ||
| 131 | SPLATW(xmm4) | ||
| 132 | "movdqa %%xmm0, %[dxy8] \n\t" | ||
| 133 | SPLATW(xmm2) | ||
| 134 | "movdqa %%xmm4, %[dyy8] \n\t" | ||
| 135 | "movdqa %%xmm2, %[r8] \n\t" | ||
| 136 | #else | ||
| 137 | SPLATW(xmm11) | ||
| 138 | "movd %[r], %%xmm8 \n\t" | ||
| 139 | SPLATW(xmm9) | ||
| 140 | SPLATW(xmm8) | ||
| 141 | "movd %[shift2], %%xmm12 \n\t" | ||
| 142 | #endif | ||
| 143 | |||
| 144 | "1: \n\t" | ||
| 145 | "add %[src_stride], %[src] \n\t" | ||
| 146 | "movq (%[src]), %%xmm3 \n\t" | ||
| 147 | "movq 1(%[src]), %%xmm0 \n\t" | ||
| 148 | "movdqa %%xmm1, %%xmm4 \n\t" | ||
| 149 | "psrlw $12, %%xmm4 \n\t" // dx | ||
| 150 | "movdqa %%xmm6, %%xmm2 \n\t" | ||
| 151 | "psubw %%xmm4, %%xmm2 \n\t" // (s-dx) | ||
| 152 | "psllw $8, %%xmm4 \n\t" | ||
| 153 | "por %%xmm4, %%xmm2 \n\t" // s-dx,dx,s-dx,dx (bytes) | ||
| 154 | "pmaddubsw %%xmm2, %%xmm5 \n\t" // src[0, 0] * (s - dx) + src[1,0] * dx | ||
| 155 | "punpcklbw %%xmm0, %%xmm3 \n\t" | ||
| 156 | "movdqa %%xmm3, %%xmm0 \n\t" | ||
| 157 | "pmaddubsw %%xmm2, %%xmm3 \n\t" // src[0, 1] * (s - dx) + src[1,1] * dx | ||
| 158 | #if ARCH_X86_32 | ||
| 159 | "paddw %[dxy8], %%xmm1 \n\t" | ||
| 160 | #else | ||
| 161 | "paddw %%xmm11, %%xmm1 \n\t" | ||
| 162 | #endif | ||
| 163 | "movdqa %%xmm7, %%xmm4 \n\t" | ||
| 164 | "movdqa %%xmm6, %%xmm2 \n\t" | ||
| 165 | "psrlw $12, %%xmm4 \n\t" // dy | ||
| 166 | "psubw %%xmm4, %%xmm2 \n\t" // (s-dy) | ||
| 167 | "pmullw %%xmm5, %%xmm2 \n\t" // (src[0, 0] * (s - dx) + src[1,0] * dx) * (s - dy) | ||
| 168 | #if ARCH_X86_32 | ||
| 169 | "paddw %[dyy8], %%xmm7 \n\t" | ||
| 170 | #else | ||
| 171 | "paddw %%xmm9, %%xmm7 \n\t" | ||
| 172 | #endif | ||
| 173 | "pmullw %%xmm3, %%xmm4 \n\t" // (src[0, 1] * (s - dx) + src[1,1] * dx) * dy | ||
| 174 | |||
| 175 | #if ARCH_X86_32 | ||
| 176 | "paddw %[r8], %%xmm2 \n\t" | ||
| 177 | #else | ||
| 178 | "paddw %%xmm8, %%xmm2 \n\t" | ||
| 179 | #endif | ||
| 180 | "paddw %%xmm2, %%xmm4 \n\t" | ||
| 181 | |||
| 182 | #if ARCH_X86_32 | ||
| 183 | "psrlw %[shift2], %%xmm4 \n\t" | ||
| 184 | #else | ||
| 185 | "psrlw %%xmm12, %%xmm4 \n\t" | ||
| 186 | #endif | ||
| 187 | "packuswb %%xmm4, %%xmm4 \n\t" | ||
| 188 | "movq %%xmm4, (%[dst]) \n\t" | ||
| 189 | "movdqa %%xmm0, %%xmm5 \n\t" | ||
| 190 | "add %[dst_stride], %[dst] \n\t" | ||
| 191 | |||
| 192 | "decl %[h] \n\t" | ||
| 193 | "jnz 1b \n\t" | ||
| 194 | : [dst]"+r"(dst), [src]"+r"(src), | ||
| 195 | #if HAVE_6REGS || HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS | ||
| 196 | [h]"+r"(h) | ||
| 197 | #else | ||
| 198 | [h]"+m"(h) | ||
| 199 | #endif | ||
| 200 | #if ARCH_X86_32 | ||
| 201 | , [dxy8]"=m" (dxy8), [dyy8]"=m" (dyy8), [r8]"=m" (r8) | ||
| 202 | #endif | ||
| 203 | : [dst_stride]"r"(dst_stride), [src_stride]"r"(src_stride), | ||
| 204 | 2 | [s]"g" (1 << shift), | |
| 205 | #if ARCH_X86_32 | ||
| 206 | [shift2]"m" (shift2), | ||
| 207 | #else | ||
| 208 | 2 | [shift2]"g" (2*shift), | |
| 209 | #endif | ||
| 210 | [oxs]"g"(oxs), [oys]"g"(oys), [dxxs]"g"(dxxs), [dyxs]"g"(dyxs), | ||
| 211 | [dxys]"g"(dxys), [dyys]"g"(dyys), [r]"g"(r) NAMED_CONSTRAINTS_ADD(pw_0to7) | ||
| 212 | : XMM_CLOBBERS("xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",) | ||
| 213 | #if ARCH_X86_64 | ||
| 214 | XMM_CLOBBERS("xmm8", "xmm9", "xmm10", "xmm11", "xmm12",) | ||
| 215 | #endif | ||
| 216 | "memory"); | ||
| 217 | } | ||
| 218 | |||
| 219 | #endif /* HAVE_SSSE3_INLINE */ | ||
| 220 | |||
| 221 | 201 | av_cold void ff_mpeg4videodsp_init_x86(Mpeg4VideoDSPContext *c) | |
| 222 | { | ||
| 223 | #if HAVE_SSSE3_INLINE | ||
| 224 | 201 | int cpu_flags = av_get_cpu_flags(); | |
| 225 | |||
| 226 |
2/2✓ Branch 0 taken 42 times.
✓ Branch 1 taken 159 times.
|
201 | if (INLINE_SSSE3(cpu_flags)) |
| 227 | 42 | c->gmc = gmc_ssse3; | |
| 228 | #endif /* HAVE_SSSE3_INLINE */ | ||
| 229 | 201 | } | |
| 230 |