FFmpeg coverage


Directory: ../../../ffmpeg/
File: src/libavcodec/x86/snowdsp_init.c
Date: 2026-05-03 03:13:14
Exec Total Coverage
Lines: 54 59 91.5%
Functions: 3 3 100.0%
Branches: 17 18 94.4%

Line Branch Exec Source
1 /*
2 * ASM optimized Snow DSP utils
3 * Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
4 *
5 * This file is part of FFmpeg.
6 *
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
11 *
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 */
21
22 #include <stdint.h>
23 #include "config.h"
24 #include "libavutil/attributes.h"
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/snow_dwt.h"
29
30 void ff_snow_inner_add_yblock_ssse3(const uint8_t *obmc, const int obmc_stride,
31 uint8_t **block, int b_w, int b_h, int src_x,
32 int src_stride, IDWTELEM *const *lines,
33 int add, uint8_t *dst8);
34
35 #if HAVE_INLINE_ASM
36
37 248 static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
38 248 const int w2= (width+1)>>1;
39 248 const int w_l= (width>>1);
40 248 const int w_r= w2 - 1;
41 int i;
42
43 { // Lift 0
44 248 IDWTELEM * const ref = b + w2 - 1;
45
46 248 i = 1;
47 248 b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
48 248 __asm__ volatile(
49 "pcmpeqw %%mm7, %%mm7 \n\t"
50 "pcmpeqw %%mm3, %%mm3 \n\t"
51 "psllw $1, %%mm3 \n\t"
52 "paddw %%mm7, %%mm3 \n\t"
53 "psllw $13, %%mm3 \n\t"
54 ::);
55
2/2
✓ Branch 0 taken 784 times.
✓ Branch 1 taken 248 times.
1032 for(; i<w_l-7; i+=8){
56 784 __asm__ volatile(
57 "movq (%1), %%mm2 \n\t"
58 "movq 8(%1), %%mm6 \n\t"
59 "paddw 2(%1), %%mm2 \n\t"
60 "paddw 10(%1), %%mm6 \n\t"
61 "paddw %%mm7, %%mm2 \n\t"
62 "paddw %%mm7, %%mm6 \n\t"
63 "pmulhw %%mm3, %%mm2 \n\t"
64 "pmulhw %%mm3, %%mm6 \n\t"
65 "paddw (%0), %%mm2 \n\t"
66 "paddw 8(%0), %%mm6 \n\t"
67 "movq %%mm2, (%0) \n\t"
68 "movq %%mm6, 8(%0) \n\t"
69 784 :: "r"(&b[i]), "r"(&ref[i])
70 : "memory"
71 );
72 }
73 248 snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
74 }
75
76 { // Lift 1
77 248 IDWTELEM * const dst = b+w2;
78
79 248 i = 0;
80
2/2
✓ Branch 0 taken 784 times.
✓ Branch 1 taken 248 times.
1032 for(; i<w_r-7; i+=8){
81 784 __asm__ volatile(
82 "movq (%1), %%mm2 \n\t"
83 "movq 8(%1), %%mm6 \n\t"
84 "paddw 2(%1), %%mm2 \n\t"
85 "paddw 10(%1), %%mm6 \n\t"
86 "movq (%0), %%mm0 \n\t"
87 "movq 8(%0), %%mm4 \n\t"
88 "psubw %%mm2, %%mm0 \n\t"
89 "psubw %%mm6, %%mm4 \n\t"
90 "movq %%mm0, (%0) \n\t"
91 "movq %%mm4, 8(%0) \n\t"
92 784 :: "r"(&dst[i]), "r"(&b[i])
93 : "memory"
94 );
95 }
96 248 snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
97 }
98
99 { // Lift 2
100 248 IDWTELEM * const ref = b+w2 - 1;
101
102 248 i = 1;
103 248 b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
104 248 __asm__ volatile(
105 "psllw $15, %%mm7 \n\t"
106 "pcmpeqw %%mm6, %%mm6 \n\t"
107 "psrlw $13, %%mm6 \n\t"
108 "paddw %%mm7, %%mm6 \n\t"
109 ::);
110
2/2
✓ Branch 0 taken 784 times.
✓ Branch 1 taken 248 times.
1032 for(; i<w_l-7; i+=8){
111 784 __asm__ volatile(
112 "movq (%1), %%mm0 \n\t"
113 "movq 8(%1), %%mm4 \n\t"
114 "movq 2(%1), %%mm1 \n\t"
115 "movq 10(%1), %%mm5 \n\t"
116 "paddw %%mm6, %%mm0 \n\t"
117 "paddw %%mm6, %%mm4 \n\t"
118 "paddw %%mm7, %%mm1 \n\t"
119 "paddw %%mm7, %%mm5 \n\t"
120 "pavgw %%mm1, %%mm0 \n\t"
121 "pavgw %%mm5, %%mm4 \n\t"
122 "psubw %%mm7, %%mm0 \n\t"
123 "psubw %%mm7, %%mm4 \n\t"
124 "psraw $1, %%mm0 \n\t"
125 "psraw $1, %%mm4 \n\t"
126 "movq (%0), %%mm1 \n\t"
127 "movq 8(%0), %%mm5 \n\t"
128 "paddw %%mm1, %%mm0 \n\t"
129 "paddw %%mm5, %%mm4 \n\t"
130 "psraw $2, %%mm0 \n\t"
131 "psraw $2, %%mm4 \n\t"
132 "paddw %%mm1, %%mm0 \n\t"
133 "paddw %%mm5, %%mm4 \n\t"
134 "movq %%mm0, (%0) \n\t"
135 "movq %%mm4, 8(%0) \n\t"
136 784 :: "r"(&b[i]), "r"(&ref[i])
137 : "memory"
138 );
139 }
140 248 snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
141 }
142
143 { // Lift 3
144 248 IDWTELEM * const src = b+w2;
145 248 i = 0;
146
147
2/2
✓ Branch 0 taken 784 times.
✓ Branch 1 taken 248 times.
1032 for(; i<w_r-7; i+=8){
148 784 __asm__ volatile(
149 "movq 2(%1), %%mm2 \n\t"
150 "movq 10(%1), %%mm6 \n\t"
151 "paddw (%1), %%mm2 \n\t"
152 "paddw 8(%1), %%mm6 \n\t"
153 "movq (%0), %%mm0 \n\t"
154 "movq 8(%0), %%mm4 \n\t"
155 "paddw %%mm2, %%mm0 \n\t"
156 "paddw %%mm6, %%mm4 \n\t"
157 "psraw $1, %%mm2 \n\t"
158 "psraw $1, %%mm6 \n\t"
159 "paddw %%mm0, %%mm2 \n\t"
160 "paddw %%mm4, %%mm6 \n\t"
161 "movq %%mm2, (%2) \n\t"
162 "movq %%mm6, 8(%2) \n\t"
163 784 :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
164 : "memory"
165 );
166 }
167 248 snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
168 }
169
170 {
171 248 snow_interleave_line_header(&i, width, b, temp);
172
173
2/2
✓ Branch 0 taken 248 times.
✓ Branch 1 taken 248 times.
496 for (; (i & 0x1E) != 0x1E; i-=2){
174 248 b[i+1] = temp[i>>1];
175 248 b[i] = b[i>>1];
176 }
177
2/2
✓ Branch 0 taken 496 times.
✓ Branch 1 taken 248 times.
744 for (i-=30; i>=0; i-=32){
178 496 __asm__ volatile(
179 "movq (%1), %%mm0 \n\t"
180 "movq 8(%1), %%mm2 \n\t"
181 "movq 16(%1), %%mm4 \n\t"
182 "movq 24(%1), %%mm6 \n\t"
183 "movq (%1), %%mm1 \n\t"
184 "movq 8(%1), %%mm3 \n\t"
185 "movq 16(%1), %%mm5 \n\t"
186 "movq 24(%1), %%mm7 \n\t"
187 "punpcklwd (%2), %%mm0 \n\t"
188 "punpcklwd 8(%2), %%mm2 \n\t"
189 "punpcklwd 16(%2), %%mm4 \n\t"
190 "punpcklwd 24(%2), %%mm6 \n\t"
191 "movq %%mm0, (%0) \n\t"
192 "movq %%mm2, 16(%0) \n\t"
193 "movq %%mm4, 32(%0) \n\t"
194 "movq %%mm6, 48(%0) \n\t"
195 "punpckhwd (%2), %%mm1 \n\t"
196 "punpckhwd 8(%2), %%mm3 \n\t"
197 "punpckhwd 16(%2), %%mm5 \n\t"
198 "punpckhwd 24(%2), %%mm7 \n\t"
199 "movq %%mm1, 8(%0) \n\t"
200 "movq %%mm3, 24(%0) \n\t"
201 "movq %%mm5, 40(%0) \n\t"
202 "movq %%mm7, 56(%0) \n\t"
203 496 :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
204 : "memory"
205 );
206 }
207 }
208 248 }
209
210 #if HAVE_7REGS
211 #define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
212 "psubw %%"s0", %%"t0" \n\t"\
213 "psubw %%"s1", %%"t1" \n\t"\
214 "psubw %%"s2", %%"t2" \n\t"\
215 "psubw %%"s3", %%"t3" \n\t"
216
217 #define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
218 "psraw $"n", %%"t0" \n\t"\
219 "psraw $"n", %%"t1" \n\t"\
220 "psraw $"n", %%"t2" \n\t"\
221 "psraw $"n", %%"t3" \n\t"
222
223 #define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
224 "paddw %%"s0", %%"t0" \n\t"\
225 "paddw %%"s1", %%"t1" \n\t"\
226 "paddw %%"s2", %%"t2" \n\t"\
227 "paddw %%"s3", %%"t3" \n\t"
228
229 #define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
230 "pmulhw %%"s0", %%"t0" \n\t"\
231 "pmulhw %%"s1", %%"t1" \n\t"\
232 "pmulhw %%"s2", %%"t2" \n\t"\
233 "pmulhw %%"s3", %%"t3" \n\t"
234
235 #define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
236 ""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\
237 ""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\
238 ""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\
239 ""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t"
240
241 #define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
242 snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
243
244 #define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
245 snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
246
247 #define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
248 "movq %%"s0", ("w",%%"FF_REG_d") \n\t"\
249 "movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\
250 "movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\
251 "movq %%"s3", 24("w",%%"FF_REG_d") \n\t"
252
253 #define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
254 "movq %%"s0", %%"t0" \n\t"\
255 "movq %%"s1", %%"t1" \n\t"\
256 "movq %%"s2", %%"t2" \n\t"\
257 "movq %%"s3", %%"t3" \n\t"
258
259
260 96 static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
261 96 x86_reg i = width;
262
1/2
✗ Branch 0 not taken.
✓ Branch 1 taken 96 times.
96 while(i & 15)
263 {
264 i--;
265 b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
266 b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
267 b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
268 b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
269 }
270 96 i+=i;
271 96 __asm__ volatile(
272 "jmp 2f \n\t"
273 "1: \n\t"
274
275 snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
276 snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
277 "pcmpeqw %%mm0, %%mm0 \n\t"
278 "pcmpeqw %%mm2, %%mm2 \n\t"
279 "paddw %%mm2, %%mm2 \n\t"
280 "paddw %%mm0, %%mm2 \n\t"
281 "psllw $13, %%mm2 \n\t"
282 snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
283 snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
284 snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
285 snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
286 snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
287 snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
288 snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
289 snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
290 "pcmpeqw %%mm7, %%mm7 \n\t"
291 "pcmpeqw %%mm5, %%mm5 \n\t"
292 "psllw $15, %%mm7 \n\t"
293 "psrlw $13, %%mm5 \n\t"
294 "paddw %%mm7, %%mm5 \n\t"
295 snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
296 "movq (%2,%%"FF_REG_d"), %%mm1 \n\t"
297 "movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t"
298 "paddw %%mm7, %%mm1 \n\t"
299 "paddw %%mm7, %%mm3 \n\t"
300 "pavgw %%mm1, %%mm0 \n\t"
301 "pavgw %%mm3, %%mm2 \n\t"
302 "movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t"
303 "movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t"
304 "paddw %%mm7, %%mm1 \n\t"
305 "paddw %%mm7, %%mm3 \n\t"
306 "pavgw %%mm1, %%mm4 \n\t"
307 "pavgw %%mm3, %%mm6 \n\t"
308 snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
309 snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
310 snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
311
312 snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
313 snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
314 snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
315 snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
316 snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
317 snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
318 snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
319 snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
320 snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
321
322 "2: \n\t"
323 "sub $32, %%"FF_REG_d" \n\t"
324 "jge 1b \n\t"
325 :"+d"(i)
326 :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
327 96 }
328 #endif //HAVE_7REGS
329
330 #endif /* HAVE_INLINE_ASM */
331
332 45 av_cold void ff_dwt_init_x86(SnowDWTContext *c)
333 {
334 45 int cpuflags = av_get_cpu_flags();
335
336 #if HAVE_INLINE_ASM
337
2/2
✓ Branch 0 taken 14 times.
✓ Branch 1 taken 31 times.
45 if (INLINE_MMXEXT(cpuflags)) {
338 14 c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
339 #if HAVE_7REGS
340 14 c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
341 #endif
342 }
343 #endif /* HAVE_INLINE_ASM */
344 #if HAVE_SSSE3_EXTERNAL
345
2/2
✓ Branch 0 taken 10 times.
✓ Branch 1 taken 35 times.
45 if (EXTERNAL_SSSE3(cpuflags)) {
346 10 c->inner_add_yblock = ff_snow_inner_add_yblock_ssse3;
347 }
348 #endif
349 45 }
350