Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with FFmpeg; if not, write to the Free Software | ||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | ||
19 | */ | ||
20 | |||
21 | /** | ||
22 | * @file | ||
23 | * mmx/mmx2/sse2 postprocess code. | ||
24 | */ | ||
25 | #include "config.h" | ||
26 | |||
27 | #include "libavutil/mem_internal.h" | ||
28 | #if ARCH_X86 | ||
29 | #include "libavutil/x86/asm.h" | ||
30 | #endif | ||
31 | |||
32 | /* Exactly one TEMPLATE_PP_* should be defined (to 1) when this template is | ||
33 | * included. The following macros define its dependencies to 1 as well | ||
34 | * (like MMX2 depending on MMX), and define all the others to 0. Every | ||
35 | * TEMPLATE_PP_* needs to be #undef'ed at the end. */ | ||
36 | |||
37 | #ifdef TEMPLATE_PP_C | ||
38 | # define RENAME(a) a ## _C /* RENAME() appends the per-variant suffix to a function name */ | ||
39 | #else | ||
40 | # define TEMPLATE_PP_C 0 | ||
41 | #endif | ||
42 | |||
43 | #ifdef TEMPLATE_PP_ALTIVEC | ||
44 | # define RENAME(a) a ## _altivec | ||
45 | #else | ||
46 | # define TEMPLATE_PP_ALTIVEC 0 | ||
47 | #endif | ||
48 | |||
49 | #ifdef TEMPLATE_PP_MMX | ||
50 | # define RENAME(a) a ## _MMX | ||
51 | #else | ||
52 | # define TEMPLATE_PP_MMX 0 | ||
53 | #endif | ||
54 | |||
55 | #ifdef TEMPLATE_PP_MMXEXT | ||
56 | # undef TEMPLATE_PP_MMX | ||
57 | # define TEMPLATE_PP_MMX 1 /* MMXEXT implies MMX; replaces the 0 set just above */ | ||
58 | # define RENAME(a) a ## _MMX2 | ||
59 | #else | ||
60 | # define TEMPLATE_PP_MMXEXT 0 | ||
61 | #endif | ||
62 | |||
63 | #ifdef TEMPLATE_PP_SSE2 | ||
64 | # undef TEMPLATE_PP_MMX | ||
65 | # define TEMPLATE_PP_MMX 1 /* SSE2 implies both MMX and MMXEXT */ | ||
66 | # undef TEMPLATE_PP_MMXEXT | ||
67 | # define TEMPLATE_PP_MMXEXT 1 | ||
68 | # define RENAME(a) a ## _SSE2 | ||
69 | #else | ||
70 | # define TEMPLATE_PP_SSE2 0 | ||
71 | #endif | ||
72 | |||
73 | #undef REAL_PAVGB /* drop definitions left over from a previous inclusion of this template */ | ||
74 | #undef PAVGB | ||
75 | #undef PMINUB | ||
76 | #undef PMAXUB | ||
77 | |||
78 | #if TEMPLATE_PP_MMXEXT | ||
79 | #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | ||
80 | #endif | ||
81 | #define PAVGB(a,b) REAL_PAVGB(a,b) /* extra macro level: arguments are macro-expanded before REAL_PAVGB stringifies them (presumably needed for operands written as FF_REGa / FF_REGc) */ | ||
82 | |||
83 | #if TEMPLATE_PP_MMXEXT | ||
84 | #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" /* the third argument t is not used by this expansion */ | ||
85 | #endif | ||
86 | |||
87 | #if TEMPLATE_PP_MMXEXT | ||
88 | #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | ||
89 | #endif | ||
90 | |||
91 | //FIXME? |255-0| = 1 (should not be a problem ...) | ||
92 | #if TEMPLATE_PP_MMXEXT | ||
93 | /** | ||
94 | * Check if the middle 8x8 block in the given 8x16 block is flat. numEq counts, over the 8 byte columns and the 7 adjacent row pairs of the 8 rows starting at src+4*stride, the bytes for which (row difference + mmxDcOffset) > mmxDcThreshold in a signed byte compare; dcOk is non-zero when max-min of at least one column exceeds 2*pQPb. | ||
95 | * @return 2 if numEq <= c->ppMode.flatnessThreshold, otherwise 0 if dcOk is non-zero and 1 if it is zero. */ | ||
96 | ✗ | static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){ | |
97 | ✗ | int numEq= 0, dcOk; | |
98 | ✗ | src+= stride*4; // src points to begin of the 8x8 Block | |
99 | ✗ | __asm__ volatile( | |
100 | "movq %0, %%mm7 \n\t" /* mm7 = c->mmxDcOffset[c->nonBQP] */ | ||
101 | "movq %1, %%mm6 \n\t" /* mm6 = c->mmxDcThreshold[c->nonBQP]; the next asm statement relies on mm6/mm7 still holding these values */ | ||
102 | ✗ | : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) | |
103 | ); | ||
104 | |||
105 | ✗ | __asm__ volatile( | |
106 | "lea (%2, %3), %%"FF_REG_a" \n\t" | ||
107 | // 0 1 2 3 4 5 6 7 8 9 | ||
108 | // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 | ||
109 | |||
110 | "movq (%2), %%mm0 \n\t" | ||
111 | "movq (%%"FF_REG_a"), %%mm1 \n\t" | ||
112 | "movq %%mm0, %%mm3 \n\t" /* mm3 accumulates the per-column minimum */ | ||
113 | "movq %%mm0, %%mm4 \n\t" /* mm4 accumulates the per-column maximum */ | ||
114 | PMAXUB(%%mm1, %%mm4) | ||
115 | PMINUB(%%mm1, %%mm3, %%mm5) | ||
116 | "psubb %%mm1, %%mm0 \n\t" // mm0 = difference | ||
117 | "paddb %%mm7, %%mm0 \n\t" | ||
118 | "pcmpgtb %%mm6, %%mm0 \n\t" /* -1 in every byte where difference + offset > threshold; mm0 then accumulates these for all row pairs */ | ||
119 | |||
120 | "movq (%%"FF_REG_a",%3), %%mm2 \n\t" | ||
121 | PMAXUB(%%mm2, %%mm4) | ||
122 | PMINUB(%%mm2, %%mm3, %%mm5) | ||
123 | "psubb %%mm2, %%mm1 \n\t" | ||
124 | "paddb %%mm7, %%mm1 \n\t" | ||
125 | "pcmpgtb %%mm6, %%mm1 \n\t" | ||
126 | "paddb %%mm1, %%mm0 \n\t" | ||
127 | |||
128 | "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t" | ||
129 | PMAXUB(%%mm1, %%mm4) | ||
130 | PMINUB(%%mm1, %%mm3, %%mm5) | ||
131 | "psubb %%mm1, %%mm2 \n\t" | ||
132 | "paddb %%mm7, %%mm2 \n\t" | ||
133 | "pcmpgtb %%mm6, %%mm2 \n\t" | ||
134 | "paddb %%mm2, %%mm0 \n\t" | ||
135 | |||
136 | "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t" | ||
137 | |||
138 | "movq (%2, %3, 4), %%mm2 \n\t" | ||
139 | PMAXUB(%%mm2, %%mm4) | ||
140 | PMINUB(%%mm2, %%mm3, %%mm5) | ||
141 | "psubb %%mm2, %%mm1 \n\t" | ||
142 | "paddb %%mm7, %%mm1 \n\t" | ||
143 | "pcmpgtb %%mm6, %%mm1 \n\t" | ||
144 | "paddb %%mm1, %%mm0 \n\t" | ||
145 | |||
146 | "movq (%%"FF_REG_a"), %%mm1 \n\t" | ||
147 | PMAXUB(%%mm1, %%mm4) | ||
148 | PMINUB(%%mm1, %%mm3, %%mm5) | ||
149 | "psubb %%mm1, %%mm2 \n\t" | ||
150 | "paddb %%mm7, %%mm2 \n\t" | ||
151 | "pcmpgtb %%mm6, %%mm2 \n\t" | ||
152 | "paddb %%mm2, %%mm0 \n\t" | ||
153 | |||
154 | "movq (%%"FF_REG_a", %3), %%mm2 \n\t" | ||
155 | PMAXUB(%%mm2, %%mm4) | ||
156 | PMINUB(%%mm2, %%mm3, %%mm5) | ||
157 | "psubb %%mm2, %%mm1 \n\t" | ||
158 | "paddb %%mm7, %%mm1 \n\t" | ||
159 | "pcmpgtb %%mm6, %%mm1 \n\t" | ||
160 | "paddb %%mm1, %%mm0 \n\t" | ||
161 | |||
162 | "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t" | ||
163 | PMAXUB(%%mm1, %%mm4) | ||
164 | PMINUB(%%mm1, %%mm3, %%mm5) | ||
165 | "psubb %%mm1, %%mm2 \n\t" | ||
166 | "paddb %%mm7, %%mm2 \n\t" | ||
167 | "pcmpgtb %%mm6, %%mm2 \n\t" | ||
168 | "paddb %%mm2, %%mm0 \n\t" | ||
169 | "psubusb %%mm3, %%mm4 \n\t" /* mm4 = max - min per column */ | ||
170 | |||
171 | " \n\t" | ||
172 | "pxor %%mm7, %%mm7 \n\t" | ||
173 | "psadbw %%mm7, %%mm0 \n\t" /* horizontal sum of the 8 accumulated bytes */ | ||
174 | "movq %4, %%mm7 \n\t" // QP,..., QP | ||
175 | "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | ||
176 | "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 | ||
177 | "packssdw %%mm4, %%mm4 \n\t" /* squeeze both dwords into the low dword so the following movd sees all 8 columns */ | ||
178 | "movd %%mm0, %0 \n\t" | ||
179 | "movd %%mm4, %1 \n\t" | ||
180 | |||
181 | : "=r" (numEq), "=r" (dcOk) | ||
182 | ✗ | : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) | |
183 | : "%"FF_REG_a | ||
184 | ); | ||
185 | |||
186 | ✗ | numEq= (-numEq) &0xFF; /* every passing byte contributed -1, so negating the byte sum (mod 256) yields the count */ | |
187 | ✗ | if(numEq > c->ppMode.flatnessThreshold){ | |
188 | ✗ | if(dcOk) return 0; | |
189 | ✗ | else return 1; | |
190 | }else{ | ||
191 | ✗ | return 2; | |
192 | } | ||
193 | } | ||
194 | #endif //TEMPLATE_PP_MMXEXT | ||
195 | |||
196 | /** | ||
197 | * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) | ||
198 | * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16. Taps that would reach beyond the window use a substitute: relative to src+3*stride, row 0 (resp. row 9) is used when it is close enough to row 1 (resp. row 8), otherwise row 1 (resp. row 8) itself. The C path tests the difference with "< c->QP", the MMXEXT path with "<= c->pQPb". | ||
199 | */ | ||
200 | #if !TEMPLATE_PP_ALTIVEC | ||
201 | 170 | static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) | |
202 | { | ||
203 | #if TEMPLATE_PP_MMXEXT | ||
204 | ✗ | src+= stride*3; | |
205 | ✗ | __asm__ volatile( //"movv %0 %1 %2\n\t" | |
206 | "movq %2, %%mm0 \n\t" // QP,..., QP | ||
207 | "pxor %%mm4, %%mm4 \n\t" | ||
208 | |||
209 | "movq (%0), %%mm6 \n\t" | ||
210 | "movq (%0, %1), %%mm5 \n\t" | ||
211 | "movq %%mm5, %%mm1 \n\t" | ||
212 | "movq %%mm6, %%mm2 \n\t" | ||
213 | "psubusb %%mm6, %%mm5 \n\t" | ||
214 | "psubusb %%mm1, %%mm2 \n\t" | ||
215 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | ||
216 | "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | ||
217 | "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF | ||
218 | |||
219 | "pand %%mm2, %%mm6 \n\t" | ||
220 | "pandn %%mm1, %%mm2 \n\t" | ||
221 | "por %%mm2, %%mm6 \n\t"// First Line to Filter | ||
222 | |||
223 | "movq (%0, %1, 8), %%mm5 \n\t" | ||
224 | "lea (%0, %1, 4), %%"FF_REG_a" \n\t" | ||
225 | "lea (%0, %1, 8), %%"FF_REG_c" \n\t" | ||
226 | "sub %1, %%"FF_REG_c" \n\t" | ||
227 | "add %1, %0 \n\t" // %0 points to line 1 not 0 | ||
228 | "movq (%0, %1, 8), %%mm7 \n\t" | ||
229 | "movq %%mm5, %%mm1 \n\t" | ||
230 | "movq %%mm7, %%mm2 \n\t" | ||
231 | "psubusb %%mm7, %%mm5 \n\t" | ||
232 | "psubusb %%mm1, %%mm2 \n\t" | ||
233 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | ||
234 | "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | ||
235 | "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF | ||
236 | |||
237 | "pand %%mm2, %%mm7 \n\t" | ||
238 | "pandn %%mm1, %%mm2 \n\t" | ||
239 | "por %%mm2, %%mm7 \n\t" // Last Line to Filter (line 9 if close to line 8, else line 8) | ||
240 | |||
241 | |||
242 | // 1 2 3 4 5 6 7 8 | ||
243 | // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 | ||
244 | // 6 4 2 2 1 1 | ||
245 | // 6 4 4 2 | ||
246 | // 6 8 2 | ||
247 | |||
248 | "movq (%0, %1), %%mm0 \n\t" // 1 | ||
249 | "movq %%mm0, %%mm1 \n\t" // 1 | ||
250 | PAVGB(%%mm6, %%mm0) //1 1 /2 | ||
251 | PAVGB(%%mm6, %%mm0) //3 1 /4 | ||
252 | |||
253 | "movq (%0, %1, 4), %%mm2 \n\t" // 1 | ||
254 | "movq %%mm2, %%mm5 \n\t" // 1 | ||
255 | PAVGB((%%FF_REGa), %%mm2) // 11 /2 | ||
256 | PAVGB((%0, %1, 2), %%mm2) // 211 /4 | ||
257 | "movq %%mm2, %%mm3 \n\t" // 211 /4 | ||
258 | "movq (%0), %%mm4 \n\t" // 1 | ||
259 | PAVGB(%%mm4, %%mm3) // 4 211 /8 | ||
260 | PAVGB(%%mm0, %%mm3) //642211 /16 | ||
261 | "movq %%mm3, (%0) \n\t" // X | ||
262 | // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 | ||
263 | "movq %%mm1, %%mm0 \n\t" // 1 | ||
264 | PAVGB(%%mm6, %%mm0) //1 1 /2 | ||
265 | "movq %%mm4, %%mm3 \n\t" // 1 | ||
266 | PAVGB((%0,%1,2), %%mm3) // 1 1 /2 | ||
267 | PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2 | ||
268 | PAVGB((%%FF_REGa), %%mm5) // 211 /4 | ||
269 | PAVGB(%%mm5, %%mm3) // 2 2211 /8 | ||
270 | PAVGB(%%mm0, %%mm3) //4242211 /16 | ||
271 | "movq %%mm3, (%0,%1) \n\t" // X | ||
272 | // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 | ||
273 | PAVGB(%%mm4, %%mm6) //11 /2 | ||
274 | "movq (%%"FF_REG_c"), %%mm0 \n\t" // 1 | ||
275 | PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2 | ||
276 | "movq %%mm0, %%mm3 \n\t" // 11/2 | ||
277 | PAVGB(%%mm1, %%mm0) // 2 11/4 | ||
278 | PAVGB(%%mm6, %%mm0) //222 11/8 | ||
279 | PAVGB(%%mm2, %%mm0) //22242211/16 | ||
280 | "movq (%0, %1, 2), %%mm2 \n\t" // 1 | ||
281 | "movq %%mm0, (%0, %1, 2) \n\t" // X | ||
282 | // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 | ||
283 | "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1 | ||
284 | PAVGB((%%FF_REGc), %%mm0) // 11 /2 | ||
285 | PAVGB(%%mm0, %%mm6) //11 11 /4 | ||
286 | PAVGB(%%mm1, %%mm4) // 11 /2 | ||
287 | PAVGB(%%mm2, %%mm1) // 11 /2 | ||
288 | PAVGB(%%mm1, %%mm6) //1122 11 /8 | ||
289 | PAVGB(%%mm5, %%mm6) //112242211 /16 | ||
290 | "movq (%%"FF_REG_a"), %%mm5 \n\t" // 1 | ||
291 | "movq %%mm6, (%%"FF_REG_a") \n\t" // X | ||
292 | // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 | ||
293 | "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1 | ||
294 | PAVGB(%%mm7, %%mm6) // 11 /2 | ||
295 | PAVGB(%%mm4, %%mm6) // 11 11 /4 | ||
296 | PAVGB(%%mm3, %%mm6) // 11 2211 /8 | ||
297 | PAVGB(%%mm5, %%mm2) // 11 /2 | ||
298 | "movq (%0, %1, 4), %%mm4 \n\t" // 1 | ||
299 | PAVGB(%%mm4, %%mm2) // 112 /4 | ||
300 | PAVGB(%%mm2, %%mm6) // 112242211 /16 | ||
301 | "movq %%mm6, (%0, %1, 4) \n\t" // X | ||
302 | // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 | ||
303 | PAVGB(%%mm7, %%mm1) // 11 2 /4 | ||
304 | PAVGB(%%mm4, %%mm5) // 11 /2 | ||
305 | PAVGB(%%mm5, %%mm0) // 11 11 /4 | ||
306 | "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1 | ||
307 | PAVGB(%%mm6, %%mm1) // 11 4 2 /8 | ||
308 | PAVGB(%%mm0, %%mm1) // 11224222 /16 | ||
309 | "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X | ||
310 | // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | ||
311 | PAVGB((%%FF_REGc), %%mm2) // 112 4 /8 | ||
312 | "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1 | ||
313 | PAVGB(%%mm0, %%mm6) // 1 1 /2 | ||
314 | PAVGB(%%mm7, %%mm6) // 1 12 /4 | ||
315 | PAVGB(%%mm2, %%mm6) // 1122424 /16 | ||
316 | "movq %%mm6, (%%"FF_REG_c") \n\t" // X | ||
317 | // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | ||
318 | PAVGB(%%mm7, %%mm5) // 11 2 /4 | ||
319 | PAVGB(%%mm7, %%mm5) // 11 6 /8 | ||
320 | |||
321 | PAVGB(%%mm3, %%mm0) // 112 /4 | ||
322 | PAVGB(%%mm0, %%mm5) // 112246 /16 | ||
323 | "movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X | ||
324 | "sub %1, %0 \n\t" /* undo the earlier "add %1, %0": %0 is declared only as an input operand below */ | ||
325 | |||
326 | : | ||
327 | ✗ | : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) | |
328 | : "%"FF_REG_a, "%"FF_REG_c | ||
329 | ); | ||
330 | #else //TEMPLATE_PP_MMXEXT | ||
331 | 170 | const int l1= stride; /* l1..l9 are byte offsets of rows 1..9 relative to src */ | |
332 | 170 | const int l2= stride + l1; | |
333 | 170 | const int l3= stride + l2; | |
334 | 170 | const int l4= stride + l3; | |
335 | 170 | const int l5= stride + l4; | |
336 | 170 | const int l6= stride + l5; | |
337 | 170 | const int l7= stride + l6; | |
338 | 170 | const int l8= stride + l7; | |
339 | 170 | const int l9= stride + l8; | |
340 | int x; | ||
341 | 170 | src+= stride*3; | |
342 |
2/2✓ Branch 0 taken 680 times.
✓ Branch 1 taken 85 times.
|
1530 | for(x=0; x<BLOCK_SIZE; x++){ |
343 |
2/2✓ Branch 0 taken 617 times.
✓ Branch 1 taken 63 times.
|
1360 | const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; /* stand-in for the rows above the window */ |
344 |
2/2✓ Branch 0 taken 587 times.
✓ Branch 1 taken 93 times.
|
1360 | const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; /* stand-in for the rows below the window */ |
345 | |||
346 | int sums[10]; /* sliding sums: each entry drops one term and adds the next row; the +4 in sums[0] carries through, so each output sum has +8 before the >>4 */ | ||
347 | 1360 | sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; | |
348 | 1360 | sums[1] = sums[0] - first + src[l4]; | |
349 | 1360 | sums[2] = sums[1] - first + src[l5]; | |
350 | 1360 | sums[3] = sums[2] - first + src[l6]; | |
351 | 1360 | sums[4] = sums[3] - first + src[l7]; | |
352 | 1360 | sums[5] = sums[4] - src[l1] + src[l8]; | |
353 | 1360 | sums[6] = sums[5] - src[l2] + last; | |
354 | 1360 | sums[7] = sums[6] - src[l3] + last; | |
355 | 1360 | sums[8] = sums[7] - src[l4] + last; | |
356 | 1360 | sums[9] = sums[8] - src[l5] + last; | |
357 | |||
358 | 1360 | src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; /* all sums were taken from the unmodified rows above, so writing in place is safe */ | |
359 | 1360 | src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; | |
360 | 1360 | src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; | |
361 | 1360 | src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; | |
362 | 1360 | src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; | |
363 | 1360 | src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; | |
364 | 1360 | src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; | |
365 | 1360 | src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; | |
366 | |||
367 | 1360 | src++; | |
368 | } | ||
369 | #endif //TEMPLATE_PP_MMXEXT | ||
370 | 170 | } | |
371 | #endif //TEMPLATE_PP_ALTIVEC | ||
372 | |||
373 | /** | ||
374 | * Experimental Filter 1 | ||
375 | * will not damage linear gradients | ||
376 | * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | ||
377 | * can only smooth blocks at the expected locations (it cannot smooth them if they did move) | ||
378 | * The MMX2 version does correct clipping, the C version does not. Per column (C path, rows relative to src+3*stride): d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2); when d < 2*QP, v = d * FFSIGN(-(l4-l5)) and rows l2, l3, l4 get v>>3, v>>2, (3*v)>>3 added while rows l7, l6, l5 get the same amounts subtracted. | ||
379 | */ | ||
380 | 22880 | static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) | |
381 | { | ||
382 | #if TEMPLATE_PP_MMXEXT | ||
383 | ✗ | src+= stride*3; | |
384 | |||
385 | ✗ | __asm__ volatile( | |
386 | "pxor %%mm7, %%mm7 \n\t" // 0 | ||
387 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
388 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t" | ||
389 | // 0 1 2 3 4 5 6 7 8 9 | ||
390 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 | ||
391 | "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3 | ||
392 | "movq (%0, %1, 4), %%mm1 \n\t" // line 4 | ||
393 | "movq %%mm1, %%mm2 \n\t" // line 4 | ||
394 | "psubusb %%mm0, %%mm1 \n\t" | ||
395 | "psubusb %%mm2, %%mm0 \n\t" | ||
396 | "por %%mm1, %%mm0 \n\t" // |l3 - l4| | ||
397 | "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5 | ||
398 | "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6 | ||
399 | "movq %%mm3, %%mm5 \n\t" // line 5 | ||
400 | "psubusb %%mm4, %%mm3 \n\t" | ||
401 | "psubusb %%mm5, %%mm4 \n\t" | ||
402 | "por %%mm4, %%mm3 \n\t" // |l5 - l6| | ||
403 | PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2 | ||
404 | "movq %%mm2, %%mm1 \n\t" // line 4 | ||
405 | "psubusb %%mm5, %%mm2 \n\t" | ||
406 | "movq %%mm2, %%mm4 \n\t" | ||
407 | "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 | ||
408 | "psubusb %%mm1, %%mm5 \n\t" | ||
409 | "por %%mm5, %%mm4 \n\t" // |l4 - l5| | ||
410 | "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2) | ||
411 | "movq %%mm4, %%mm3 \n\t" // d | ||
412 | "movq %2, %%mm0 \n\t" /* QP (co->pQPb) */ | ||
413 | "paddusb %%mm0, %%mm0 \n\t" /* 2QP, saturating */ | ||
414 | "psubusb %%mm0, %%mm4 \n\t" | ||
415 | "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0 | ||
416 | "psubusb "MANGLE(b01)", %%mm3 \n\t" | ||
417 | "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? MAX(d-1, 0) : 0 (assuming b01 holds 0x01 bytes, as its name suggests) | ||
418 | |||
419 | PAVGB(%%mm7, %%mm3) // d/2 | ||
420 | "movq %%mm3, %%mm1 \n\t" // d/2 | ||
421 | PAVGB(%%mm7, %%mm3) // d/4 | ||
422 | PAVGB(%%mm1, %%mm3) // 3*d/8 | ||
423 | |||
424 | "movq (%0, %1, 4), %%mm0 \n\t" // line 4 | ||
425 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | ||
426 | "psubusb %%mm3, %%mm0 \n\t" | ||
427 | "pxor %%mm2, %%mm0 \n\t" | ||
428 | "movq %%mm0, (%0, %1, 4) \n\t" // line 4 | ||
429 | |||
430 | "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5 | ||
431 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | ||
432 | "paddusb %%mm3, %%mm0 \n\t" | ||
433 | "pxor %%mm2, %%mm0 \n\t" | ||
434 | "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5 | ||
435 | |||
436 | PAVGB(%%mm7, %%mm1) // d/4 | ||
437 | |||
438 | "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3 | ||
439 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3 | ||
440 | "psubusb %%mm1, %%mm0 \n\t" | ||
441 | "pxor %%mm2, %%mm0 \n\t" | ||
442 | "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3 | ||
443 | |||
444 | "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6 | ||
445 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6 | ||
446 | "paddusb %%mm1, %%mm0 \n\t" | ||
447 | "pxor %%mm2, %%mm0 \n\t" | ||
448 | "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6 | ||
449 | |||
450 | PAVGB(%%mm7, %%mm1) // d/8 | ||
451 | |||
452 | "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2 | ||
453 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 | ||
454 | "psubusb %%mm1, %%mm0 \n\t" | ||
455 | "pxor %%mm2, %%mm0 \n\t" | ||
456 | "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2 | ||
457 | |||
458 | "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7 | ||
459 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 | ||
460 | "paddusb %%mm1, %%mm0 \n\t" | ||
461 | "pxor %%mm2, %%mm0 \n\t" | ||
462 | "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7 | ||
463 | |||
464 | : | ||
465 | ✗ | : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) | |
466 | NAMED_CONSTRAINTS_ADD(b01) | ||
467 | : "%"FF_REG_a, "%"FF_REG_c | ||
468 | ); | ||
469 | #else //TEMPLATE_PP_MMXEXT | ||
470 | |||
471 | 22880 | const int l1= stride; /* l1..l7 are byte offsets of rows 1..7 relative to src */ | |
472 | 22880 | const int l2= stride + l1; | |
473 | 22880 | const int l3= stride + l2; | |
474 | 22880 | const int l4= stride + l3; | |
475 | 22880 | const int l5= stride + l4; | |
476 | 22880 | const int l6= stride + l5; | |
477 | 22880 | const int l7= stride + l6; | |
478 | // const int l8= stride + l7; | ||
479 | // const int l9= stride + l8; | ||
480 | int x; | ||
481 | |||
482 | 22880 | src+= stride*3; | |
483 |
2/2✓ Branch 0 taken 91520 times.
✓ Branch 1 taken 11440 times.
|
205920 | for(x=0; x<BLOCK_SIZE; x++){ |
484 | 183040 | int a= src[l3] - src[l4]; | |
485 | 183040 | int b= src[l4] - src[l5]; /* step across the l4/l5 boundary */ | |
486 | 183040 | int c= src[l5] - src[l6]; | |
487 | |||
488 | 183040 | int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1); /* how much the boundary step exceeds the mean of the neighbouring steps */ | |
489 | 183040 | d= FFMAX(d, 0); | |
490 | |||
491 |
2/2✓ Branch 0 taken 89036 times.
✓ Branch 1 taken 2484 times.
|
183040 | if(d < co->QP*2){ |
492 |
2/2✓ Branch 0 taken 54624 times.
✓ Branch 1 taken 34412 times.
|
178072 | int v = d * FFSIGN(-b); |
493 | |||
494 | 178072 | src[l2] +=v>>3; /* results are stored back into uint8_t without clamping (see the clipping note in the function comment) */ | |
495 | 178072 | src[l3] +=v>>2; | |
496 | 178072 | src[l4] +=(3*v)>>3; | |
497 | 178072 | src[l5] -=(3*v)>>3; | |
498 | 178072 | src[l6] -=v>>2; | |
499 | 178072 | src[l7] -=v>>3; | |
500 | } | ||
501 | 183040 | src++; | |
502 | } | ||
503 | #endif //TEMPLATE_PP_MMXEXT | ||
504 | 22880 | } | |
505 | |||
506 | #if !TEMPLATE_PP_ALTIVEC | ||
507 | 67936 | static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) | |
508 | { /* Vertical filter across the l4/l5 row boundary. C path, per column, rows relative to src+3*stride: middleEnergy = 5*(l5-l4) + 2*(l3-l6); when |middleEnergy| < 8*QP, a correction d derived from |middleEnergy| minus the smaller of |leftEnergy| and |rightEnergy| is limited to lie between 0 and q = (l4-l5)/2, then subtracted from l4 and added to l5. The MMXEXT path starts at src+4*stride with its own row numbering; judging by the "~" in its comments it computes an approximation of the same quantities with PAVGB. */ | ||
509 | #if TEMPLATE_PP_MMXEXT | ||
510 | /* | ||
511 | uint8_t tmp[16]; | ||
512 | const int l1= stride; | ||
513 | const int l2= stride + l1; | ||
514 | const int l3= stride + l2; | ||
515 | const int l4= (int)tmp - (int)src - stride*3; | ||
516 | const int l5= (int)tmp - (int)src - stride*3 + 8; | ||
517 | const int l6= stride*3 + l3; | ||
518 | const int l7= stride + l6; | ||
519 | const int l8= stride + l7; | ||
520 | |||
521 | memcpy(tmp, src+stride*7, 8); | ||
522 | memcpy(tmp+8, src+stride*8, 8); | ||
523 | */ | ||
524 | ✗ | src+= stride*4; | |
525 | ✗ | __asm__ volatile( | |
526 | |||
527 | #if 0 //slightly more accurate and slightly slower | ||
528 | "pxor %%mm7, %%mm7 \n\t" // 0 | ||
529 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
530 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t" | ||
531 | // 0 1 2 3 4 5 6 7 | ||
532 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 | ||
533 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | ||
534 | |||
535 | |||
536 | "movq (%0, %1, 2), %%mm0 \n\t" // l2 | ||
537 | "movq (%0), %%mm1 \n\t" // l0 | ||
538 | "movq %%mm0, %%mm2 \n\t" // l2 | ||
539 | PAVGB(%%mm7, %%mm0) // ~l2/2 | ||
540 | PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | ||
541 | PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | ||
542 | |||
543 | "movq (%%"FF_REG_a"), %%mm1 \n\t" // l1 | ||
544 | "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3 | ||
545 | "movq %%mm1, %%mm4 \n\t" // l1 | ||
546 | PAVGB(%%mm7, %%mm1) // ~l1/2 | ||
547 | PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | ||
548 | PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | ||
549 | |||
550 | "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | ||
551 | "psubusb %%mm1, %%mm0 \n\t" | ||
552 | "psubusb %%mm4, %%mm1 \n\t" | ||
553 | "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | ||
554 | // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | ||
555 | |||
556 | "movq (%0, %1, 4), %%mm0 \n\t" // l4 | ||
557 | "movq %%mm0, %%mm4 \n\t" // l4 | ||
558 | PAVGB(%%mm7, %%mm0) // ~l4/2 | ||
559 | PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | ||
560 | PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | ||
561 | |||
562 | "movq (%%"FF_REG_c"), %%mm2 \n\t" // l5 | ||
563 | "movq %%mm3, %%mm5 \n\t" // l3 | ||
564 | PAVGB(%%mm7, %%mm3) // ~l3/2 | ||
565 | PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | ||
566 | PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | ||
567 | |||
568 | "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | ||
569 | "psubusb %%mm3, %%mm0 \n\t" | ||
570 | "psubusb %%mm6, %%mm3 \n\t" | ||
571 | "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | ||
572 | "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | ||
573 | // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | ||
574 | |||
575 | "movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6 | ||
576 | "movq %%mm6, %%mm5 \n\t" // l6 | ||
577 | PAVGB(%%mm7, %%mm6) // ~l6/2 | ||
578 | PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | ||
579 | PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | ||
580 | |||
581 | "movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7 | ||
582 | "movq %%mm2, %%mm4 \n\t" // l5 | ||
583 | PAVGB(%%mm7, %%mm2) // ~l5/2 | ||
584 | PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | ||
585 | PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | ||
586 | |||
587 | "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | ||
588 | "psubusb %%mm2, %%mm6 \n\t" | ||
589 | "psubusb %%mm4, %%mm2 \n\t" | ||
590 | "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | ||
591 | // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | ||
592 | |||
593 | |||
594 | PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | ||
595 | "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? | ||
596 | "paddusb "MANGLE(b01)", %%mm4 \n\t" | ||
597 | "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP | ||
598 | "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | ||
599 | "pand %%mm4, %%mm3 \n\t" | ||
600 | |||
601 | "movq %%mm3, %%mm1 \n\t" | ||
602 | // "psubusb "MANGLE(b01)", %%mm3 \n\t" | ||
603 | PAVGB(%%mm7, %%mm3) | ||
604 | PAVGB(%%mm7, %%mm3) | ||
605 | "paddusb %%mm1, %%mm3 \n\t" | ||
606 | // "paddusb "MANGLE(b01)", %%mm3 \n\t" | ||
607 | |||
608 | "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3 | ||
609 | "movq (%0, %1, 4), %%mm5 \n\t" //l4 | ||
610 | "movq (%0, %1, 4), %%mm4 \n\t" //l4 | ||
611 | "psubusb %%mm6, %%mm5 \n\t" | ||
612 | "psubusb %%mm4, %%mm6 \n\t" | ||
613 | "por %%mm6, %%mm5 \n\t" // |l3-l4| | ||
614 | "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | ||
615 | "pxor %%mm6, %%mm0 \n\t" | ||
616 | "pand %%mm0, %%mm3 \n\t" | ||
617 | PMINUB(%%mm5, %%mm3, %%mm0) | ||
618 | |||
619 | "psubusb "MANGLE(b01)", %%mm3 \n\t" | ||
620 | PAVGB(%%mm7, %%mm3) | ||
621 | |||
622 | "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" | ||
623 | "movq (%0, %1, 4), %%mm2 \n\t" | ||
624 | "pxor %%mm6, %%mm0 \n\t" | ||
625 | "pxor %%mm6, %%mm2 \n\t" | ||
626 | "psubb %%mm3, %%mm0 \n\t" | ||
627 | "paddb %%mm3, %%mm2 \n\t" | ||
628 | "pxor %%mm6, %%mm0 \n\t" | ||
629 | "pxor %%mm6, %%mm2 \n\t" | ||
630 | "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" | ||
631 | "movq %%mm2, (%0, %1, 4) \n\t" | ||
632 | #endif //0 | ||
633 | |||
634 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
635 | "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | ||
636 | // 0 1 2 3 4 5 6 7 | ||
637 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 | ||
638 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | ||
639 | |||
640 | |||
641 | "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3 | ||
642 | "movq (%0, %1, 4), %%mm0 \n\t" // l4 | ||
643 | "pxor %%mm6, %%mm1 \n\t" // -l3-1 | ||
644 | PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | ||
645 | // mm1=-l3-1, mm0=128-q | ||
646 | |||
647 | "movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5 | ||
648 | "movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2 | ||
649 | "pxor %%mm6, %%mm2 \n\t" // -l5-1 | ||
650 | "movq %%mm2, %%mm5 \n\t" // -l5-1 | ||
651 | "movq "MANGLE(b80)", %%mm4 \n\t" // 128 | ||
652 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t" | ||
653 | PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 | ||
654 | PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | ||
655 | PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | ||
656 | PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | ||
657 | // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | ||
658 | |||
659 | "movq (%%"FF_REG_a"), %%mm2 \n\t" // l1 | ||
660 | "pxor %%mm6, %%mm2 \n\t" // -l1-1 | ||
661 | PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | ||
662 | PAVGB((%0), %%mm1) // (l0-l3+256)/2 | ||
663 | "movq "MANGLE(b80)", %%mm3 \n\t" // 128 | ||
664 | PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 | ||
665 | PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | ||
666 | PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | ||
667 | // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | ||
668 | |||
669 | PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2 | ||
670 | "movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7 | ||
671 | "pxor %%mm6, %%mm1 \n\t" // -l7-1 | ||
672 | PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | ||
673 | "movq "MANGLE(b80)", %%mm2 \n\t" // 128 | ||
674 | PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 | ||
675 | PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | ||
676 | PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | ||
677 | // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | ||
678 | |||
679 | "movq "MANGLE(b00)", %%mm1 \n\t" // 0 | ||
680 | "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | ||
681 | "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 | ||
682 | "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | ||
683 | PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | ||
684 | PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | ||
685 | PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | ||
686 | |||
687 | // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | ||
688 | |||
689 | "movq "MANGLE(b00)", %%mm7 \n\t" // 0 | ||
690 | "movq %2, %%mm2 \n\t" // QP | ||
691 | PAVGB(%%mm6, %%mm2) // 128 + QP/2 | ||
692 | "psubb %%mm6, %%mm2 \n\t" | ||
693 | |||
694 | "movq %%mm4, %%mm1 \n\t" | ||
695 | "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | ||
696 | "pxor %%mm1, %%mm4 \n\t" | ||
697 | "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | ||
698 | "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | ||
699 | "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | ||
700 | // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | ||
701 | |||
702 | "movq %%mm4, %%mm3 \n\t" // d | ||
703 | "psubusb "MANGLE(b01)", %%mm4 \n\t" | ||
704 | PAVGB(%%mm7, %%mm4) // d/32 | ||
705 | PAVGB(%%mm7, %%mm4) // (d + 32)/64 | ||
706 | "paddb %%mm3, %%mm4 \n\t" // 5d/64 | ||
707 | "pand %%mm2, %%mm4 \n\t" | ||
708 | |||
709 | "movq "MANGLE(b80)", %%mm5 \n\t" // 128 | ||
710 | "psubb %%mm0, %%mm5 \n\t" // q | ||
711 | "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | ||
712 | "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | ||
713 | "pxor %%mm7, %%mm5 \n\t" | ||
714 | |||
715 | PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | ||
716 | "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | ||
717 | |||
718 | "pand %%mm7, %%mm4 \n\t" | ||
719 | "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" | ||
720 | "movq (%0, %1, 4), %%mm2 \n\t" | ||
721 | "pxor %%mm1, %%mm0 \n\t" | ||
722 | "pxor %%mm1, %%mm2 \n\t" | ||
723 | "paddb %%mm4, %%mm0 \n\t" | ||
724 | "psubb %%mm4, %%mm2 \n\t" | ||
725 | "pxor %%mm1, %%mm0 \n\t" | ||
726 | "pxor %%mm1, %%mm2 \n\t" | ||
727 | "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" | ||
728 | "movq %%mm2, (%0, %1, 4) \n\t" | ||
729 | |||
730 | : | ||
731 | ✗ | : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) | |
732 | NAMED_CONSTRAINTS_ADD(b80,b00,b01) | ||
733 | : "%"FF_REG_a, "%"FF_REG_c | ||
734 | ); | ||
735 | |||
736 | /* | ||
737 | { | ||
738 | int x; | ||
739 | src-= stride; | ||
740 | for(x=0; x<BLOCK_SIZE; x++){ | ||
741 | const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | ||
742 | if(FFABS(middleEnergy)< 8*QP){ | ||
743 | const int q=(src[l4] - src[l5])/2; | ||
744 | const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | ||
745 | const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | ||
746 | |||
747 | int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); | ||
748 | d= FFMAX(d, 0); | ||
749 | |||
750 | d= (5*d + 32) >> 6; | ||
751 | d*= FFSIGN(-middleEnergy); | ||
752 | |||
753 | if(q>0){ | ||
754 | d= d<0 ? 0 : d; | ||
755 | d= d>q ? q : d; | ||
756 | }else{ | ||
757 | d= d>0 ? 0 : d; | ||
758 | d= d<q ? q : d; | ||
759 | } | ||
760 | |||
761 | src[l4]-= d; | ||
762 | src[l5]+= d; | ||
763 | } | ||
764 | src++; | ||
765 | } | ||
766 | src-=8; | ||
767 | for(x=0; x<8; x++){ | ||
768 | int y; | ||
769 | for(y=4; y<6; y++){ | ||
770 | int d= src[x+y*stride] - tmp[x+(y-4)*8]; | ||
771 | int ad= FFABS(d); | ||
772 | static int max=0; | ||
773 | static int sum=0; | ||
774 | static int num=0; | ||
775 | static int bias=0; | ||
776 | |||
777 | if(max<ad) max=ad; | ||
778 | sum+= ad>3 ? 1 : 0; | ||
779 | if(ad>3){ | ||
780 | src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | ||
781 | } | ||
782 | if(y==4) bias+=d; | ||
783 | num++; | ||
784 | if(num%1000000 == 0){ | ||
785 | av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias); | ||
786 | } | ||
787 | } | ||
788 | } | ||
789 | } | ||
790 | */ | ||
791 | #else //TEMPLATE_PP_MMXEXT | ||
792 | 67936 | const int l1= stride; /* l1..l8 are byte offsets of rows 1..8 relative to src */ | |
793 | 67936 | const int l2= stride + l1; | |
794 | 67936 | const int l3= stride + l2; | |
795 | 67936 | const int l4= stride + l3; | |
796 | 67936 | const int l5= stride + l4; | |
797 | 67936 | const int l6= stride + l5; | |
798 | 67936 | const int l7= stride + l6; | |
799 | 67936 | const int l8= stride + l7; | |
800 | // const int l9= stride + l8; | ||
801 | int x; | ||
802 | 67936 | src+= stride*3; | |
803 |
2/2✓ Branch 0 taken 271744 times.
✓ Branch 1 taken 33968 times.
|
611424 | for(x=0; x<BLOCK_SIZE; x++){ |
804 | 543488 | const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
805 |
2/2✓ Branch 0 taken 188125 times.
✓ Branch 1 taken 83619 times.
|
543488 | if(FFABS(middleEnergy) < 8*c->QP){ |
806 | 376250 | const int q=(src[l4] - src[l5])/2; /* half the step across the boundary; bounds the correction below */ | |
807 | 376250 | const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); /* same 2,-5,5,-2 pattern as middleEnergy, shifted two rows up */ | |
808 | 376250 | const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); /* same pattern, shifted two rows down */ | |
809 | |||
810 | 376250 | int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); | |
811 | 376250 | d= FFMAX(d, 0); | |
812 | |||
813 | 376250 | d= (5*d + 32) >> 6; /* scale by 5/64 with rounding */ | |
814 |
2/2✓ Branch 0 taken 126802 times.
✓ Branch 1 taken 61323 times.
|
376250 | d*= FFSIGN(-middleEnergy); |
815 | |||
816 |
2/2✓ Branch 0 taken 49601 times.
✓ Branch 1 taken 138524 times.
|
376250 | if(q>0){ |
817 | 99202 | d = FFMAX(d, 0); /* keep d between 0 and q, so the correction never has the opposite sign of q nor a larger magnitude */ | |
818 | 99202 | d = FFMIN(d, q); | |
819 | }else{ | ||
820 | 277048 | d = FFMIN(d, 0); | |
821 | 277048 | d = FFMAX(d, q); | |
822 | } | ||
823 | |||
824 | 376250 | src[l4]-= d; | |
825 | 376250 | src[l5]+= d; | |
826 | } | ||
827 | 543488 | src++; | |
828 | } | ||
829 | #endif //TEMPLATE_PP_MMXEXT | ||
830 | 67936 | } | |
831 | #endif //TEMPLATE_PP_ALTIVEC | ||
832 | |||
833 | #if !TEMPLATE_PP_ALTIVEC | ||
834 | ✗ | static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) | |
835 | { | ||
836 | #if TEMPLATE_PP_MMXEXT && HAVE_7REGS | ||
837 | DECLARE_ALIGNED(8, uint64_t, tmp)[3]; | ||
838 | ✗ | __asm__ volatile( | |
839 | "pxor %%mm6, %%mm6 \n\t" | ||
840 | "pcmpeqb %%mm7, %%mm7 \n\t" | ||
841 | "movq %2, %%mm0 \n\t" | ||
842 | "punpcklbw %%mm6, %%mm0 \n\t" | ||
843 | "psrlw $1, %%mm0 \n\t" | ||
844 | "psubw %%mm7, %%mm0 \n\t" | ||
845 | "packuswb %%mm0, %%mm0 \n\t" | ||
846 | "movq %%mm0, %3 \n\t" | ||
847 | |||
848 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
849 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t" | ||
850 | |||
851 | // 0 1 2 3 4 5 6 7 8 9 | ||
852 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | ||
853 | |||
854 | #undef REAL_FIND_MIN_MAX | ||
855 | #undef FIND_MIN_MAX | ||
856 | #define REAL_FIND_MIN_MAX(addr)\ | ||
857 | "movq " #addr ", %%mm0 \n\t"\ | ||
858 | "pminub %%mm0, %%mm7 \n\t"\ | ||
859 | "pmaxub %%mm0, %%mm6 \n\t" | ||
860 | #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) | ||
861 | |||
862 | FIND_MIN_MAX((%%FF_REGa)) | ||
863 | FIND_MIN_MAX((%%FF_REGa, %1)) | ||
864 | FIND_MIN_MAX((%%FF_REGa, %1, 2)) | ||
865 | FIND_MIN_MAX((%0, %1, 4)) | ||
866 | FIND_MIN_MAX((%%FF_REGd)) | ||
867 | FIND_MIN_MAX((%%FF_REGd, %1)) | ||
868 | FIND_MIN_MAX((%%FF_REGd, %1, 2)) | ||
869 | FIND_MIN_MAX((%0, %1, 8)) | ||
870 | |||
871 | "movq %%mm7, %%mm4 \n\t" | ||
872 | "psrlq $8, %%mm7 \n\t" | ||
873 | "pminub %%mm4, %%mm7 \n\t" // min of pixels | ||
874 | "pshufw $0xF9, %%mm7, %%mm4 \n\t" | ||
875 | "pminub %%mm4, %%mm7 \n\t" // min of pixels | ||
876 | "pshufw $0xFE, %%mm7, %%mm4 \n\t" | ||
877 | "pminub %%mm4, %%mm7 \n\t" | ||
878 | |||
879 | |||
880 | "movq %%mm6, %%mm4 \n\t" | ||
881 | "psrlq $8, %%mm6 \n\t" | ||
882 | "pmaxub %%mm4, %%mm6 \n\t" // max of pixels | ||
883 | "pshufw $0xF9, %%mm6, %%mm4 \n\t" | ||
884 | "pmaxub %%mm4, %%mm6 \n\t" | ||
885 | "pshufw $0xFE, %%mm6, %%mm4 \n\t" | ||
886 | "pmaxub %%mm4, %%mm6 \n\t" | ||
887 | "movq %%mm6, %%mm0 \n\t" // max | ||
888 | "psubb %%mm7, %%mm6 \n\t" // max - min | ||
889 | "push %%"FF_REG_a" \n\t" | ||
890 | "movd %%mm6, %%eax \n\t" | ||
891 | "cmpb "MANGLE(deringThreshold)", %%al \n\t" | ||
892 | "pop %%"FF_REG_a" \n\t" | ||
893 | " jb 1f \n\t" | ||
894 | PAVGB(%%mm0, %%mm7) // a=(max + min)/2 | ||
895 | "punpcklbw %%mm7, %%mm7 \n\t" | ||
896 | "punpcklbw %%mm7, %%mm7 \n\t" | ||
897 | "punpcklbw %%mm7, %%mm7 \n\t" | ||
898 | "movq %%mm7, (%4) \n\t" | ||
899 | |||
900 | "movq (%0), %%mm0 \n\t" // L10 | ||
901 | "movq %%mm0, %%mm1 \n\t" // L10 | ||
902 | "movq %%mm0, %%mm2 \n\t" // L10 | ||
903 | "psllq $8, %%mm1 \n\t" | ||
904 | "psrlq $8, %%mm2 \n\t" | ||
905 | "movd -4(%0), %%mm3 \n\t" | ||
906 | "movd 8(%0), %%mm4 \n\t" | ||
907 | "psrlq $24, %%mm3 \n\t" | ||
908 | "psllq $56, %%mm4 \n\t" | ||
909 | "por %%mm3, %%mm1 \n\t" // L00 | ||
910 | "por %%mm4, %%mm2 \n\t" // L20 | ||
911 | "movq %%mm1, %%mm3 \n\t" // L00 | ||
912 | PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | ||
913 | PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | ||
914 | "psubusb %%mm7, %%mm0 \n\t" | ||
915 | "psubusb %%mm7, %%mm2 \n\t" | ||
916 | "psubusb %%mm7, %%mm3 \n\t" | ||
917 | "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 | ||
918 | "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | ||
919 | "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | ||
920 | "paddb %%mm2, %%mm0 \n\t" | ||
921 | "paddb %%mm3, %%mm0 \n\t" | ||
922 | |||
923 | "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11 | ||
924 | "movq %%mm2, %%mm3 \n\t" // L11 | ||
925 | "movq %%mm2, %%mm4 \n\t" // L11 | ||
926 | "psllq $8, %%mm3 \n\t" | ||
927 | "psrlq $8, %%mm4 \n\t" | ||
928 | "movd -4(%%"FF_REG_a"), %%mm5 \n\t" | ||
929 | "movd 8(%%"FF_REG_a"), %%mm6 \n\t" | ||
930 | "psrlq $24, %%mm5 \n\t" | ||
931 | "psllq $56, %%mm6 \n\t" | ||
932 | "por %%mm5, %%mm3 \n\t" // L01 | ||
933 | "por %%mm6, %%mm4 \n\t" // L21 | ||
934 | "movq %%mm3, %%mm5 \n\t" // L01 | ||
935 | PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | ||
936 | PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | ||
937 | "psubusb %%mm7, %%mm2 \n\t" | ||
938 | "psubusb %%mm7, %%mm4 \n\t" | ||
939 | "psubusb %%mm7, %%mm5 \n\t" | ||
940 | "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 | ||
941 | "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | ||
942 | "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | ||
943 | "paddb %%mm4, %%mm2 \n\t" | ||
944 | "paddb %%mm5, %%mm2 \n\t" | ||
945 | // 0, 2, 3, 1 | ||
946 | #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | ||
947 | "movq " #src ", " #sx " \n\t" /* src[0] */\ | ||
948 | "movq " #sx ", " #lx " \n\t" /* src[0] */\ | ||
949 | "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | ||
950 | "psllq $8, " #lx " \n\t"\ | ||
951 | "psrlq $8, " #t0 " \n\t"\ | ||
952 | "movd -4" #src ", " #t1 " \n\t"\ | ||
953 | "psrlq $24, " #t1 " \n\t"\ | ||
954 | "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | ||
955 | "movd 8" #src ", " #t1 " \n\t"\ | ||
956 | "psllq $56, " #t1 " \n\t"\ | ||
957 | "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | ||
958 | "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | ||
959 | PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | ||
960 | PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | ||
961 | PAVGB(lx, pplx) \ | ||
962 | "movq " #lx ", 8(%4) \n\t"\ | ||
963 | "movq (%4), " #lx " \n\t"\ | ||
964 | "psubusb " #lx ", " #t1 " \n\t"\ | ||
965 | "psubusb " #lx ", " #t0 " \n\t"\ | ||
966 | "psubusb " #lx ", " #sx " \n\t"\ | ||
967 | "movq "MANGLE(b00)", " #lx " \n\t"\ | ||
968 | "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ | ||
969 | "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | ||
970 | "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | ||
971 | "paddb " #t1 ", " #t0 " \n\t"\ | ||
972 | "paddb " #t0 ", " #sx " \n\t"\ | ||
973 | \ | ||
974 | PAVGB(plx, pplx) /* filtered */\ | ||
975 | "movq " #dst ", " #t0 " \n\t" /* dst */\ | ||
976 | "movq " #t0 ", " #t1 " \n\t" /* dst */\ | ||
977 | "psubusb %3, " #t0 " \n\t"\ | ||
978 | "paddusb %3, " #t1 " \n\t"\ | ||
979 | PMAXUB(t0, pplx)\ | ||
980 | PMINUB(t1, pplx, t0)\ | ||
981 | "paddb " #sx ", " #ppsx " \n\t"\ | ||
982 | "paddb " #psx ", " #ppsx " \n\t"\ | ||
983 | "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ | ||
984 | "pand "MANGLE(b08)", " #ppsx " \n\t"\ | ||
985 | "pcmpeqb " #lx ", " #ppsx " \n\t"\ | ||
986 | "pand " #ppsx ", " #pplx " \n\t"\ | ||
987 | "pandn " #dst ", " #ppsx " \n\t"\ | ||
988 | "por " #pplx ", " #ppsx " \n\t"\ | ||
989 | "movq " #ppsx ", " #dst " \n\t"\ | ||
990 | "movq 8(%4), " #lx " \n\t" | ||
991 | |||
992 | #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | ||
993 | REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) | ||
994 | /* | ||
995 | 0000000 | ||
996 | 1111111 | ||
997 | |||
998 | 1111110 | ||
999 | 1111101 | ||
1000 | 1111100 | ||
1001 | 1111011 | ||
1002 | 1111010 | ||
1003 | 1111001 | ||
1004 | |||
1005 | 1111000 | ||
1006 | 1110111 | ||
1007 | |||
1008 | */ | ||
1009 | //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | ||
1010 | DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | ||
1011 | DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | ||
1012 | DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | ||
1013 | DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | ||
1014 | DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | ||
1015 | DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | ||
1016 | DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | ||
1017 | DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | ||
1018 | |||
1019 | "1: \n\t" | ||
1020 | ✗ | : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp) | |
1021 | NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08) | ||
1022 | : "%"FF_REG_a, "%"FF_REG_d | ||
1023 | ); | ||
1024 | #else // HAVE_7REGS && TEMPLATE_PP_MMXEXT | ||
1025 | int y; | ||
1026 | ✗ | int min=255; | |
1027 | ✗ | int max=0; | |
1028 | int avg; | ||
1029 | uint8_t *p; | ||
1030 | int s[10]; | ||
1031 | ✗ | const int QP2= c->QP/2 + 1; | |
1032 | |||
1033 | ✗ | src --; | |
1034 | ✗ | for(y=1; y<9; y++){ | |
1035 | int x; | ||
1036 | ✗ | p= src + stride*y; | |
1037 | ✗ | for(x=1; x<9; x++){ | |
1038 | ✗ | p++; | |
1039 | ✗ | if(*p > max) max= *p; | |
1040 | ✗ | if(*p < min) min= *p; | |
1041 | } | ||
1042 | } | ||
1043 | ✗ | avg= (min + max + 1)>>1; | |
1044 | |||
1045 | ✗ | if(max - min <deringThreshold) return; | |
1046 | |||
1047 | ✗ | for(y=0; y<10; y++){ | |
1048 | ✗ | int t = 0; | |
1049 | |||
1050 | ✗ | if(src[stride*y + 0] > avg) t+= 1; | |
1051 | ✗ | if(src[stride*y + 1] > avg) t+= 2; | |
1052 | ✗ | if(src[stride*y + 2] > avg) t+= 4; | |
1053 | ✗ | if(src[stride*y + 3] > avg) t+= 8; | |
1054 | ✗ | if(src[stride*y + 4] > avg) t+= 16; | |
1055 | ✗ | if(src[stride*y + 5] > avg) t+= 32; | |
1056 | ✗ | if(src[stride*y + 6] > avg) t+= 64; | |
1057 | ✗ | if(src[stride*y + 7] > avg) t+= 128; | |
1058 | ✗ | if(src[stride*y + 8] > avg) t+= 256; | |
1059 | ✗ | if(src[stride*y + 9] > avg) t+= 512; | |
1060 | |||
1061 | ✗ | t |= (~t)<<16; | |
1062 | ✗ | t &= (t<<1) & (t>>1); | |
1063 | ✗ | s[y] = t; | |
1064 | } | ||
1065 | |||
1066 | ✗ | for(y=1; y<9; y++){ | |
1067 | ✗ | int t = s[y-1] & s[y] & s[y+1]; | |
1068 | ✗ | t|= t>>16; | |
1069 | ✗ | s[y-1]= t; | |
1070 | } | ||
1071 | |||
1072 | ✗ | for(y=1; y<9; y++){ | |
1073 | int x; | ||
1074 | ✗ | int t = s[y-1]; | |
1075 | |||
1076 | ✗ | p= src + stride*y; | |
1077 | ✗ | for(x=1; x<9; x++){ | |
1078 | ✗ | p++; | |
1079 | ✗ | if(t & (1<<x)){ | |
1080 | ✗ | int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
1081 | ✗ | +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
1082 | ✗ | +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
1083 | ✗ | f= (f + 8)>>4; | |
1084 | |||
1085 | #ifdef DEBUG_DERING_THRESHOLD | ||
1086 | __asm__ volatile("emms\n\t":); | ||
1087 | { | ||
1088 | static uint64_t numPixels=0; | ||
1089 | if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | ||
1090 | // if((max-min)<20 || (max-min)*QP<200) | ||
1091 | // if((max-min)*QP < 500) | ||
1092 | // if(max-min<QP/2) | ||
1093 | if(max-min < 20){ | ||
1094 | static int numSkipped=0; | ||
1095 | static int errorSum=0; | ||
1096 | static int worstQP=0; | ||
1097 | static int worstRange=0; | ||
1098 | static int worstDiff=0; | ||
1099 | int diff= (f - *p); | ||
1100 | int absDiff= FFABS(diff); | ||
1101 | int error= diff*diff; | ||
1102 | |||
1103 | if(x==1 || x==8 || y==1 || y==8) continue; | ||
1104 | |||
1105 | numSkipped++; | ||
1106 | if(absDiff > worstDiff){ | ||
1107 | worstDiff= absDiff; | ||
1108 | worstQP= QP; | ||
1109 | worstRange= max-min; | ||
1110 | } | ||
1111 | errorSum+= error; | ||
1112 | |||
1113 | if(1024LL*1024LL*1024LL % numSkipped == 0){ | ||
1114 | av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, " | ||
1115 | "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | ||
1116 | (float)errorSum/numSkipped, numSkipped, worstQP, worstRange, | ||
1117 | worstDiff, (float)numSkipped/numPixels); | ||
1118 | } | ||
1119 | } | ||
1120 | } | ||
1121 | #endif | ||
1122 | ✗ | if (*p + QP2 < f) *p= *p + QP2; | |
1123 | ✗ | else if(*p - QP2 > f) *p= *p - QP2; | |
1124 | ✗ | else *p=f; | |
1125 | } | ||
1126 | } | ||
1127 | } | ||
1128 | #ifdef DEBUG_DERING_THRESHOLD | ||
1129 | if(max-min < 20){ | ||
1130 | for(y=1; y<9; y++){ | ||
1131 | int x; | ||
1132 | int t = 0; | ||
1133 | p= src + stride*y; | ||
1134 | for(x=1; x<9; x++){ | ||
1135 | p++; | ||
1136 | *p = FFMIN(*p + 20, 255); | ||
1137 | } | ||
1138 | } | ||
1139 | // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | ||
1140 | } | ||
1141 | #endif | ||
1142 | #endif //TEMPLATE_PP_MMXEXT | ||
1143 | ✗ | } | |
1144 | #endif //TEMPLATE_PP_ALTIVEC | ||
1145 | |||
/**
 * Deinterlace the given block by linearly interpolating every second line.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 *
 * Lines 5, 7, 9 and 11 are each replaced by the rounded up average of the
 * line above and the line below; lines 4, 6, 8, 10 and 12 are only read.
 * 8 pixels per line are processed.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 */
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a" \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  ecx     ecx+%1  ecx+2%1 %0+8%1  ecx+4%1

        "movq (%0), %%mm0 \n\t"
        "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"FF_REG_a") \n\t"
        "movq (%0, %1, 4), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
        "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
        PAVGB(%%mm1, %%mm0)
        "movq %%mm0, (%%"FF_REG_c") \n\t"
        "movq (%0, %1, 8), %%mm0 \n\t"
        PAVGB(%%mm0, %%mm1)
        "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"

        : : "r" (src), "r" ((x86_reg)stride)
        /* the picture is written through the input pointer, so memory has to
         * be listed as clobbered */
        : "%"FF_REG_a, "%"FF_REG_c, "memory"
    );
#else
    int x, y;
    src+= 4*stride;

    /* Plain byte accesses: the rows are uint8_t data with no guaranteed
     * 32 bit alignment, so they must not be accessed through uint32_t
     * pointers. Only odd rows are written and only even rows are read. */
    for(y=1; y<8; y+=2){
        for(x=0; x<8; x++){
            const int above= src[x + stride*(y-1)];
            const int below= src[x + stride*(y+1)];
            src[x + stride*y]= (above + below + 1)>>1;
        }
    }
#endif
}
1197 | |||
1198 | /** | ||
1199 | * Deinterlace the given block by cubic interpolating every second line. | ||
1200 | * will be called for every 8x8 block and can read & write from line 4-15 | ||
1201 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. | ||
1202 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced | ||
1203 | * this filter will read lines 3-15 and write 7-13 | ||
1204 | */ | ||
1205 | 24640 | static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) | |
1206 | { | ||
1207 | #if TEMPLATE_PP_SSE2 | ||
1208 | ✗ | src+= stride*3; | |
1209 | ✗ | __asm__ volatile( | |
1210 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
1211 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t" | ||
1212 | "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t" | ||
1213 | "add %1, %%"FF_REG_c" \n\t" | ||
1214 | "pxor %%xmm7, %%xmm7 \n\t" | ||
1215 | #define REAL_DEINT_CUBIC(a,b,c,d,e)\ | ||
1216 | "movq " #a ", %%xmm0 \n\t"\ | ||
1217 | "movq " #b ", %%xmm1 \n\t"\ | ||
1218 | "movq " #d ", %%xmm2 \n\t"\ | ||
1219 | "movq " #e ", %%xmm3 \n\t"\ | ||
1220 | "pavgb %%xmm2, %%xmm1 \n\t"\ | ||
1221 | "pavgb %%xmm3, %%xmm0 \n\t"\ | ||
1222 | "punpcklbw %%xmm7, %%xmm0 \n\t"\ | ||
1223 | "punpcklbw %%xmm7, %%xmm1 \n\t"\ | ||
1224 | "psubw %%xmm1, %%xmm0 \n\t"\ | ||
1225 | "psraw $3, %%xmm0 \n\t"\ | ||
1226 | "psubw %%xmm0, %%xmm1 \n\t"\ | ||
1227 | "packuswb %%xmm1, %%xmm1 \n\t"\ | ||
1228 | "movlps %%xmm1, " #c " \n\t" | ||
1229 | #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) | ||
1230 | |||
1231 | DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1)) | ||
1232 | DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8)) | ||
1233 | DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc)) | ||
1234 | DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2)) | ||
1235 | |||
1236 | ✗ | : : "r" (src), "r" ((x86_reg)stride) | |
1237 | : | ||
1238 | XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",) | ||
1239 | "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c | ||
1240 | ); | ||
1241 | #undef REAL_DEINT_CUBIC | ||
1242 | #else //TEMPLATE_PP_SSE2 | ||
1243 | int x; | ||
1244 | 24640 | src+= stride*3; | |
1245 |
2/2✓ Branch 0 taken 98560 times.
✓ Branch 1 taken 12320 times.
|
221760 | for(x=0; x<8; x++){ |
1246 | 197120 | src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); | |
1247 | 197120 | src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | |
1248 | 197120 | src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | |
1249 | 197120 | src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | |
1250 | 197120 | src++; | |
1251 | } | ||
1252 | #endif //TEMPLATE_PP_SSE2 | ||
1253 | 24640 | } | |
1254 | |||
1255 | /** | ||
1256 | * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter. | ||
1257 | * will be called for every 8x8 block and can read & write from line 4-15 | ||
1258 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. | ||
1259 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced | ||
1260 | * this filter will read lines 4-13 and write 5-11 | ||
1261 | */ | ||
1262 | 24640 | static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
1263 | { | ||
1264 | #if TEMPLATE_PP_MMXEXT | ||
1265 | ✗ | src+= stride*4; | |
1266 | ✗ | __asm__ volatile( | |
1267 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
1268 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t" | ||
1269 | "pxor %%mm7, %%mm7 \n\t" | ||
1270 | "movq (%2), %%mm0 \n\t" | ||
1271 | // 0 1 2 3 4 5 6 7 8 9 10 | ||
1272 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | ||
1273 | |||
1274 | #define REAL_DEINT_FF(a,b,c,d)\ | ||
1275 | "movq " #a ", %%mm1 \n\t"\ | ||
1276 | "movq " #b ", %%mm2 \n\t"\ | ||
1277 | "movq " #c ", %%mm3 \n\t"\ | ||
1278 | "movq " #d ", %%mm4 \n\t"\ | ||
1279 | PAVGB(%%mm3, %%mm1) \ | ||
1280 | PAVGB(%%mm4, %%mm0) \ | ||
1281 | "movq %%mm0, %%mm3 \n\t"\ | ||
1282 | "punpcklbw %%mm7, %%mm0 \n\t"\ | ||
1283 | "punpckhbw %%mm7, %%mm3 \n\t"\ | ||
1284 | "movq %%mm1, %%mm4 \n\t"\ | ||
1285 | "punpcklbw %%mm7, %%mm1 \n\t"\ | ||
1286 | "punpckhbw %%mm7, %%mm4 \n\t"\ | ||
1287 | "psllw $2, %%mm1 \n\t"\ | ||
1288 | "psllw $2, %%mm4 \n\t"\ | ||
1289 | "psubw %%mm0, %%mm1 \n\t"\ | ||
1290 | "psubw %%mm3, %%mm4 \n\t"\ | ||
1291 | "movq %%mm2, %%mm5 \n\t"\ | ||
1292 | "movq %%mm2, %%mm0 \n\t"\ | ||
1293 | "punpcklbw %%mm7, %%mm2 \n\t"\ | ||
1294 | "punpckhbw %%mm7, %%mm5 \n\t"\ | ||
1295 | "paddw %%mm2, %%mm1 \n\t"\ | ||
1296 | "paddw %%mm5, %%mm4 \n\t"\ | ||
1297 | "psraw $2, %%mm1 \n\t"\ | ||
1298 | "psraw $2, %%mm4 \n\t"\ | ||
1299 | "packuswb %%mm4, %%mm1 \n\t"\ | ||
1300 | "movq %%mm1, " #b " \n\t"\ | ||
1301 | |||
1302 | #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) | ||
1303 | |||
1304 | DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2)) | ||
1305 | DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) ) | ||
1306 | DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2)) | ||
1307 | DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4)) | ||
1308 | |||
1309 | "movq %%mm0, (%2) \n\t" | ||
1310 | ✗ | : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) | |
1311 | : "%"FF_REG_a, "%"FF_REG_d | ||
1312 | ); | ||
1313 | #else //TEMPLATE_PP_MMXEXT | ||
1314 | int x; | ||
1315 | 24640 | src+= stride*4; | |
1316 |
2/2✓ Branch 0 taken 98560 times.
✓ Branch 1 taken 12320 times.
|
221760 | for(x=0; x<8; x++){ |
1317 | 197120 | int t1= tmp[x]; | |
1318 | 197120 | int t2= src[stride*1]; | |
1319 | |||
1320 | 197120 | src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); | |
1321 | 197120 | t1= src[stride*4]; | |
1322 | 197120 | src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); | |
1323 | 197120 | t2= src[stride*6]; | |
1324 | 197120 | src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); | |
1325 | 197120 | t1= src[stride*8]; | |
1326 | 197120 | src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); | |
1327 | 197120 | tmp[x]= t1; | |
1328 | |||
1329 | 197120 | src++; | |
1330 | } | ||
1331 | #endif //TEMPLATE_PP_MMXEXT | ||
1332 | 24640 | } | |
1333 | |||
1334 | /** | ||
1335 | * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter. | ||
1336 | * will be called for every 8x8 block and can read & write from line 4-15 | ||
1337 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. | ||
1338 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced | ||
1339 | * this filter will read lines 4-13 and write 4-11 | ||
1340 | */ | ||
1341 | 73920 | static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | |
1342 | { | ||
1343 | #if TEMPLATE_PP_MMXEXT && HAVE_6REGS | ||
1344 | ✗ | src+= stride*4; | |
1345 | ✗ | __asm__ volatile( | |
1346 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
1347 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t" | ||
1348 | "pxor %%mm7, %%mm7 \n\t" | ||
1349 | "movq (%2), %%mm0 \n\t" | ||
1350 | "movq (%3), %%mm1 \n\t" | ||
1351 | // 0 1 2 3 4 5 6 7 8 9 10 | ||
1352 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | ||
1353 | |||
1354 | #define REAL_DEINT_L5(t1,t2,a,b,c)\ | ||
1355 | "movq " #a ", %%mm2 \n\t"\ | ||
1356 | "movq " #b ", %%mm3 \n\t"\ | ||
1357 | "movq " #c ", %%mm4 \n\t"\ | ||
1358 | PAVGB(t2, %%mm3) \ | ||
1359 | PAVGB(t1, %%mm4) \ | ||
1360 | "movq %%mm2, %%mm5 \n\t"\ | ||
1361 | "movq %%mm2, " #t1 " \n\t"\ | ||
1362 | "punpcklbw %%mm7, %%mm2 \n\t"\ | ||
1363 | "punpckhbw %%mm7, %%mm5 \n\t"\ | ||
1364 | "movq %%mm2, %%mm6 \n\t"\ | ||
1365 | "paddw %%mm2, %%mm2 \n\t"\ | ||
1366 | "paddw %%mm6, %%mm2 \n\t"\ | ||
1367 | "movq %%mm5, %%mm6 \n\t"\ | ||
1368 | "paddw %%mm5, %%mm5 \n\t"\ | ||
1369 | "paddw %%mm6, %%mm5 \n\t"\ | ||
1370 | "movq %%mm3, %%mm6 \n\t"\ | ||
1371 | "punpcklbw %%mm7, %%mm3 \n\t"\ | ||
1372 | "punpckhbw %%mm7, %%mm6 \n\t"\ | ||
1373 | "paddw %%mm3, %%mm3 \n\t"\ | ||
1374 | "paddw %%mm6, %%mm6 \n\t"\ | ||
1375 | "paddw %%mm3, %%mm2 \n\t"\ | ||
1376 | "paddw %%mm6, %%mm5 \n\t"\ | ||
1377 | "movq %%mm4, %%mm6 \n\t"\ | ||
1378 | "punpcklbw %%mm7, %%mm4 \n\t"\ | ||
1379 | "punpckhbw %%mm7, %%mm6 \n\t"\ | ||
1380 | "psubw %%mm4, %%mm2 \n\t"\ | ||
1381 | "psubw %%mm6, %%mm5 \n\t"\ | ||
1382 | "psraw $2, %%mm2 \n\t"\ | ||
1383 | "psraw $2, %%mm5 \n\t"\ | ||
1384 | "packuswb %%mm5, %%mm2 \n\t"\ | ||
1385 | "movq %%mm2, " #a " \n\t"\ | ||
1386 | |||
1387 | #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) | ||
1388 | |||
1389 | DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) ) | ||
1390 | DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2)) | ||
1391 | DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) ) | ||
1392 | DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) ) | ||
1393 | DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) ) | ||
1394 | DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2)) | ||
1395 | DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) ) | ||
1396 | DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4)) | ||
1397 | |||
1398 | "movq %%mm0, (%2) \n\t" | ||
1399 | "movq %%mm1, (%3) \n\t" | ||
1400 | ✗ | : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) | |
1401 | : "%"FF_REG_a, "%"FF_REG_d | ||
1402 | ); | ||
1403 | #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS | ||
1404 | int x; | ||
1405 | 73920 | src+= stride*4; | |
1406 |
2/2✓ Branch 0 taken 295680 times.
✓ Branch 1 taken 36960 times.
|
665280 | for(x=0; x<8; x++){ |
1407 | 591360 | int t1= tmp[x]; | |
1408 | 591360 | int t2= tmp2[x]; | |
1409 | 591360 | int t3= src[0]; | |
1410 | |||
1411 | 591360 | src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); | |
1412 | 591360 | t1= src[stride*1]; | |
1413 | 591360 | src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); | |
1414 | 591360 | t2= src[stride*2]; | |
1415 | 591360 | src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); | |
1416 | 591360 | t3= src[stride*3]; | |
1417 | 591360 | src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); | |
1418 | 591360 | t1= src[stride*4]; | |
1419 | 591360 | src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); | |
1420 | 591360 | t2= src[stride*5]; | |
1421 | 591360 | src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); | |
1422 | 591360 | t3= src[stride*6]; | |
1423 | 591360 | src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); | |
1424 | 591360 | t1= src[stride*7]; | |
1425 | 591360 | src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); | |
1426 | |||
1427 | 591360 | tmp[x]= t3; | |
1428 | 591360 | tmp2[x]= t1; | |
1429 | |||
1430 | 591360 | src++; | |
1431 | } | ||
1432 | #endif // TEMPLATE_PP_MMXEXT && HAVE_6REGS | ||
1433 | 73920 | } | |
1434 | |||
/**
 * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 4-12 (and tmp) and write 4-11
 *
 * Each written pixel is the rounded up average of its unfiltered value and
 * the rounded down average of the unfiltered pixels above and below it.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 * @param tmp    8 bytes of state: on entry the unfiltered line directly above
 *               the first filtered line, on return the unfiltered line 11
 */
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a" \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

        "movq (%2), %%mm0 \n\t" // L0
        "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
        PAVGB(%%mm1, %%mm0) // L0+L2
        "movq (%0), %%mm2 \n\t" // L1
        PAVGB(%%mm2, %%mm0)
        "movq %%mm0, (%0) \n\t"
        "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
        PAVGB(%%mm0, %%mm2) // L1+L3
        PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
        "movq %%mm2, (%%"FF_REG_a") \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
        PAVGB(%%mm2, %%mm1) // L2+L4
        PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
        "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
        "movq (%0, %1, 4), %%mm1 \n\t" // L5
        PAVGB(%%mm1, %%mm0) // L3+L5
        PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
        "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
        "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
        PAVGB(%%mm0, %%mm2) // L4+L6
        PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
        "movq %%mm2, (%0, %1, 4) \n\t"
        "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
        PAVGB(%%mm2, %%mm1) // L5+L7
        PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
        "movq %%mm1, (%%"FF_REG_d") \n\t"
        "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
        PAVGB(%%mm1, %%mm0) // L6+L8
        PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
        "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
        "movq (%0, %1, 8), %%mm0 \n\t" // L9
        PAVGB(%%mm0, %%mm2) // L7+L9
        PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
        "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
        "movq %%mm1, (%2) \n\t"

        : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
        /* the picture and tmp are written through input pointers, so memory
         * has to be listed as clobbered */
        : "%"FF_REG_a, "%"FF_REG_d, "memory"
    );
#else //TEMPLATE_PP_MMXEXT
    int x, y;
    src+= 4*stride;

    /* Plain byte accesses: src and tmp are uint8_t data with no guaranteed
     * 32 bit alignment, so they must not be accessed through uint32_t
     * pointers. This assumes tmp does not overlap the picture. */
    for(x=0; x<8; x++){
        int above= tmp[x];   /* unfiltered pixel above the current one */
        int cur= src[x];     /* unfiltered current pixel */

        for(y=0; y<8; y++){
            const int below= src[x + stride*(y+1)];

            /* inner average rounds down, outer average rounds up */
            src[x + stride*y]= (cur + ((above + below)>>1) + 1)>>1;
            above= cur;
            cur= below;
        }
        /* unfiltered row 7 is the "above" row of the next call */
        tmp[x]= above;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1536 | |||
/**
 * Deinterlace the given block by applying a median filter to every second line.
 *
 * Called for every 8x8 block; it may read and write lines 4-15. Lines 0-3 have
 * already been passed through the deblock / dering filters, but can be read,
 * too. Lines 4-12 will be read by the deblocking filter and should therefore
 * be deinterlaced.
 *
 * Counting from line 0 of the block, lines 5, 7, 9 and 11 are each replaced
 * by the per-pixel median of the line itself and its two vertical neighbours.
 * The even lines 4-12 are only read.
 *
 * @param src    pointer to line 0 of the block (8 pixels per line are processed)
 * @param stride offset in bytes between two consecutive lines
 */
static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
{
#if TEMPLATE_PP_MMXEXT
    src+= 4*stride;
    __asm__ volatile(
        "lea (%0, %1), %%"FF_REG_a" \n\t"
        "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
//      0       1       2       3       4       5       6       7       8       9
//      %0      eax     eax+%1  eax+2%1 %0+4%1  edx     edx+%1  edx+2%1 %0+8%1  edx+4%1

        // median(a, b, c) = min(max(a, b), max(min(a, b), c))
        "movq (%0), %%mm0 \n\t"
        "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
        "movq (%%"FF_REG_a"), %%mm1 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm3, %%mm1 \n\t"
        "pmaxub %%mm2, %%mm1 \n\t"
        "pminub %%mm1, %%mm0 \n\t"
        "movq %%mm0, (%%"FF_REG_a") \n\t"

        "movq (%0, %1, 4), %%mm0 \n\t"
        "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm1, %%mm2 \n\t"
        "pminub %%mm3, %%mm1 \n\t"
        "pmaxub %%mm0, %%mm1 \n\t"
        "pminub %%mm1, %%mm2 \n\t"
        "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"

        "movq (%%"FF_REG_d"), %%mm2 \n\t"
        "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm0, %%mm2 \n\t"
        "pminub %%mm3, %%mm0 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm0, %%mm2 \n\t"
        "movq %%mm2, (%%"FF_REG_d") \n\t"

        "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
        "movq (%0, %1, 8), %%mm0 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "pmaxub %%mm0, %%mm2 \n\t"
        "pminub %%mm3, %%mm0 \n\t"
        "pmaxub %%mm1, %%mm0 \n\t"
        "pminub %%mm0, %%mm2 \n\t"
        "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"


        : : "r" (src), "r" ((x86_reg)stride)
        // "memory": the asm reads and writes the pixels behind src, which are
        // not listed as operands
        : "%"FF_REG_a, "%"FF_REG_d, "memory"
    );

#else //TEMPLATE_PP_MMXEXT
    int x, y;
    src+= 4*stride;
    // FIXME - there should be a way to do a few columns in parallel like w/mmx
    for(x=0; x<8; x++){
        uint8_t *colsrc = src;
        for (y=0; y<4; y++){
            const int a = colsrc[0       ];
            const int b = colsrc[stride  ];
            const int c = colsrc[stride*2];
            // all-ones (-1) when the comparison holds, 0 otherwise; written
            // without shifting a negative value so it is fully portable
            const int d = -(a < b);
            const int e = -(b < c);
            const int f = -(c < a);
            // the two non-median values are forced to all-ones by their mask,
            // so the AND leaves exactly the median
            colsrc[stride  ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
            colsrc += stride*2;
        }
        src++;
    }
#endif //TEMPLATE_PP_MMXEXT
}
1616 | |||
1617 | #if TEMPLATE_PP_MMX | ||
1618 | /** | ||
1619 | * Transpose and shift the given 8x8 Block into dst1 and dst2. | ||
1620 | */ | ||
1621 | ✗ | static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride) | |
1622 | { | ||
1623 | ✗ | __asm__( | |
1624 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
1625 | // 0 1 2 3 4 5 6 7 8 9 | ||
1626 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | ||
1627 | "movq (%0), %%mm0 \n\t" // 12345678 | ||
1628 | "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh | ||
1629 | "movq %%mm0, %%mm2 \n\t" // 12345678 | ||
1630 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | ||
1631 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | ||
1632 | |||
1633 | "movq (%%"FF_REG_a", %1), %%mm1 \n\t" | ||
1634 | "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" | ||
1635 | "movq %%mm1, %%mm4 \n\t" | ||
1636 | "punpcklbw %%mm3, %%mm1 \n\t" | ||
1637 | "punpckhbw %%mm3, %%mm4 \n\t" | ||
1638 | |||
1639 | "movq %%mm0, %%mm3 \n\t" | ||
1640 | "punpcklwd %%mm1, %%mm0 \n\t" | ||
1641 | "punpckhwd %%mm1, %%mm3 \n\t" | ||
1642 | "movq %%mm2, %%mm1 \n\t" | ||
1643 | "punpcklwd %%mm4, %%mm2 \n\t" | ||
1644 | "punpckhwd %%mm4, %%mm1 \n\t" | ||
1645 | |||
1646 | "movd %%mm0, 128(%2) \n\t" | ||
1647 | "psrlq $32, %%mm0 \n\t" | ||
1648 | "movd %%mm0, 144(%2) \n\t" | ||
1649 | "movd %%mm3, 160(%2) \n\t" | ||
1650 | "psrlq $32, %%mm3 \n\t" | ||
1651 | "movd %%mm3, 176(%2) \n\t" | ||
1652 | "movd %%mm3, 48(%3) \n\t" | ||
1653 | "movd %%mm2, 192(%2) \n\t" | ||
1654 | "movd %%mm2, 64(%3) \n\t" | ||
1655 | "psrlq $32, %%mm2 \n\t" | ||
1656 | "movd %%mm2, 80(%3) \n\t" | ||
1657 | "movd %%mm1, 96(%3) \n\t" | ||
1658 | "psrlq $32, %%mm1 \n\t" | ||
1659 | "movd %%mm1, 112(%3) \n\t" | ||
1660 | |||
1661 | "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t" | ||
1662 | |||
1663 | "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 | ||
1664 | "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh | ||
1665 | "movq %%mm0, %%mm2 \n\t" // 12345678 | ||
1666 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | ||
1667 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | ||
1668 | |||
1669 | "movq (%%"FF_REG_a", %1), %%mm1 \n\t" | ||
1670 | "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" | ||
1671 | "movq %%mm1, %%mm4 \n\t" | ||
1672 | "punpcklbw %%mm3, %%mm1 \n\t" | ||
1673 | "punpckhbw %%mm3, %%mm4 \n\t" | ||
1674 | |||
1675 | "movq %%mm0, %%mm3 \n\t" | ||
1676 | "punpcklwd %%mm1, %%mm0 \n\t" | ||
1677 | "punpckhwd %%mm1, %%mm3 \n\t" | ||
1678 | "movq %%mm2, %%mm1 \n\t" | ||
1679 | "punpcklwd %%mm4, %%mm2 \n\t" | ||
1680 | "punpckhwd %%mm4, %%mm1 \n\t" | ||
1681 | |||
1682 | "movd %%mm0, 132(%2) \n\t" | ||
1683 | "psrlq $32, %%mm0 \n\t" | ||
1684 | "movd %%mm0, 148(%2) \n\t" | ||
1685 | "movd %%mm3, 164(%2) \n\t" | ||
1686 | "psrlq $32, %%mm3 \n\t" | ||
1687 | "movd %%mm3, 180(%2) \n\t" | ||
1688 | "movd %%mm3, 52(%3) \n\t" | ||
1689 | "movd %%mm2, 196(%2) \n\t" | ||
1690 | "movd %%mm2, 68(%3) \n\t" | ||
1691 | "psrlq $32, %%mm2 \n\t" | ||
1692 | "movd %%mm2, 84(%3) \n\t" | ||
1693 | "movd %%mm1, 100(%3) \n\t" | ||
1694 | "psrlq $32, %%mm1 \n\t" | ||
1695 | "movd %%mm1, 116(%3) \n\t" | ||
1696 | |||
1697 | |||
1698 | ✗ | :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2) | |
1699 | : "%"FF_REG_a | ||
1700 | ); | ||
1701 | ✗ | } | |
1702 | |||
1703 | /** | ||
1704 | * Transpose the given 8x8 block. | ||
1705 | */ | ||
1706 | ✗ | static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src) | |
1707 | { | ||
1708 | ✗ | __asm__( | |
1709 | "lea (%0, %1), %%"FF_REG_a" \n\t" | ||
1710 | "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t" | ||
1711 | // 0 1 2 3 4 5 6 7 8 9 | ||
1712 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | ||
1713 | "movq (%2), %%mm0 \n\t" // 12345678 | ||
1714 | "movq 16(%2), %%mm1 \n\t" // abcdefgh | ||
1715 | "movq %%mm0, %%mm2 \n\t" // 12345678 | ||
1716 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | ||
1717 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | ||
1718 | |||
1719 | "movq 32(%2), %%mm1 \n\t" | ||
1720 | "movq 48(%2), %%mm3 \n\t" | ||
1721 | "movq %%mm1, %%mm4 \n\t" | ||
1722 | "punpcklbw %%mm3, %%mm1 \n\t" | ||
1723 | "punpckhbw %%mm3, %%mm4 \n\t" | ||
1724 | |||
1725 | "movq %%mm0, %%mm3 \n\t" | ||
1726 | "punpcklwd %%mm1, %%mm0 \n\t" | ||
1727 | "punpckhwd %%mm1, %%mm3 \n\t" | ||
1728 | "movq %%mm2, %%mm1 \n\t" | ||
1729 | "punpcklwd %%mm4, %%mm2 \n\t" | ||
1730 | "punpckhwd %%mm4, %%mm1 \n\t" | ||
1731 | |||
1732 | "movd %%mm0, (%0) \n\t" | ||
1733 | "psrlq $32, %%mm0 \n\t" | ||
1734 | "movd %%mm0, (%%"FF_REG_a") \n\t" | ||
1735 | "movd %%mm3, (%%"FF_REG_a", %1) \n\t" | ||
1736 | "psrlq $32, %%mm3 \n\t" | ||
1737 | "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t" | ||
1738 | "movd %%mm2, (%0, %1, 4) \n\t" | ||
1739 | "psrlq $32, %%mm2 \n\t" | ||
1740 | "movd %%mm2, (%%"FF_REG_d") \n\t" | ||
1741 | "movd %%mm1, (%%"FF_REG_d", %1) \n\t" | ||
1742 | "psrlq $32, %%mm1 \n\t" | ||
1743 | "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t" | ||
1744 | |||
1745 | |||
1746 | "movq 64(%2), %%mm0 \n\t" // 12345678 | ||
1747 | "movq 80(%2), %%mm1 \n\t" // abcdefgh | ||
1748 | "movq %%mm0, %%mm2 \n\t" // 12345678 | ||
1749 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | ||
1750 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | ||
1751 | |||
1752 | "movq 96(%2), %%mm1 \n\t" | ||
1753 | "movq 112(%2), %%mm3 \n\t" | ||
1754 | "movq %%mm1, %%mm4 \n\t" | ||
1755 | "punpcklbw %%mm3, %%mm1 \n\t" | ||
1756 | "punpckhbw %%mm3, %%mm4 \n\t" | ||
1757 | |||
1758 | "movq %%mm0, %%mm3 \n\t" | ||
1759 | "punpcklwd %%mm1, %%mm0 \n\t" | ||
1760 | "punpckhwd %%mm1, %%mm3 \n\t" | ||
1761 | "movq %%mm2, %%mm1 \n\t" | ||
1762 | "punpcklwd %%mm4, %%mm2 \n\t" | ||
1763 | "punpckhwd %%mm4, %%mm1 \n\t" | ||
1764 | |||
1765 | "movd %%mm0, 4(%0) \n\t" | ||
1766 | "psrlq $32, %%mm0 \n\t" | ||
1767 | "movd %%mm0, 4(%%"FF_REG_a") \n\t" | ||
1768 | "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t" | ||
1769 | "psrlq $32, %%mm3 \n\t" | ||
1770 | "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t" | ||
1771 | "movd %%mm2, 4(%0, %1, 4) \n\t" | ||
1772 | "psrlq $32, %%mm2 \n\t" | ||
1773 | "movd %%mm2, 4(%%"FF_REG_d") \n\t" | ||
1774 | "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t" | ||
1775 | "psrlq $32, %%mm1 \n\t" | ||
1776 | "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t" | ||
1777 | |||
1778 | ✗ | :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src) | |
1779 | : "%"FF_REG_a, "%"FF_REG_d | ||
1780 | ); | ||
1781 | ✗ | } | |
1782 | #endif //TEMPLATE_PP_MMX | ||
1783 | //static long test=0; | ||
1784 | |||
1785 | #if !TEMPLATE_PP_ALTIVEC | ||
/**
 * Temporal noise reducer for one 8x8 block.
 *
 * The difference between the current block (src) and the temporally blurred
 * reference (tempBlurred) is measured, combined with the values stored for
 * the neighbouring blocks in tempBlurredPast (entries -256, -1, +1 and +256)
 * and compared against the thresholds in maxNoise. Depending on the result
 * the reference is either replaced by the current block, or current block and
 * reference are both replaced by a weighted average of the two.
 *
 * NOTE(review): the C fallback stores the raw sum of squared differences in
 * *tempBlurredPast, while the asm stores the already smoothed value and (with
 * FAST_L2_DIFF) uses an approximated difference - confirm this is intended.
 *
 * @param src             current 8x8 block, may be modified
 * @param stride          line offset in bytes, used for src and tempBlurred
 * @param tempBlurred     temporally blurred reference block, updated
 * @param tempBlurredPast difference history; entry 0 and entries 127-129 are
 *                        written, entries -256, -1, +1 and +256 are read
 * @param maxNoise        three thresholds
 */
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
                                    uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
{
    // The thresholds are parked behind the history entry so that the asm can
    // reach them through the same pointer and save a register
    // (FIXME do this outside of the loops)
    tempBlurredPast[127]= maxNoise[0];
    tempBlurredPast[128]= maxNoise[1];
    tempBlurredPast[129]= maxNoise[2];

#define FAST_L2_DIFF
//#define L1_DIFF //u should change the thresholds too if u try that one
#if TEMPLATE_PP_MMXEXT && HAVE_6REGS
    __asm__ volatile(
        "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
        "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
        "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
//      0       1       2       3       4       5       6       7       8
//      %x      %x+%2   %x+2%2  %x+eax  %x+4%2  %x+edx  %x+2eax %x+ecx  %x+8%2
//FIXME reorder?
#ifdef L1_DIFF //needs mmx2
        "movq (%0), %%mm0 \n\t" // L0
        "psadbw (%1), %%mm0 \n\t" // |L0-R0|
        "movq (%0, %2), %%mm1 \n\t" // L1
        "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
        "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
        "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|

        "movq (%0, %2, 4), %%mm4 \n\t" // L4
        "paddw %%mm1, %%mm0 \n\t"
        "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
        "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
        "paddw %%mm2, %%mm0 \n\t"
        "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
        "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
        "paddw %%mm3, %%mm0 \n\t"
        "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
        "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
        "paddw %%mm4, %%mm0 \n\t"
        "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
        "paddw %%mm5, %%mm6 \n\t"
        "paddw %%mm7, %%mm6 \n\t"
        "paddw %%mm6, %%mm0 \n\t"
#else //L1_DIFF
#if defined (FAST_L2_DIFF)
        "pcmpeqb %%mm7, %%mm7 \n\t"
        "movq "MANGLE(b80)", %%mm6 \n\t"
        "pxor %%mm0, %%mm0 \n\t"
#define REAL_L2_DIFF_CORE(a, b)\
        "movq " #a ", %%mm5 \n\t"\
        "movq " #b ", %%mm2 \n\t"\
        "pxor %%mm7, %%mm2 \n\t"\
        PAVGB(%%mm2, %%mm5)\
        "paddb %%mm6, %%mm5 \n\t"\
        "movq %%mm5, %%mm2 \n\t"\
        "psllw $8, %%mm5 \n\t"\
        "pmaddwd %%mm5, %%mm5 \n\t"\
        "pmaddwd %%mm2, %%mm2 \n\t"\
        "paddd %%mm2, %%mm5 \n\t"\
        "psrld $14, %%mm5 \n\t"\
        "paddd %%mm5, %%mm0 \n\t"

#else //defined (FAST_L2_DIFF)
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm0, %%mm0 \n\t"
#define REAL_L2_DIFF_CORE(a, b)\
        "movq " #a ", %%mm5 \n\t"\
        "movq " #b ", %%mm2 \n\t"\
        "movq %%mm5, %%mm1 \n\t"\
        "movq %%mm2, %%mm3 \n\t"\
        "punpcklbw %%mm7, %%mm5 \n\t"\
        "punpckhbw %%mm7, %%mm1 \n\t"\
        "punpcklbw %%mm7, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm3 \n\t"\
        "psubw %%mm2, %%mm5 \n\t"\
        "psubw %%mm3, %%mm1 \n\t"\
        "pmaddwd %%mm5, %%mm5 \n\t"\
        "pmaddwd %%mm1, %%mm1 \n\t"\
        "paddd %%mm1, %%mm5 \n\t"\
        "paddd %%mm5, %%mm0 \n\t"

#endif //defined (FAST_L2_DIFF)

// second macro level so that the FF_REG* arguments are expanded before
// they are stringified
#define L2_DIFF_CORE(a, b)  REAL_L2_DIFF_CORE(a, b)

L2_DIFF_CORE((%0)          , (%1))
L2_DIFF_CORE((%0, %2)      , (%1, %2))
L2_DIFF_CORE((%0, %2, 2)   , (%1, %2, 2))
L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
L2_DIFF_CORE((%0, %2, 4)   , (%1, %2, 4))
L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))

#endif //L1_DIFF

        // fold the two halves of mm0, then smooth with the four neighbours
        "movq %%mm0, %%mm4 \n\t"
        "psrlq $32, %%mm0 \n\t"
        "paddd %%mm0, %%mm4 \n\t"
        "movd %%mm4, %%ecx \n\t"
        "shll $2, %%ecx \n\t"
        "mov %3, %%"FF_REG_d" \n\t"
        "addl -4(%%"FF_REG_d"), %%ecx \n\t"
        "addl 4(%%"FF_REG_d"), %%ecx \n\t"
        "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
        "addl $4, %%ecx \n\t"
        "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
        "shrl $3, %%ecx \n\t"
        "movl %%ecx, (%%"FF_REG_d") \n\t"

        // 508/512/516 are the byte offsets of entries 127/128/129
        "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
        " jb 2f \n\t"
        "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
        " jb 1f \n\t"

        // copy the current block into the reference
        "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
        "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
        "movq (%0, %2, 4), %%mm4 \n\t" // L4
        "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
        "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
        "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
        "movq %%mm0, (%1) \n\t" // L0
        "movq %%mm1, (%1, %2) \n\t" // L1
        "movq %%mm2, (%1, %2, 2) \n\t" // L2
        "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
        "movq %%mm4, (%1, %2, 4) \n\t" // L4
        "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
        "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
        "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
        "jmp 4f \n\t"

        // one PAVGB per line, result goes to both blocks
        "1: \n\t"
        "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
        "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        PAVGB((%1), %%mm0) // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        PAVGB((%1, %2), %%mm1) // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        PAVGB((%1, %2, 2), %%mm2) // L2
        "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
        PAVGB((%1, %%FF_REGa), %%mm3) // L3
        "movq (%0, %2, 4), %%mm4 \n\t" // L4
        PAVGB((%1, %2, 4), %%mm4) // L4
        "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
        PAVGB((%1, %%FF_REGd), %%mm5) // L5
        "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
        PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
        "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
        PAVGB((%1, %%FF_REGc), %%mm7) // L7
        "movq %%mm0, (%1) \n\t" // R0
        "movq %%mm1, (%1, %2) \n\t" // R1
        "movq %%mm2, (%1, %2, 2) \n\t" // R2
        "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
        "movq %%mm4, (%1, %2, 4) \n\t" // R4
        "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
        "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
        "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
        "movq %%mm0, (%0) \n\t" // L0
        "movq %%mm1, (%0, %2) \n\t" // L1
        "movq %%mm2, (%0, %2, 2) \n\t" // L2
        "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
        "movq %%mm4, (%0, %2, 4) \n\t" // L4
        "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
        "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
        "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
        "jmp 4f \n\t"

        "2: \n\t"
        "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
        " jb 3f \n\t"

        // two PAVGB steps towards the reference per line
        "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
        "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
        "movq (%1), %%mm4 \n\t" // R0
        "movq (%1, %2), %%mm5 \n\t" // R1
        "movq (%1, %2, 2), %%mm6 \n\t" // R2
        "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1) \n\t" // R0
        "movq %%mm1, (%1, %2) \n\t" // R1
        "movq %%mm2, (%1, %2, 2) \n\t" // R2
        "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
        "movq %%mm0, (%0) \n\t" // L0
        "movq %%mm1, (%0, %2) \n\t" // L1
        "movq %%mm2, (%0, %2, 2) \n\t" // L2
        "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3

        "movq (%0, %2, 4), %%mm0 \n\t" // L4
        "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
        "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
        "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
        "movq (%1, %2, 4), %%mm4 \n\t" // R4
        "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
        "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
        "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1, %2, 4) \n\t" // R4
        "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
        "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
        "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
        "movq %%mm0, (%0, %2, 4) \n\t" // L4
        "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
        "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
        "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
        "jmp 4f \n\t"

        // three PAVGB steps towards the reference per line
        "3: \n\t"
        "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
        "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
        "movq (%0), %%mm0 \n\t" // L0
        "movq (%0, %2), %%mm1 \n\t" // L1
        "movq (%0, %2, 2), %%mm2 \n\t" // L2
        "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
        "movq (%1), %%mm4 \n\t" // R0
        "movq (%1, %2), %%mm5 \n\t" // R1
        "movq (%1, %2, 2), %%mm6 \n\t" // R2
        "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1) \n\t" // R0
        "movq %%mm1, (%1, %2) \n\t" // R1
        "movq %%mm2, (%1, %2, 2) \n\t" // R2
        "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
        "movq %%mm0, (%0) \n\t" // L0
        "movq %%mm1, (%0, %2) \n\t" // L1
        "movq %%mm2, (%0, %2, 2) \n\t" // L2
        "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3

        "movq (%0, %2, 4), %%mm0 \n\t" // L4
        "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
        "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
        "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
        "movq (%1, %2, 4), %%mm4 \n\t" // R4
        "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
        "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
        "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        PAVGB(%%mm4, %%mm0)
        PAVGB(%%mm5, %%mm1)
        PAVGB(%%mm6, %%mm2)
        PAVGB(%%mm7, %%mm3)
        "movq %%mm0, (%1, %2, 4) \n\t" // R4
        "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
        "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
        "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
        "movq %%mm0, (%0, %2, 4) \n\t" // L4
        "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
        "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
        "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7

        "4: \n\t"

        :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
          NAMED_CONSTRAINTS_ADD(b80)
        : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
    );
#else //TEMPLATE_PP_MMXEXT && HAVE_6REGS
{
    int row, col;
    int sqDiff = 0; // sum of squared differences over the whole block
    int noise;      // sqDiff smoothed with the four neighbouring blocks

    for (row = 0; row < 8; row++) {
        const uint8_t *refLine = tempBlurred + row*stride;
        const uint8_t *curLine = src         + row*stride;
        for (col = 0; col < 8; col++) {
            const int delta = refLine[col] - curLine[col];
            sqDiff += delta*delta;
        }
    }

    // the history entries are uint32_t, so this sum is evaluated unsigned
    noise = (4*sqDiff
             + tempBlurredPast[-256]
             + tempBlurredPast[-1] + tempBlurredPast[1]
             + tempBlurredPast[256]
             + 4) >> 3;
    *tempBlurredPast = sqDiff;

/*
Switch between
 1  0  0  0  0  0  0  (0)
64 32 16  8  4  2  1  (1)
64 48 36 27 20 15 11 (33) (approx)
64 56 49 43 37 33 29 (200) (approx)
*/
    if (noise > maxNoise[1]) {
        if (noise < maxNoise[2]) {
            // 1:1 average, written to both blocks
            for (row = 0; row < 8; row++) {
                uint8_t *refLine = tempBlurred + row*stride;
                uint8_t *curLine = src         + row*stride;
                for (col = 0; col < 8; col++) {
                    const int mixed = (refLine[col] + curLine[col] + 1) >> 1;
                    refLine[col] = curLine[col] = mixed;
                }
            }
        } else {
            // too different: restart the reference from the current block
            for (row = 0; row < 8; row++) {
                uint8_t *refLine       = tempBlurred + row*stride;
                const uint8_t *curLine = src         + row*stride;
                for (col = 0; col < 8; col++)
                    refLine[col] = curLine[col];
            }
        }
    } else if (noise < maxNoise[0]) {
        // 7:1 in favour of the reference
        for (row = 0; row < 8; row++) {
            uint8_t *refLine = tempBlurred + row*stride;
            uint8_t *curLine = src         + row*stride;
            for (col = 0; col < 8; col++) {
                const int mixed = (refLine[col]*7 + curLine[col] + 4) >> 3;
                refLine[col] = curLine[col] = mixed;
            }
        }
    } else {
        // 3:1 in favour of the reference
        for (row = 0; row < 8; row++) {
            uint8_t *refLine = tempBlurred + row*stride;
            uint8_t *curLine = src         + row*stride;
            for (col = 0; col < 8; col++) {
                const int mixed = (refLine[col]*3 + curLine[col] + 2) >> 2;
                refLine[col] = curLine[col] = mixed;
            }
        }
    }
}
#endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS
}
2170 | #endif //TEMPLATE_PP_ALTIVEC | ||
2171 | |||
2172 | #if TEMPLATE_PP_MMXEXT | ||
2173 | /** | ||
2174 | * accurate deblock filter | ||
2175 | */ | ||
2176 | ✗ | static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){ | |
2177 | int64_t dc_mask, eq_mask, both_masks; | ||
2178 | int64_t sums[10*8*2]; | ||
2179 | ✗ | src+= step*3; // src points to begin of the 8x8 Block | |
2180 | |||
2181 | ✗ | __asm__ volatile( | |
2182 | "movq %0, %%mm7 \n\t" | ||
2183 | "movq %1, %%mm6 \n\t" | ||
2184 | ✗ | : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) | |
2185 | ); | ||
2186 | |||
2187 | ✗ | __asm__ volatile( | |
2188 | "lea (%2, %3), %%"FF_REG_a" \n\t" | ||
2189 | // 0 1 2 3 4 5 6 7 8 9 | ||
2190 | // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 | ||
2191 | |||
2192 | "movq (%2), %%mm0 \n\t" | ||
2193 | "movq (%%"FF_REG_a"), %%mm1 \n\t" | ||
2194 | "movq %%mm1, %%mm3 \n\t" | ||
2195 | "movq %%mm1, %%mm4 \n\t" | ||
2196 | "psubb %%mm1, %%mm0 \n\t" // mm0 = difference | ||
2197 | "paddb %%mm7, %%mm0 \n\t" | ||
2198 | "pcmpgtb %%mm6, %%mm0 \n\t" | ||
2199 | |||
2200 | "movq (%%"FF_REG_a",%3), %%mm2 \n\t" | ||
2201 | PMAXUB(%%mm2, %%mm4) | ||
2202 | PMINUB(%%mm2, %%mm3, %%mm5) | ||
2203 | "psubb %%mm2, %%mm1 \n\t" | ||
2204 | "paddb %%mm7, %%mm1 \n\t" | ||
2205 | "pcmpgtb %%mm6, %%mm1 \n\t" | ||
2206 | "paddb %%mm1, %%mm0 \n\t" | ||
2207 | |||
2208 | "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t" | ||
2209 | PMAXUB(%%mm1, %%mm4) | ||
2210 | PMINUB(%%mm1, %%mm3, %%mm5) | ||
2211 | "psubb %%mm1, %%mm2 \n\t" | ||
2212 | "paddb %%mm7, %%mm2 \n\t" | ||
2213 | "pcmpgtb %%mm6, %%mm2 \n\t" | ||
2214 | "paddb %%mm2, %%mm0 \n\t" | ||
2215 | |||
2216 | "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t" | ||
2217 | |||
2218 | "movq (%2, %3, 4), %%mm2 \n\t" | ||
2219 | PMAXUB(%%mm2, %%mm4) | ||
2220 | PMINUB(%%mm2, %%mm3, %%mm5) | ||
2221 | "psubb %%mm2, %%mm1 \n\t" | ||
2222 | "paddb %%mm7, %%mm1 \n\t" | ||
2223 | "pcmpgtb %%mm6, %%mm1 \n\t" | ||
2224 | "paddb %%mm1, %%mm0 \n\t" | ||
2225 | |||
2226 | "movq (%%"FF_REG_a"), %%mm1 \n\t" | ||
2227 | PMAXUB(%%mm1, %%mm4) | ||
2228 | PMINUB(%%mm1, %%mm3, %%mm5) | ||
2229 | "psubb %%mm1, %%mm2 \n\t" | ||
2230 | "paddb %%mm7, %%mm2 \n\t" | ||
2231 | "pcmpgtb %%mm6, %%mm2 \n\t" | ||
2232 | "paddb %%mm2, %%mm0 \n\t" | ||
2233 | |||
2234 | "movq (%%"FF_REG_a", %3), %%mm2 \n\t" | ||
2235 | PMAXUB(%%mm2, %%mm4) | ||
2236 | PMINUB(%%mm2, %%mm3, %%mm5) | ||
2237 | "psubb %%mm2, %%mm1 \n\t" | ||
2238 | "paddb %%mm7, %%mm1 \n\t" | ||
2239 | "pcmpgtb %%mm6, %%mm1 \n\t" | ||
2240 | "paddb %%mm1, %%mm0 \n\t" | ||
2241 | |||
2242 | "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t" | ||
2243 | PMAXUB(%%mm1, %%mm4) | ||
2244 | PMINUB(%%mm1, %%mm3, %%mm5) | ||
2245 | "psubb %%mm1, %%mm2 \n\t" | ||
2246 | "paddb %%mm7, %%mm2 \n\t" | ||
2247 | "pcmpgtb %%mm6, %%mm2 \n\t" | ||
2248 | "paddb %%mm2, %%mm0 \n\t" | ||
2249 | |||
2250 | "movq (%2, %3, 8), %%mm2 \n\t" | ||
2251 | PMAXUB(%%mm2, %%mm4) | ||
2252 | PMINUB(%%mm2, %%mm3, %%mm5) | ||
2253 | "psubb %%mm2, %%mm1 \n\t" | ||
2254 | "paddb %%mm7, %%mm1 \n\t" | ||
2255 | "pcmpgtb %%mm6, %%mm1 \n\t" | ||
2256 | "paddb %%mm1, %%mm0 \n\t" | ||
2257 | |||
2258 | "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t" | ||
2259 | "psubb %%mm1, %%mm2 \n\t" | ||
2260 | "paddb %%mm7, %%mm2 \n\t" | ||
2261 | "pcmpgtb %%mm6, %%mm2 \n\t" | ||
2262 | "paddb %%mm2, %%mm0 \n\t" | ||
2263 | "psubusb %%mm3, %%mm4 \n\t" | ||
2264 | |||
2265 | "pxor %%mm6, %%mm6 \n\t" | ||
2266 | "movq %4, %%mm7 \n\t" // QP,..., QP | ||
2267 | "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | ||
2268 | "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 | ||
2269 | "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 | ||
2270 | "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 | ||
2271 | "movq %%mm7, %1 \n\t" | ||
2272 | |||
2273 | "movq %5, %%mm7 \n\t" | ||
2274 | "punpcklbw %%mm7, %%mm7 \n\t" | ||
2275 | "punpcklbw %%mm7, %%mm7 \n\t" | ||
2276 | "punpcklbw %%mm7, %%mm7 \n\t" | ||
2277 | "psubb %%mm0, %%mm6 \n\t" | ||
2278 | "pcmpgtb %%mm7, %%mm6 \n\t" | ||
2279 | "movq %%mm6, %0 \n\t" | ||
2280 | |||
2281 | : "=m" (eq_mask), "=m" (dc_mask) | ||
2282 | ✗ | : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) | |
2283 | : "%"FF_REG_a | ||
2284 | ); | ||
2285 | |||
2286 | ✗ | both_masks = dc_mask & eq_mask; | |
2287 | |||
2288 | ✗ | if(both_masks){ | |
2289 | ✗ | x86_reg offset= -8*step; | |
2290 | ✗ | int64_t *temp_sums= sums; | |
2291 | |||
2292 | ✗ | __asm__ volatile( | |
2293 | "movq %2, %%mm0 \n\t" // QP,..., QP | ||
2294 | "pxor %%mm4, %%mm4 \n\t" | ||
2295 | |||
2296 | "movq (%0), %%mm6 \n\t" | ||
2297 | "movq (%0, %1), %%mm5 \n\t" | ||
2298 | "movq %%mm5, %%mm1 \n\t" | ||
2299 | "movq %%mm6, %%mm2 \n\t" | ||
2300 | "psubusb %%mm6, %%mm5 \n\t" | ||
2301 | "psubusb %%mm1, %%mm2 \n\t" | ||
2302 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | ||
2303 | "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | ||
2304 | "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | ||
2305 | |||
2306 | "pxor %%mm6, %%mm1 \n\t" | ||
2307 | "pand %%mm0, %%mm1 \n\t" | ||
2308 | "pxor %%mm1, %%mm6 \n\t" | ||
2309 | // 0:QP 6:First | ||
2310 | |||
2311 | "movq (%0, %1, 8), %%mm5 \n\t" | ||
2312 | "add %1, %0 \n\t" // %0 points to line 1 not 0 | ||
2313 | "movq (%0, %1, 8), %%mm7 \n\t" | ||
2314 | "movq %%mm5, %%mm1 \n\t" | ||
2315 | "movq %%mm7, %%mm2 \n\t" | ||
2316 | "psubusb %%mm7, %%mm5 \n\t" | ||
2317 | "psubusb %%mm1, %%mm2 \n\t" | ||
2318 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | ||
2319 | "movq %2, %%mm0 \n\t" // QP,..., QP | ||
2320 | "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | ||
2321 | "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | ||
2322 | |||
2323 | "pxor %%mm7, %%mm1 \n\t" | ||
2324 | "pand %%mm0, %%mm1 \n\t" | ||
2325 | "pxor %%mm1, %%mm7 \n\t" | ||
2326 | |||
2327 | "movq %%mm6, %%mm5 \n\t" | ||
2328 | "punpckhbw %%mm4, %%mm6 \n\t" | ||
2329 | "punpcklbw %%mm4, %%mm5 \n\t" | ||
2330 | // 4:0 5/6:First 7:Last | ||
2331 | |||
2332 | "movq %%mm5, %%mm0 \n\t" | ||
2333 | "movq %%mm6, %%mm1 \n\t" | ||
2334 | "psllw $2, %%mm0 \n\t" | ||
2335 | "psllw $2, %%mm1 \n\t" | ||
2336 | "paddw "MANGLE(w04)", %%mm0 \n\t" | ||
2337 | "paddw "MANGLE(w04)", %%mm1 \n\t" | ||
2338 | |||
2339 | #define NEXT\ | ||
2340 | "movq (%0), %%mm2 \n\t"\ | ||
2341 | "movq (%0), %%mm3 \n\t"\ | ||
2342 | "add %1, %0 \n\t"\ | ||
2343 | "punpcklbw %%mm4, %%mm2 \n\t"\ | ||
2344 | "punpckhbw %%mm4, %%mm3 \n\t"\ | ||
2345 | "paddw %%mm2, %%mm0 \n\t"\ | ||
2346 | "paddw %%mm3, %%mm1 \n\t" | ||
2347 | |||
2348 | #define PREV\ | ||
2349 | "movq (%0), %%mm2 \n\t"\ | ||
2350 | "movq (%0), %%mm3 \n\t"\ | ||
2351 | "add %1, %0 \n\t"\ | ||
2352 | "punpcklbw %%mm4, %%mm2 \n\t"\ | ||
2353 | "punpckhbw %%mm4, %%mm3 \n\t"\ | ||
2354 | "psubw %%mm2, %%mm0 \n\t"\ | ||
2355 | "psubw %%mm3, %%mm1 \n\t" | ||
2356 | |||
2357 | |||
2358 | NEXT //0 | ||
2359 | NEXT //1 | ||
2360 | NEXT //2 | ||
2361 | "movq %%mm0, (%3) \n\t" | ||
2362 | "movq %%mm1, 8(%3) \n\t" | ||
2363 | |||
2364 | NEXT //3 | ||
2365 | "psubw %%mm5, %%mm0 \n\t" | ||
2366 | "psubw %%mm6, %%mm1 \n\t" | ||
2367 | "movq %%mm0, 16(%3) \n\t" | ||
2368 | "movq %%mm1, 24(%3) \n\t" | ||
2369 | |||
2370 | NEXT //4 | ||
2371 | "psubw %%mm5, %%mm0 \n\t" | ||
2372 | "psubw %%mm6, %%mm1 \n\t" | ||
2373 | "movq %%mm0, 32(%3) \n\t" | ||
2374 | "movq %%mm1, 40(%3) \n\t" | ||
2375 | |||
2376 | NEXT //5 | ||
2377 | "psubw %%mm5, %%mm0 \n\t" | ||
2378 | "psubw %%mm6, %%mm1 \n\t" | ||
2379 | "movq %%mm0, 48(%3) \n\t" | ||
2380 | "movq %%mm1, 56(%3) \n\t" | ||
2381 | |||
2382 | NEXT //6 | ||
2383 | "psubw %%mm5, %%mm0 \n\t" | ||
2384 | "psubw %%mm6, %%mm1 \n\t" | ||
2385 | "movq %%mm0, 64(%3) \n\t" | ||
2386 | "movq %%mm1, 72(%3) \n\t" | ||
2387 | |||
2388 | "movq %%mm7, %%mm6 \n\t" | ||
2389 | "punpckhbw %%mm4, %%mm7 \n\t" | ||
2390 | "punpcklbw %%mm4, %%mm6 \n\t" | ||
2391 | |||
2392 | NEXT //7 | ||
2393 | "mov %4, %0 \n\t" | ||
2394 | "add %1, %0 \n\t" | ||
2395 | PREV //0 | ||
2396 | "movq %%mm0, 80(%3) \n\t" | ||
2397 | "movq %%mm1, 88(%3) \n\t" | ||
2398 | |||
2399 | PREV //1 | ||
2400 | "paddw %%mm6, %%mm0 \n\t" | ||
2401 | "paddw %%mm7, %%mm1 \n\t" | ||
2402 | "movq %%mm0, 96(%3) \n\t" | ||
2403 | "movq %%mm1, 104(%3) \n\t" | ||
2404 | |||
2405 | PREV //2 | ||
2406 | "paddw %%mm6, %%mm0 \n\t" | ||
2407 | "paddw %%mm7, %%mm1 \n\t" | ||
2408 | "movq %%mm0, 112(%3) \n\t" | ||
2409 | "movq %%mm1, 120(%3) \n\t" | ||
2410 | |||
2411 | PREV //3 | ||
2412 | "paddw %%mm6, %%mm0 \n\t" | ||
2413 | "paddw %%mm7, %%mm1 \n\t" | ||
2414 | "movq %%mm0, 128(%3) \n\t" | ||
2415 | "movq %%mm1, 136(%3) \n\t" | ||
2416 | |||
2417 | PREV //4 | ||
2418 | "paddw %%mm6, %%mm0 \n\t" | ||
2419 | "paddw %%mm7, %%mm1 \n\t" | ||
2420 | "movq %%mm0, 144(%3) \n\t" | ||
2421 | "movq %%mm1, 152(%3) \n\t" | ||
2422 | |||
2423 | "mov %4, %0 \n\t" //FIXME | ||
2424 | |||
2425 | : "+&r"(src) | ||
2426 | ✗ | : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src) | |
2427 | NAMED_CONSTRAINTS_ADD(w04) | ||
2428 | ); | ||
2429 | |||
2430 | ✗ | src+= step; // src points to begin of the 8x8 Block | |
2431 | |||
2432 | ✗ | __asm__ volatile( | |
2433 | "movq %4, %%mm6 \n\t" | ||
2434 | "pcmpeqb %%mm5, %%mm5 \n\t" | ||
2435 | "pxor %%mm6, %%mm5 \n\t" | ||
2436 | "pxor %%mm7, %%mm7 \n\t" | ||
2437 | |||
2438 | "1: \n\t" | ||
2439 | "movq (%1), %%mm0 \n\t" | ||
2440 | "movq 8(%1), %%mm1 \n\t" | ||
2441 | "paddw 32(%1), %%mm0 \n\t" | ||
2442 | "paddw 40(%1), %%mm1 \n\t" | ||
2443 | "movq (%0, %3), %%mm2 \n\t" | ||
2444 | "movq %%mm2, %%mm3 \n\t" | ||
2445 | "movq %%mm2, %%mm4 \n\t" | ||
2446 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2447 | "punpckhbw %%mm7, %%mm3 \n\t" | ||
2448 | "paddw %%mm2, %%mm0 \n\t" | ||
2449 | "paddw %%mm3, %%mm1 \n\t" | ||
2450 | "paddw %%mm2, %%mm0 \n\t" | ||
2451 | "paddw %%mm3, %%mm1 \n\t" | ||
2452 | "psrlw $4, %%mm0 \n\t" | ||
2453 | "psrlw $4, %%mm1 \n\t" | ||
2454 | "packuswb %%mm1, %%mm0 \n\t" | ||
2455 | "pand %%mm6, %%mm0 \n\t" | ||
2456 | "pand %%mm5, %%mm4 \n\t" | ||
2457 | "por %%mm4, %%mm0 \n\t" | ||
2458 | "movq %%mm0, (%0, %3) \n\t" | ||
2459 | "add $16, %1 \n\t" | ||
2460 | "add %2, %0 \n\t" | ||
2461 | " js 1b \n\t" | ||
2462 | |||
2463 | : "+r"(offset), "+r"(temp_sums) | ||
2464 | ✗ | : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks) | |
2465 | ); | ||
2466 | }else | ||
2467 | ✗ | src+= step; // src points to begin of the 8x8 Block | |
2468 | |||
2469 | ✗ | if(eq_mask != -1LL){ | |
2470 | ✗ | uint8_t *temp_src= src; | |
2471 | DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars | ||
2472 | ✗ | __asm__ volatile( | |
2473 | "pxor %%mm7, %%mm7 \n\t" | ||
2474 | // 0 1 2 3 4 5 6 7 8 9 | ||
2475 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 | ||
2476 | |||
2477 | "movq (%0), %%mm0 \n\t" | ||
2478 | "movq %%mm0, %%mm1 \n\t" | ||
2479 | "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 | ||
2480 | "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 | ||
2481 | |||
2482 | "movq (%0, %1), %%mm2 \n\t" | ||
2483 | "lea (%0, %1, 2), %%"FF_REG_a" \n\t" | ||
2484 | "movq %%mm2, %%mm3 \n\t" | ||
2485 | "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 | ||
2486 | "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 | ||
2487 | |||
2488 | "movq (%%"FF_REG_a"), %%mm4 \n\t" | ||
2489 | "movq %%mm4, %%mm5 \n\t" | ||
2490 | "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 | ||
2491 | "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 | ||
2492 | |||
2493 | "paddw %%mm0, %%mm0 \n\t" // 2L0 | ||
2494 | "paddw %%mm1, %%mm1 \n\t" // 2H0 | ||
2495 | "psubw %%mm4, %%mm2 \n\t" // L1 - L2 | ||
2496 | "psubw %%mm5, %%mm3 \n\t" // H1 - H2 | ||
2497 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 | ||
2498 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 | ||
2499 | |||
2500 | "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 | ||
2501 | "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 | ||
2502 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 | ||
2503 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 | ||
2504 | |||
2505 | "movq (%%"FF_REG_a", %1), %%mm2 \n\t" | ||
2506 | "movq %%mm2, %%mm3 \n\t" | ||
2507 | "punpcklbw %%mm7, %%mm2 \n\t" // L3 | ||
2508 | "punpckhbw %%mm7, %%mm3 \n\t" // H3 | ||
2509 | |||
2510 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | ||
2511 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 | ||
2512 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | ||
2513 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | ||
2514 | "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | ||
2515 | "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | ||
2516 | |||
2517 | "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" | ||
2518 | "movq %%mm0, %%mm1 \n\t" | ||
2519 | "punpcklbw %%mm7, %%mm0 \n\t" // L4 | ||
2520 | "punpckhbw %%mm7, %%mm1 \n\t" // H4 | ||
2521 | |||
2522 | "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | ||
2523 | "psubw %%mm1, %%mm3 \n\t" // H3 - H4 | ||
2524 | "movq %%mm2, 16(%4) \n\t" // L3 - L4 | ||
2525 | "movq %%mm3, 24(%4) \n\t" // H3 - H4 | ||
2526 | "paddw %%mm4, %%mm4 \n\t" // 2L2 | ||
2527 | "paddw %%mm5, %%mm5 \n\t" // 2H2 | ||
2528 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | ||
2529 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | ||
2530 | |||
2531 | "lea (%%"FF_REG_a", %1), %0 \n\t" | ||
2532 | "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 | ||
2533 | "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 | ||
2534 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 | ||
2535 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 | ||
2536 | //50 opcodes so far | ||
2537 | "movq (%0, %1, 2), %%mm2 \n\t" | ||
2538 | "movq %%mm2, %%mm3 \n\t" | ||
2539 | "punpcklbw %%mm7, %%mm2 \n\t" // L5 | ||
2540 | "punpckhbw %%mm7, %%mm3 \n\t" // H5 | ||
2541 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 | ||
2542 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 | ||
2543 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 | ||
2544 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 | ||
2545 | |||
2546 | "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" | ||
2547 | "punpcklbw %%mm7, %%mm6 \n\t" // L6 | ||
2548 | "psubw %%mm6, %%mm2 \n\t" // L5 - L6 | ||
2549 | "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" | ||
2550 | "punpckhbw %%mm7, %%mm6 \n\t" // H6 | ||
2551 | "psubw %%mm6, %%mm3 \n\t" // H5 - H6 | ||
2552 | |||
2553 | "paddw %%mm0, %%mm0 \n\t" // 2L4 | ||
2554 | "paddw %%mm1, %%mm1 \n\t" // 2H4 | ||
2555 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 | ||
2556 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 | ||
2557 | |||
2558 | "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 | ||
2559 | "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 | ||
2560 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 | ||
2561 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 | ||
2562 | |||
2563 | "movq (%0, %1, 4), %%mm2 \n\t" | ||
2564 | "movq %%mm2, %%mm3 \n\t" | ||
2565 | "punpcklbw %%mm7, %%mm2 \n\t" // L7 | ||
2566 | "punpckhbw %%mm7, %%mm3 \n\t" // H7 | ||
2567 | |||
2568 | "paddw %%mm2, %%mm2 \n\t" // 2L7 | ||
2569 | "paddw %%mm3, %%mm3 \n\t" // 2H7 | ||
2570 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 | ||
2571 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | ||
2572 | |||
2573 | "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | ||
2574 | "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | ||
2575 | |||
2576 | "movq %%mm7, %%mm6 \n\t" // 0 | ||
2577 | "psubw %%mm0, %%mm6 \n\t" | ||
2578 | "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | ||
2579 | "movq %%mm7, %%mm6 \n\t" // 0 | ||
2580 | "psubw %%mm1, %%mm6 \n\t" | ||
2581 | "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | ||
2582 | "movq %%mm7, %%mm6 \n\t" // 0 | ||
2583 | "psubw %%mm2, %%mm6 \n\t" | ||
2584 | "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | ||
2585 | "movq %%mm7, %%mm6 \n\t" // 0 | ||
2586 | "psubw %%mm3, %%mm6 \n\t" | ||
2587 | "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | ||
2588 | |||
2589 | "pminsw %%mm2, %%mm0 \n\t" | ||
2590 | "pminsw %%mm3, %%mm1 \n\t" | ||
2591 | |||
2592 | "movd %2, %%mm2 \n\t" // QP | ||
2593 | "punpcklbw %%mm7, %%mm2 \n\t" | ||
2594 | |||
2595 | "movq %%mm7, %%mm6 \n\t" // 0 | ||
2596 | "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) | ||
2597 | "pxor %%mm6, %%mm4 \n\t" | ||
2598 | "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| | ||
2599 | "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | ||
2600 | "pxor %%mm7, %%mm5 \n\t" | ||
2601 | "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | ||
2602 | // 100 opcodes | ||
2603 | "psllw $3, %%mm2 \n\t" // 8QP | ||
2604 | "movq %%mm2, %%mm3 \n\t" // 8QP | ||
2605 | "pcmpgtw %%mm4, %%mm2 \n\t" | ||
2606 | "pcmpgtw %%mm5, %%mm3 \n\t" | ||
2607 | "pand %%mm2, %%mm4 \n\t" | ||
2608 | "pand %%mm3, %%mm5 \n\t" | ||
2609 | |||
2610 | |||
2611 | "psubusw %%mm0, %%mm4 \n\t" // hd | ||
2612 | "psubusw %%mm1, %%mm5 \n\t" // ld | ||
2613 | |||
2614 | |||
2615 | "movq "MANGLE(w05)", %%mm2 \n\t" // 5 | ||
2616 | "pmullw %%mm2, %%mm4 \n\t" | ||
2617 | "pmullw %%mm2, %%mm5 \n\t" | ||
2618 | "movq "MANGLE(w20)", %%mm2 \n\t" // 32 | ||
2619 | "paddw %%mm2, %%mm4 \n\t" | ||
2620 | "paddw %%mm2, %%mm5 \n\t" | ||
2621 | "psrlw $6, %%mm4 \n\t" | ||
2622 | "psrlw $6, %%mm5 \n\t" | ||
2623 | |||
2624 | "movq 16(%4), %%mm0 \n\t" // L3 - L4 | ||
2625 | "movq 24(%4), %%mm1 \n\t" // H3 - H4 | ||
2626 | |||
2627 | "pxor %%mm2, %%mm2 \n\t" | ||
2628 | "pxor %%mm3, %%mm3 \n\t" | ||
2629 | |||
2630 | "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) | ||
2631 | "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) | ||
2632 | "pxor %%mm2, %%mm0 \n\t" | ||
2633 | "pxor %%mm3, %%mm1 \n\t" | ||
2634 | "psubw %%mm2, %%mm0 \n\t" // |L3-L4| | ||
2635 | "psubw %%mm3, %%mm1 \n\t" // |H3-H4| | ||
2636 | "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 | ||
2637 | "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 | ||
2638 | |||
2639 | "pxor %%mm6, %%mm2 \n\t" | ||
2640 | "pxor %%mm7, %%mm3 \n\t" | ||
2641 | "pand %%mm2, %%mm4 \n\t" | ||
2642 | "pand %%mm3, %%mm5 \n\t" | ||
2643 | |||
2644 | "pminsw %%mm0, %%mm4 \n\t" | ||
2645 | "pminsw %%mm1, %%mm5 \n\t" | ||
2646 | "pxor %%mm6, %%mm4 \n\t" | ||
2647 | "pxor %%mm7, %%mm5 \n\t" | ||
2648 | "psubw %%mm6, %%mm4 \n\t" | ||
2649 | "psubw %%mm7, %%mm5 \n\t" | ||
2650 | "packsswb %%mm5, %%mm4 \n\t" | ||
2651 | "movq %3, %%mm1 \n\t" | ||
2652 | "pandn %%mm4, %%mm1 \n\t" | ||
2653 | "movq (%0), %%mm0 \n\t" | ||
2654 | "paddb %%mm1, %%mm0 \n\t" | ||
2655 | "movq %%mm0, (%0) \n\t" | ||
2656 | "movq (%0, %1), %%mm0 \n\t" | ||
2657 | "psubb %%mm1, %%mm0 \n\t" | ||
2658 | "movq %%mm0, (%0, %1) \n\t" | ||
2659 | |||
2660 | : "+r" (temp_src) | ||
2661 | ✗ | : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp) | |
2662 | NAMED_CONSTRAINTS_ADD(w05,w20) | ||
2663 | : "%"FF_REG_a | ||
2664 | ); | ||
2665 | } | ||
2666 | ✗ | } | |
2667 | #endif //TEMPLATE_PP_MMX | ||
2668 | |||
/* Forward declaration of the per-plane filter routine; its definition appears further down in this template. */
2669 | static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | ||
2670 | const int8_t QPs[], int QPStride, int isColor, PPContext *c); | ||
2671 | |||
2672 | /** | ||
2673 | * Copy an 8-row block from src to dst, optionally fixing the black level. | ||
2674 | * levelFix == 0 -> do not touch the brightness & contrast (plain copy) | ||
 * levelFix != 0 -> with TEMPLATE_PP_MMXEXT && HAVE_6REGS every sample is
 *                  multiplied by the packed scale and the packed offset is
 *                  subtracted, saturating to 0..255; builds without that path
 *                  perform the same plain copy as levelFix == 0.
 *
 * The asm paths move 8 bytes per row (movq); the C paths copy BLOCK_SIZE
 * bytes per row with memcpy.
 *
 * @param dst                  first byte of the destination block
 * @param dstStride            byte distance between destination rows
 * @param src                  first byte of the source block
 * @param srcStride            byte distance between source rows
 * @param levelFix             non-zero requests the scaled copy
 * @param packedOffsetAndScale the scaled asm path reads the packed offset at
 *                             byte 0 and the packed scale at byte 8; the other
 *                             paths do not use it
2675 | */ | ||
/* presumably these are dropped so that the definitions below do not clash with ones left by an earlier inclusion of this template -- TODO confirm */
2676 | #undef REAL_SCALED_CPY | ||
2677 | #undef SCALED_CPY | ||
2678 | |||
2679 | 197120 | static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, | |
2680 | int levelFix, int64_t *packedOffsetAndScale) | ||
2681 | { | ||
2682 |
2/2✓ Branch 0 taken 24420 times.
✓ Branch 1 taken 74140 times.
|
197120 | if(levelFix){ |
2683 | #if TEMPLATE_PP_MMXEXT && HAVE_6REGS | ||
/* REG_a initially holds packedOffsetAndScale (constraint "a"); once the offset
 * and scale are loaded into mm2/mm3 it is reused as src + srcStride, and REG_d
 * becomes dst + dstStride, so each SCALED_CPY handles two consecutive rows.
 * Per sample: the byte b is widened to the word b*0x0101 (punpck with itself),
 * multiplied by the scale keeping the high 16 bits (pmulhuw), reduced by the
 * offset (psubw) and packed back to a byte with unsigned saturation (packuswb). */
2684 | ✗ | __asm__ volatile( | |
2685 | "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset | ||
2686 | "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale | ||
2687 | "lea (%2,%4), %%"FF_REG_a" \n\t" | ||
2688 | "lea (%3,%5), %%"FF_REG_d" \n\t" | ||
2689 | "pxor %%mm4, %%mm4 \n\t" | ||
2690 | #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ | ||
2691 | "movq " #src1 ", %%mm0 \n\t"\ | ||
2692 | "movq " #src1 ", %%mm5 \n\t"\ | ||
2693 | "movq " #src2 ", %%mm1 \n\t"\ | ||
2694 | "movq " #src2 ", %%mm6 \n\t"\ | ||
2695 | "punpcklbw %%mm0, %%mm0 \n\t"\ | ||
2696 | "punpckhbw %%mm5, %%mm5 \n\t"\ | ||
2697 | "punpcklbw %%mm1, %%mm1 \n\t"\ | ||
2698 | "punpckhbw %%mm6, %%mm6 \n\t"\ | ||
2699 | "pmulhuw %%mm3, %%mm0 \n\t"\ | ||
2700 | "pmulhuw %%mm3, %%mm5 \n\t"\ | ||
2701 | "pmulhuw %%mm3, %%mm1 \n\t"\ | ||
2702 | "pmulhuw %%mm3, %%mm6 \n\t"\ | ||
2703 | "psubw %%mm2, %%mm0 \n\t"\ | ||
2704 | "psubw %%mm2, %%mm5 \n\t"\ | ||
2705 | "psubw %%mm2, %%mm1 \n\t"\ | ||
2706 | "psubw %%mm2, %%mm6 \n\t"\ | ||
2707 | "packuswb %%mm5, %%mm0 \n\t"\ | ||
2708 | "packuswb %%mm6, %%mm1 \n\t"\ | ||
2709 | "movq %%mm0, " #dst1 " \n\t"\ | ||
2710 | "movq %%mm1, " #dst2 " \n\t"\ | ||
2711 | |||
2712 | #define SCALED_CPY(src1, src2, dst1, dst2)\ | ||
2713 | REAL_SCALED_CPY(src1, src2, dst1, dst2) | ||
2714 | |||
2715 | SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) | ||
2716 | SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2)) | ||
2717 | SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4)) | ||
2718 | "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t" | ||
2719 | "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t" | ||
2720 | SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2)) | ||
2721 | |||
2722 | |||
2723 | : "=&a" (packedOffsetAndScale) | ||
2724 | : "0" (packedOffsetAndScale), | ||
2725 | "r"(src), | ||
2726 | "r"(dst), | ||
2727 | ✗ | "r" ((x86_reg)srcStride), | |
2728 | ✗ | "r" ((x86_reg)dstStride) | |
2729 | : "%"FF_REG_d | ||
2730 | ); | ||
2731 | #else //TEMPLATE_PP_MMXEXT && HAVE_6REGS | ||
/* No scaled copy exists outside the MMXEXT path: the block is copied unchanged even though levelFix is set. */
2732 |
2/2✓ Branch 0 taken 195360 times.
✓ Branch 1 taken 24420 times.
|
439560 | for (int i = 0; i < 8; i++) |
2733 | 390720 | memcpy( &(dst[dstStride*i]), | |
2734 | 390720 | &(src[srcStride*i]), BLOCK_SIZE); | |
2735 | #endif //TEMPLATE_PP_MMXEXT && HAVE_6REGS | ||
2736 | }else{ | ||
2737 | #if TEMPLATE_PP_MMX && HAVE_6REGS | ||
/* Plain copy: REG_a = src + srcStride and REG_d = dst + dstStride, each SIMPLE_CPY moves two rows of 8 bytes through mm0/mm1. */
2738 | ✗ | __asm__ volatile( | |
2739 | "lea (%0,%2), %%"FF_REG_a" \n\t" | ||
2740 | "lea (%1,%3), %%"FF_REG_d" \n\t" | ||
2741 | |||
2742 | #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ | ||
2743 | "movq " #src1 ", %%mm0 \n\t"\ | ||
2744 | "movq " #src2 ", %%mm1 \n\t"\ | ||
2745 | "movq %%mm0, " #dst1 " \n\t"\ | ||
2746 | "movq %%mm1, " #dst2 " \n\t"\ | ||
2747 | |||
2748 | #define SIMPLE_CPY(src1, src2, dst1, dst2)\ | ||
2749 | REAL_SIMPLE_CPY(src1, src2, dst1, dst2) | ||
2750 | |||
2751 | SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | ||
2752 | SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2)) | ||
2753 | SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4)) | ||
2754 | "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t" | ||
2755 | "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t" | ||
2756 | SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2)) | ||
2757 | |||
2758 | : : "r" (src), | ||
2759 | "r" (dst), | ||
2760 | ✗ | "r" ((x86_reg)srcStride), | |
2761 | ✗ | "r" ((x86_reg)dstStride) | |
2762 | : "%"FF_REG_a, "%"FF_REG_d | ||
2763 | ); | ||
2764 | #else //TEMPLATE_PP_MMX && HAVE_6REGS | ||
2765 |
2/2✓ Branch 0 taken 593120 times.
✓ Branch 1 taken 74140 times.
|
1334520 | for (int i = 0; i < 8; i++) |
2766 | 1186240 | memcpy( &(dst[dstStride*i]), | |
2767 | 1186240 | &(src[srcStride*i]), BLOCK_SIZE); | |
2768 | #endif //TEMPLATE_PP_MMX && HAVE_6REGS | ||
2769 | } | ||
2770 | 197120 | } | |
2771 | |||
2772 | /** | ||
2773 | * Duplicate the given 8 src pixels 5 times upward | ||
 *
 * The 8 bytes at src are written to the five rows src - stride,
 * src - 2*stride, ... src - 5*stride; the row at src itself is only read.
 *
 * @param src    first of the 8 bytes to replicate
 * @param stride byte distance between rows
2774 | */ | ||
2775 | 7040 | static inline void RENAME(duplicate)(uint8_t src[], int stride) | |
2776 | { | ||
2777 | #if TEMPLATE_PP_MMX | ||
/* %1 is -stride. The first store hits src - 4*stride; after "add" moves %0 to
 * src - stride, the remaining stores hit rows -1, -2, -3 and -5. */
2778 | ✗ | __asm__ volatile( | |
2779 | "movq (%0), %%mm0 \n\t" | ||
2780 | "movq %%mm0, (%0, %1, 4) \n\t" | ||
2781 | "add %1, %0 \n\t" | ||
2782 | "movq %%mm0, (%0) \n\t" | ||
2783 | "movq %%mm0, (%0, %1) \n\t" | ||
2784 | "movq %%mm0, (%0, %1, 2) \n\t" | ||
2785 | "movq %%mm0, (%0, %1, 4) \n\t" | ||
2786 | : "+r" (src) | ||
2787 | ✗ | : "r" ((x86_reg)-stride) | |
2788 | ); | ||
2789 | #else | ||
/* Portable path: step one row up per iteration and copy the original 8 bytes there. */
2790 | int i; | ||
2791 | 7040 | uint8_t *p=src; | |
2792 |
2/2✓ Branch 0 taken 17600 times.
✓ Branch 1 taken 3520 times.
|
42240 | for(i=0; i<5; i++){ |
2793 | 35200 | p-= stride; | |
2794 | 35200 | memcpy(p, src, 8); | |
2795 | } | ||
2796 | #endif | ||
2797 | 7040 | } | |
2798 | |||
/*
 * Cache prefetch hints. RENAME(prefetchnta), RENAME(prefetcht0),
 * RENAME(prefetcht1) and RENAME(prefetcht2) each take one address and return
 * nothing. One of three implementations is selected at compile time:
 *  - ARCH_X86 && TEMPLATE_PP_MMXEXT: the x86 instruction of the same name
 *  - !ARCH_X86 with AV_GCC_VERSION_AT_LEAST(3,2): __builtin_prefetch(p, 0, locality)
 *  - everything else (including x86 templates without MMXEXT): empty functions
 */
2799 | #if ARCH_X86 && TEMPLATE_PP_MMXEXT | ||
2800 | ✗ | static inline void RENAME(prefetchnta)(const void *p) | |
2801 | { | ||
2802 | ✗ | __asm__ volatile( "prefetchnta (%0)\n\t" | |
2803 | : : "r" (p) | ||
2804 | ); | ||
2805 | ✗ | } | |
2806 | |||
2807 | ✗ | static inline void RENAME(prefetcht0)(const void *p) | |
2808 | { | ||
2809 | ✗ | __asm__ volatile( "prefetcht0 (%0)\n\t" | |
2810 | : : "r" (p) | ||
2811 | ); | ||
2812 | ✗ | } | |
2813 | |||
2814 | static inline void RENAME(prefetcht1)(const void *p) | ||
2815 | { | ||
2816 | __asm__ volatile( "prefetcht1 (%0)\n\t" | ||
2817 | : : "r" (p) | ||
2818 | ); | ||
2819 | } | ||
2820 | |||
2821 | static inline void RENAME(prefetcht2)(const void *p) | ||
2822 | { | ||
2823 | __asm__ volatile( "prefetcht2 (%0)\n\t" | ||
2824 | : : "r" (p) | ||
2825 | ); | ||
2826 | } | ||
2827 | #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2) | ||
/* Compiler-builtin variant: second argument 0 requests a read prefetch, the
 * third is the locality hint (nta -> 0, t0 -> 1, t1 -> 2, t2 -> 3).
 * NOTE(review): GCC documents locality 3 as the highest temporal locality,
 * which would correspond to prefetcht0 rather than prefetcht2, so the
 * t0/t2 values look swapped -- confirm before changing. */
2828 | static inline void RENAME(prefetchnta)(const void *p) | ||
2829 | { | ||
2830 | __builtin_prefetch(p,0,0); | ||
2831 | } | ||
2832 | static inline void RENAME(prefetcht0)(const void *p) | ||
2833 | { | ||
2834 | __builtin_prefetch(p,0,1); | ||
2835 | } | ||
2836 | static inline void RENAME(prefetcht1)(const void *p) | ||
2837 | { | ||
2838 | __builtin_prefetch(p,0,2); | ||
2839 | } | ||
2840 | static inline void RENAME(prefetcht2)(const void *p) | ||
2841 | { | ||
2842 | __builtin_prefetch(p,0,3); | ||
2843 | } | ||
2844 | #else | ||
/* No prefetch support available: the helpers do nothing. */
2845 | 197120 | static inline void RENAME(prefetchnta)(const void *p) | |
2846 | { | ||
2847 | 197120 | return; | |
2848 | } | ||
2849 | 197120 | static inline void RENAME(prefetcht0)(const void *p) | |
2850 | { | ||
2851 | 197120 | return; | |
2852 | } | ||
2853 | static inline void RENAME(prefetcht1)(const void *p) | ||
2854 | { | ||
2855 | return; | ||
2856 | } | ||
2857 | static inline void RENAME(prefetcht2)(const void *p) | ||
2858 | { | ||
2859 | return; | ||
2860 | } | ||
2861 | #endif | ||
2862 | /** | ||
2863 | * Filter array of bytes (Y or U or V values) | ||
2864 | */ | ||
2865 | 240 | static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | |
2866 | const int8_t QPs[], int QPStride, int isColor, PPContext *c) | ||
2867 | { | ||
2868 | int x,y; | ||
2869 | #ifdef TEMPLATE_PP_TIME_MODE | ||
2870 | const int mode= TEMPLATE_PP_TIME_MODE; | ||
2871 | #else | ||
2872 |
2/2✓ Branch 0 taken 80 times.
✓ Branch 1 taken 40 times.
|
240 | const int mode = isColor ? c->ppMode.chromMode : c->ppMode.lumMode; |
2873 | #endif | ||
2874 | 240 | int black=0, white=255; // blackest black and whitest white in the picture | |
2875 | 240 | int QPCorrecture= 256*256; | |
2876 | |||
2877 | int copyAhead; | ||
2878 | #if TEMPLATE_PP_MMX | ||
2879 | int i; | ||
2880 | #endif | ||
2881 | |||
2882 |
2/2✓ Branch 0 taken 80 times.
✓ Branch 1 taken 40 times.
|
240 | const int qpHShift = isColor ? 4 - c->hChromaSubSample : 4; |
2883 |
2/2✓ Branch 0 taken 80 times.
✓ Branch 1 taken 40 times.
|
240 | const int qpVShift = isColor ? 4 - c->vChromaSubSample : 4; |
2884 | |||
2885 | //FIXME remove | ||
2886 | 240 | uint64_t * const yHistogram= c->yHistogram; | |
2887 |
1/2✓ Branch 0 taken 120 times.
✗ Branch 1 not taken.
|
240 | uint8_t * const tempSrc = srcStride > 0 ? c->tempSrc : c->tempSrc - 23*srcStride; |
2888 |
1/2✓ Branch 0 taken 120 times.
✗ Branch 1 not taken.
|
240 | uint8_t * const tempDst = (dstStride > 0 ? c->tempDst : c->tempDst - 23*dstStride) + 32; |
2889 | //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; | ||
2890 | |||
2891 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 120 times.
|
240 | if (mode & VISUALIZE){ |
2892 | ✗ | if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) { | |
2893 | ✗ | av_log(c, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n"); | |
2894 | } | ||
2895 | } | ||
2896 | |||
2897 | #if TEMPLATE_PP_MMX | ||
2898 | ✗ | for(i=0; i<57; i++){ | |
2899 | ✗ | int offset = ((i * c->ppMode.baseDcDiff) >> 8) + 1; | |
2900 | ✗ | int threshold= offset*2 + 1; | |
2901 | ✗ | c->mmxDcOffset[i] = 0x7F - offset; | |
2902 | ✗ | c->mmxDcThreshold[i] = 0x7F - threshold; | |
2903 | ✗ | c->mmxDcOffset[i] *= 0x0101010101010101LL; | |
2904 | ✗ | c->mmxDcThreshold[i] *= 0x0101010101010101LL; | |
2905 | } | ||
2906 | #endif | ||
2907 | |||
2908 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 105 times.
|
240 | if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
2909 |
2/2✓ Branch 0 taken 90 times.
✓ Branch 1 taken 15 times.
|
210 | else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
2910 |
2/2✓ Branch 0 taken 75 times.
✓ Branch 1 taken 15 times.
|
180 | || (mode & FFMPEG_DEINT_FILTER) |
2911 |
2/2✓ Branch 0 taken 45 times.
✓ Branch 1 taken 30 times.
|
210 | || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; |
2912 |
1/2✓ Branch 0 taken 30 times.
✗ Branch 1 not taken.
|
60 | else if( (mode & V_DEBLOCK) |
2913 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 15 times.
|
60 | || (mode & LINEAR_IPOL_DEINT_FILTER) |
2914 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 15 times.
|
30 | || (mode & MEDIAN_DEINT_FILTER) |
2915 |
0/2✗ Branch 0 not taken.
✗ Branch 1 not taken.
|
60 | || (mode & V_A_DEBLOCK)) copyAhead=13; |
2916 | ✗ | else if(mode & V_X1_FILTER) copyAhead=11; | |
2917 | // else if(mode & V_RK1_FILTER) copyAhead=10; | ||
2918 | ✗ | else if(mode & DERING) copyAhead=9; | |
2919 | ✗ | else copyAhead=8; | |
2920 | |||
2921 | 240 | copyAhead-= 8; | |
2922 | |||
2923 |
2/2✓ Branch 0 taken 40 times.
✓ Branch 1 taken 80 times.
|
240 | if(!isColor){ |
2924 | 80 | uint64_t sum= 0; | |
2925 | int i; | ||
2926 | uint64_t maxClipped; | ||
2927 | uint64_t clipped; | ||
2928 | AVRational scale; | ||
2929 | |||
2930 | 80 | c->frameNum++; | |
2931 | // first frame is fscked so we ignore it | ||
2932 |
2/2✓ Branch 0 taken 8 times.
✓ Branch 1 taken 32 times.
|
80 | if (c->frameNum == 1) |
2933 | 16 | yHistogram[0] = width * (uint64_t)height/64*15/256; | |
2934 | |||
2935 |
2/2✓ Branch 0 taken 10240 times.
✓ Branch 1 taken 40 times.
|
20560 | for(i=0; i<256; i++){ |
2936 | 20480 | sum+= yHistogram[i]; | |
2937 | } | ||
2938 | |||
2939 | /* We always get a completely black picture first. */ | ||
2940 | 80 | maxClipped = av_rescale(sum, c->ppMode.maxClippedThreshold.num, | |
2941 | 80 | c->ppMode.maxClippedThreshold.den); | |
2942 | |||
2943 | 80 | clipped= sum; | |
2944 |
1/2✓ Branch 0 taken 10184 times.
✗ Branch 1 not taken.
|
20368 | for(black=255; black>0; black--){ |
2945 |
2/2✓ Branch 0 taken 40 times.
✓ Branch 1 taken 10144 times.
|
20368 | if(clipped < maxClipped) break; |
2946 | 20288 | clipped-= yHistogram[black]; | |
2947 | } | ||
2948 | |||
2949 | 80 | clipped= sum; | |
2950 |
1/2✓ Branch 0 taken 10184 times.
✗ Branch 1 not taken.
|
20368 | for(white=0; white<256; white++){ |
2951 |
2/2✓ Branch 0 taken 40 times.
✓ Branch 1 taken 10144 times.
|
20368 | if(clipped < maxClipped) break; |
2952 | 20288 | clipped-= yHistogram[white]; | |
2953 | } | ||
2954 | |||
2955 | 80 | scale = (AVRational){c->ppMode.maxAllowedY - c->ppMode.minAllowedY, white - black}; | |
2956 | |||
2957 | #if TEMPLATE_PP_MMXEXT | ||
2958 | ✗ | c->packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den); | |
2959 | ✗ | c->packedYOffset = (((black*c->packedYScale)>>8) - c->ppMode.minAllowedY) & 0xFFFF; | |
2960 | #else | ||
2961 | 80 | c->packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den); | |
2962 | 80 | c->packedYOffset = (black - c->ppMode.minAllowedY) & 0xFFFF; | |
2963 | #endif | ||
2964 | |||
2965 | 80 | c->packedYOffset |= c->packedYOffset<<32; | |
2966 | 80 | c->packedYOffset |= c->packedYOffset<<16; | |
2967 | |||
2968 | 80 | c->packedYScale |= c->packedYScale<<32; | |
2969 | 80 | c->packedYScale |= c->packedYScale<<16; | |
2970 | |||
2971 |
2/2✓ Branch 0 taken 15 times.
✓ Branch 1 taken 25 times.
|
80 | if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den); |
2972 | 50 | else QPCorrecture= 256*256; | |
2973 | }else{ | ||
2974 | 160 | c->packedYScale = 0x0100010001000100LL; | |
2975 | 160 | c->packedYOffset = 0; | |
2976 | 160 | QPCorrecture= 256*256; | |
2977 | } | ||
2978 | |||
2979 | /* copy & deinterlace first row of blocks */ | ||
2980 | 240 | y=-BLOCK_SIZE; | |
2981 | { | ||
2982 | 240 | const uint8_t *srcBlock= &(src[y*srcStride]); | |
2983 | 240 | uint8_t *dstBlock= tempDst + dstStride; | |
2984 | |||
2985 | // From this point on it is guaranteed that we can read and write 16 lines downward | ||
2986 | // finish 1 block before the next otherwise we might have a problem | ||
2987 | // with the L1 Cache of the P4 ... or only a few blocks at a time or something | ||
2988 |
2/2✓ Branch 0 taken 3520 times.
✓ Branch 1 taken 120 times.
|
7280 | for(x=0; x<width; x+=BLOCK_SIZE){ |
2989 | 7040 | RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); | |
2990 | 7040 | RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); | |
2991 | 7040 | RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); | |
2992 | 7040 | RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); | |
2993 | |||
2994 | 7040 | RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, | |
2995 | 7040 | srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c->packedYOffset); | |
2996 | |||
2997 | 7040 | RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
2998 | |||
2999 |
2/2✓ Branch 0 taken 440 times.
✓ Branch 1 taken 3080 times.
|
7040 | if(mode & LINEAR_IPOL_DEINT_FILTER) |
3000 | 880 | RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); | |
3001 |
2/2✓ Branch 0 taken 440 times.
✓ Branch 1 taken 2640 times.
|
6160 | else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3002 | 880 | RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x); | |
3003 |
2/2✓ Branch 0 taken 440 times.
✓ Branch 1 taken 2200 times.
|
5280 | else if(mode & MEDIAN_DEINT_FILTER) |
3004 | 880 | RENAME(deInterlaceMedian)(dstBlock, dstStride); | |
3005 |
2/2✓ Branch 0 taken 440 times.
✓ Branch 1 taken 1760 times.
|
4400 | else if(mode & CUBIC_IPOL_DEINT_FILTER) |
3006 | 880 | RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); | |
3007 |
2/2✓ Branch 0 taken 440 times.
✓ Branch 1 taken 1320 times.
|
3520 | else if(mode & FFMPEG_DEINT_FILTER) |
3008 | 880 | RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x); | |
3009 |
1/2✓ Branch 0 taken 1320 times.
✗ Branch 1 not taken.
|
2640 | else if(mode & LOWPASS5_DEINT_FILTER) |
3010 | 2640 | RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x); | |
3011 | /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | ||
3012 | RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); | ||
3013 | */ | ||
3014 | 7040 | dstBlock+=8; | |
3015 | 7040 | srcBlock+=8; | |
3016 | } | ||
3017 |
1/2✓ Branch 0 taken 120 times.
✗ Branch 1 not taken.
|
240 | if(width==FFABS(dstStride)) |
3018 | 240 | linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride); | |
3019 | else{ | ||
3020 | int i; | ||
3021 | ✗ | for(i=0; i<copyAhead; i++){ | |
3022 | ✗ | memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); | |
3023 | } | ||
3024 | } | ||
3025 | } | ||
3026 | |||
3027 |
2/2✓ Branch 0 taken 2880 times.
✓ Branch 1 taken 120 times.
|
6000 | for(y=0; y<height; y+=BLOCK_SIZE){ |
3028 | //1% speedup if these are here instead of the inner loop | ||
3029 | 5760 | const uint8_t *srcBlock= &(src[y*srcStride]); | |
3030 | 5760 | uint8_t *dstBlock= &(dst[y*dstStride]); | |
3031 | #if TEMPLATE_PP_MMX | ||
3032 | ✗ | uint8_t *tempBlock1 = c->tempBlocks; | |
3033 | ✗ | uint8_t *tempBlock2 = c->tempBlocks + 8; | |
3034 | #endif | ||
3035 | 5760 | const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; | |
3036 | 5760 | int8_t *nonBQPptr = &c->nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; | |
3037 | 5760 | int QP=0, nonBQP=0; | |
3038 | /* Can we touch an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards? | ||
3039 | If not, then use a temporary buffer. */ | ||
3040 |
2/2✓ Branch 0 taken 120 times.
✓ Branch 1 taken 2760 times.
|
5760 | if(y+15 >= height){ |
3041 | int i; | ||
3042 | /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with | ||
3043 | blockcopy to dst later */ | ||
3044 | 240 | linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, | |
3045 | 240 | FFMAX(height-y-copyAhead, 0), srcStride); | |
3046 | |||
3047 | /* duplicate last line of src to fill the void up to line (copyAhead+7) */ | ||
3048 |
2/2✓ Branch 0 taken 720 times.
✓ Branch 1 taken 120 times.
|
1680 | for(i=FFMAX(height-y, 8); i<copyAhead+8; i++) |
3049 | 1440 | memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride)); | |
3050 | |||
3051 | /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ | ||
3052 |
2/2✓ Branch 0 taken 105 times.
✓ Branch 1 taken 15 times.
|
240 | linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride); |
3053 | |||
3054 | /* duplicate last line of dst to fill the void up to line (copyAhead) */ | ||
3055 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 120 times.
|
240 | for(i=height-y+1; i<=copyAhead; i++) |
3056 | ✗ | memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride)); | |
3057 | |||
3058 | 240 | dstBlock= tempDst + dstStride; | |
3059 | 240 | srcBlock= tempSrc; | |
3060 | } | ||
3061 | |||
3062 | // From this point on it is guaranteed that we can read and write 16 lines downward | ||
3063 | // finish 1 block before the next otherwise we might have a problem | ||
3064 | // with the L1 Cache of the P4 ... or only a few blocks at a time or something | ||
3065 |
2/2✓ Branch 0 taken 24480 times.
✓ Branch 1 taken 2880 times.
|
54720 | for(x=0; x<width; ){ |
3066 | 48960 | int startx = x; | |
3067 | 48960 | int endx = FFMIN(width, x+32); | |
3068 | 48960 | uint8_t *dstBlockStart = dstBlock; | |
3069 | 48960 | const uint8_t *srcBlockStart = srcBlock; | |
3070 | 48960 | int qp_index = 0; | |
3071 |
2/2✓ Branch 0 taken 95040 times.
✓ Branch 1 taken 24480 times.
|
239040 | for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){ |
3072 | 190080 | QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift]; | |
3073 | 190080 | nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift]; | |
3074 |
2/2✓ Branch 0 taken 63360 times.
✓ Branch 1 taken 31680 times.
|
190080 | if(!isColor){ |
3075 | 126720 | QP= (QP* QPCorrecture + 256*128)>>16; | |
3076 | 126720 | nonBQP= (nonBQP* QPCorrecture + 256*128)>>16; | |
3077 | 126720 | yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++; | |
3078 | } | ||
3079 | 190080 | c->QP_block[qp_index] = QP; | |
3080 | 190080 | c->nonBQP_block[qp_index] = nonBQP; | |
3081 | #if TEMPLATE_PP_MMX | ||
3082 | ✗ | __asm__ volatile( | |
3083 | "movd %1, %%mm7 \n\t" | ||
3084 | "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | ||
3085 | "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | ||
3086 | "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | ||
3087 | "movq %%mm7, %0 \n\t" | ||
3088 | : "=m" (c->pQPb_block[qp_index]) | ||
3089 | : "r" (QP) | ||
3090 | ); | ||
3091 | #endif | ||
3092 | } | ||
3093 |
2/2✓ Branch 0 taken 95040 times.
✓ Branch 1 taken 24480 times.
|
239040 | for(; x < endx; x+=BLOCK_SIZE){ |
3094 | 190080 | RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32); | |
3095 | 190080 | RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32); | |
3096 | 190080 | RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32); | |
3097 | 190080 | RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32); | |
3098 | |||
3099 | 190080 | RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, | |
3100 | 190080 | srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c->packedYOffset); | |
3101 | |||
3102 |
2/2✓ Branch 0 taken 11880 times.
✓ Branch 1 taken 83160 times.
|
190080 | if(mode & LINEAR_IPOL_DEINT_FILTER) |
3103 | 23760 | RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); | |
3104 |
2/2✓ Branch 0 taken 11880 times.
✓ Branch 1 taken 71280 times.
|
166320 | else if(mode & LINEAR_BLEND_DEINT_FILTER) |
3105 | 23760 | RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c->deintTemp + x); | |
3106 |
2/2✓ Branch 0 taken 11880 times.
✓ Branch 1 taken 59400 times.
|
142560 | else if(mode & MEDIAN_DEINT_FILTER) |
3107 | 23760 | RENAME(deInterlaceMedian)(dstBlock, dstStride); | |
3108 |
2/2✓ Branch 0 taken 11880 times.
✓ Branch 1 taken 47520 times.
|
118800 | else if(mode & CUBIC_IPOL_DEINT_FILTER) |
3109 | 23760 | RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); | |
3110 |
2/2✓ Branch 0 taken 11880 times.
✓ Branch 1 taken 35640 times.
|
95040 | else if(mode & FFMPEG_DEINT_FILTER) |
3111 | 23760 | RENAME(deInterlaceFF)(dstBlock, dstStride, c->deintTemp + x); | |
3112 |
1/2✓ Branch 0 taken 35640 times.
✗ Branch 1 not taken.
|
71280 | else if(mode & LOWPASS5_DEINT_FILTER) |
3113 | 71280 | RENAME(deInterlaceL5)(dstBlock, dstStride, c->deintTemp + x, c->deintTemp + width + x); | |
3114 | /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | ||
3115 | RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); | ||
3116 | */ | ||
3117 | 190080 | dstBlock+=8; | |
3118 | 190080 | srcBlock+=8; | |
3119 | } | ||
3120 | |||
3121 | 48960 | dstBlock = dstBlockStart; | |
3122 | 48960 | srcBlock = srcBlockStart; | |
3123 | |||
3124 |
2/2✓ Branch 0 taken 95040 times.
✓ Branch 1 taken 24480 times.
|
239040 | for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){ |
3125 | 190080 | const int stride= dstStride; | |
3126 | //temporary while changing QP stuff to make things continue to work | ||
3127 | //eventually QP,nonBQP,etc will be arrays and this will be unnecessary | ||
3128 | 190080 | c->QP = c->QP_block[qp_index]; | |
3129 | 190080 | c->nonBQP = c->nonBQP_block[qp_index]; | |
3130 | 190080 | c->pQPb = c->pQPb_block[qp_index]; | |
3131 | 190080 | c->pQPb2 = c->pQPb2_block[qp_index]; | |
3132 | |||
3133 | /* only deblock if we have 2 blocks */ | ||
3134 |
2/2✓ Branch 0 taken 91520 times.
✓ Branch 1 taken 3520 times.
|
190080 | if(y + 8 < height){ |
3135 |
2/2✓ Branch 0 taken 11440 times.
✓ Branch 1 taken 80080 times.
|
183040 | if(mode & V_X1_FILTER) |
3136 | 22880 | RENAME(vertX1Filter)(dstBlock, stride, c); | |
3137 |
2/2✓ Branch 0 taken 34320 times.
✓ Branch 1 taken 45760 times.
|
160160 | else if(mode & V_DEBLOCK){ |
3138 | 68640 | const int t = RENAME(vertClassify)(dstBlock, stride, c); | |
3139 | |||
3140 |
2/2✓ Branch 0 taken 85 times.
✓ Branch 1 taken 34235 times.
|
68640 | if(t==1) |
3141 | 170 | RENAME(doVertLowPass)(dstBlock, stride, c); | |
3142 |
2/2✓ Branch 0 taken 33968 times.
✓ Branch 1 taken 267 times.
|
68470 | else if(t==2) |
3143 | 67936 | RENAME(doVertDefFilter)(dstBlock, stride, c); | |
3144 |
2/2✓ Branch 0 taken 11440 times.
✓ Branch 1 taken 34320 times.
|
91520 | }else if(mode & V_A_DEBLOCK){ |
3145 | 22880 | RENAME(do_a_deblock)(dstBlock, stride, 1, c, mode); | |
3146 | } | ||
3147 | } | ||
3148 | |||
3149 | 190080 | dstBlock+=8; | |
3150 | 190080 | srcBlock+=8; | |
3151 | } | ||
3152 | |||
3153 | 48960 | dstBlock = dstBlockStart; | |
3154 | 48960 | srcBlock = srcBlockStart; | |
3155 | |||
3156 |
2/2✓ Branch 0 taken 95040 times.
✓ Branch 1 taken 24480 times.
|
239040 | for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){ |
3157 | 190080 | const int stride= dstStride; | |
3158 | 190080 | c->QP = c->QP_block[qp_index]; | |
3159 | 190080 | c->nonBQP = c->nonBQP_block[qp_index]; | |
3160 | 190080 | c->pQPb = c->pQPb_block[qp_index]; | |
3161 | 190080 | c->pQPb2 = c->pQPb2_block[qp_index]; | |
3162 | #if TEMPLATE_PP_MMX | ||
3163 | ✗ | RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); | |
3164 | #endif | ||
3165 | /* check if we have a previous block to deblock it with dstBlock */ | ||
3166 |
2/2✓ Branch 0 taken 92160 times.
✓ Branch 1 taken 2880 times.
|
190080 | if(x - 8 >= 0){ |
3167 | #if TEMPLATE_PP_MMX | ||
3168 | ✗ | if(mode & H_X1_FILTER) | |
3169 | ✗ | RENAME(vertX1Filter)(tempBlock1, 16, c); | |
3170 | ✗ | else if(mode & H_DEBLOCK){ | |
3171 | ✗ | const int t= RENAME(vertClassify)(tempBlock1, 16, c); | |
3172 | ✗ | if(t==1) | |
3173 | ✗ | RENAME(doVertLowPass)(tempBlock1, 16, c); | |
3174 | ✗ | else if(t==2) | |
3175 | ✗ | RENAME(doVertDefFilter)(tempBlock1, 16, c); | |
3176 | ✗ | }else if(mode & H_A_DEBLOCK){ | |
3177 | ✗ | RENAME(do_a_deblock)(tempBlock1, 16, 1, c, mode); | |
3178 | } | ||
3179 | |||
3180 | ✗ | RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); | |
3181 | |||
3182 | #else | ||
3183 |
2/2✓ Branch 0 taken 11520 times.
✓ Branch 1 taken 80640 times.
|
184320 | if(mode & H_X1_FILTER) |
3184 | 23040 | horizX1Filter(dstBlock-4, stride, c->QP); | |
3185 |
2/2✓ Branch 0 taken 34560 times.
✓ Branch 1 taken 46080 times.
|
161280 | else if(mode & H_DEBLOCK){ |
3186 | #if TEMPLATE_PP_ALTIVEC | ||
3187 | DECLARE_ALIGNED(16, unsigned char, tempBlock)[272]; | ||
3188 | int t; | ||
3189 | transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); | ||
3190 | |||
3191 | t = vertClassify_altivec(tempBlock-48, 16, c); | ||
3192 | if(t==1) { | ||
3193 | doVertLowPass_altivec(tempBlock-48, 16, c); | ||
3194 | transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); | ||
3195 | } | ||
3196 | else if(t==2) { | ||
3197 | doVertDefFilter_altivec(tempBlock-48, 16, c); | ||
3198 | transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); | ||
3199 | } | ||
3200 | #else | ||
3201 | 69120 | const int t= RENAME(horizClassify)(dstBlock-4, stride, c); | |
3202 | |||
3203 |
2/2✓ Branch 0 taken 129 times.
✓ Branch 1 taken 34431 times.
|
69120 | if(t==1) |
3204 | 258 | RENAME(doHorizLowPass)(dstBlock-4, stride, c); | |
3205 |
2/2✓ Branch 0 taken 33859 times.
✓ Branch 1 taken 572 times.
|
68862 | else if(t==2) |
3206 | 67718 | RENAME(doHorizDefFilter)(dstBlock-4, stride, c); | |
3207 | #endif | ||
3208 |
2/2✓ Branch 0 taken 11520 times.
✓ Branch 1 taken 34560 times.
|
92160 | }else if(mode & H_A_DEBLOCK){ |
3209 | 23040 | RENAME(do_a_deblock)(dstBlock-8, 1, stride, c, mode); | |
3210 | } | ||
3211 | #endif //TEMPLATE_PP_MMX | ||
3212 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 92160 times.
|
184320 | if(mode & DERING){ |
3213 | //FIXME filter first line | ||
3214 | ✗ | if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, c); | |
3215 | } | ||
3216 | |||
3217 |
2/2✓ Branch 0 taken 34560 times.
✓ Branch 1 taken 57600 times.
|
184320 | if(mode & TEMP_NOISE_FILTER) |
3218 | { | ||
3219 | 69120 | RENAME(tempNoiseReducer)(dstBlock-8, stride, | |
3220 | 69120 | c->tempBlurred[isColor] + y*dstStride + x, | |
3221 | 69120 | c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256, | |
3222 | 69120 | c->ppMode.maxTmpNoise); | |
3223 | } | ||
3224 | } | ||
3225 | |||
3226 | 190080 | dstBlock+=8; | |
3227 | 190080 | srcBlock+=8; | |
3228 | |||
3229 | #if TEMPLATE_PP_MMX | ||
3230 | ✗ | FFSWAP(uint8_t *, tempBlock1, tempBlock2); | |
3231 | #endif | ||
3232 | } | ||
3233 | } | ||
3234 | |||
3235 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 2880 times.
|
5760 | if(mode & DERING){ |
3236 | ✗ | if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, c); | |
3237 | } | ||
3238 | |||
3239 |
2/2✓ Branch 0 taken 1080 times.
✓ Branch 1 taken 1800 times.
|
5760 | if((mode & TEMP_NOISE_FILTER)){ |
3240 | 2160 | RENAME(tempNoiseReducer)(dstBlock-8, dstStride, | |
3241 | 2160 | c->tempBlurred[isColor] + y*dstStride + x, | |
3242 | 2160 | c->tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256, | |
3243 | 2160 | c->ppMode.maxTmpNoise); | |
3244 | } | ||
3245 | |||
3246 | /* did we use a tmp buffer for the last lines*/ | ||
3247 |
2/2✓ Branch 0 taken 120 times.
✓ Branch 1 taken 2760 times.
|
5760 | if(y+15 >= height){ |
3248 | 240 | uint8_t *dstBlock= &(dst[y*dstStride]); | |
3249 |
1/2✓ Branch 0 taken 120 times.
✗ Branch 1 not taken.
|
240 | if(width==FFABS(dstStride)) |
3250 | 240 | linecpy(dstBlock, tempDst + dstStride, height-y, dstStride); | |
3251 | else{ | ||
3252 | int i; | ||
3253 | ✗ | for(i=0; i<height-y; i++){ | |
3254 | ✗ | memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); | |
3255 | } | ||
3256 | } | ||
3257 | } | ||
3258 | } | ||
3259 | #if TEMPLATE_PP_MMX | ||
3260 | ✗ | __asm__ volatile("emms"); | |
3261 | #endif | ||
3262 | |||
3263 | #ifdef DEBUG_BRIGHTNESS | ||
3264 | if(!isColor){ | ||
3265 | int max=1; | ||
3266 | int i; | ||
3267 | for(i=0; i<256; i++) | ||
3268 | if(yHistogram[i] > max) max=yHistogram[i]; | ||
3269 | |||
3270 | for(i=1; i<256; i++){ | ||
3271 | int x; | ||
3272 | int start=yHistogram[i-1]/(max/256+1); | ||
3273 | int end=yHistogram[i]/(max/256+1); | ||
3274 | int inc= end > start ? 1 : -1; | ||
3275 | for(x=start; x!=end+inc; x+=inc) | ||
3276 | dst[ i*dstStride + x]+=128; | ||
3277 | } | ||
3278 | |||
3279 | for(i=0; i<100; i+=2){ | ||
3280 | dst[ (white)*dstStride + i]+=128; | ||
3281 | dst[ (black)*dstStride + i]+=128; | ||
3282 | } | ||
3283 | } | ||
3284 | #endif | ||
3285 | 240 | } | |
3286 | |||
/* End-of-template teardown: drop RENAME() (the per-variant symbol-suffix macro
 * defined at the top of this file) and every TEMPLATE_PP_* selector, as the
 * header comment of this template requires. Presumably this lets the file be
 * included again with a different TEMPLATE_PP_* defined. */
3287 | | | #undef RENAME | ||
3288 | | | #undef TEMPLATE_PP_C | ||
3289 | | | #undef TEMPLATE_PP_ALTIVEC | ||
3290 | | | #undef TEMPLATE_PP_MMX | ||
3291 | | | #undef TEMPLATE_PP_MMXEXT | ||
3292 | | | #undef TEMPLATE_PP_SSE2 | ||
3293 |