Line | Branch | Exec | Source |
---|---|---|---|
1 | /* | ||
2 | * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | ||
3 | * | ||
4 | * This file is part of FFmpeg. | ||
5 | * | ||
6 | * FFmpeg is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * FFmpeg is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License along | ||
17 | * with FFmpeg; if not, write to the Free Software Foundation, Inc., | ||
18 | * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. | ||
19 | */ | ||
20 | |||
21 | |||
22 | #include "libavutil/attributes.h" | ||
23 | #include "libavutil/cpu.h" | ||
24 | #include "libavutil/crc.h" | ||
25 | #include "libavutil/x86/asm.h" | ||
26 | #include "libavfilter/vf_spp.h" | ||
27 | |||
28 | #if HAVE_MMX_INLINE | ||
/**
 * Hard-threshold requantization of one block of 64 coefficients using MMX.
 *
 * Every coefficient whose magnitude is at most threshold1 (= 16*qp - 1 while
 * bias is 0) is replaced by zero; every other coefficient x becomes
 * (x + 4) >> 3. The results are not stored at the index they were read from:
 * the punpck sequence in REQUANT_CORE shuffles them into a fixed coefficient
 * order. The DC term is overwritten after the asm with the unthresholded
 * (src[0] + 4) >> 3.
 *
 * @param dst         output coefficients (64 entries, fully overwritten)
 * @param src         input coefficients (64 entries)
 * @param qp          quantizer; scales the threshold
 * @param permutation unused here; the output order is hard-coded in the asm
 *
 * NOTE(review): no emms is issued; presumably the caller restores the x87
 * state after using this routine -- confirm against the call site.
 */
static void hardthresh_mmx(int16_t dst[64], const int16_t src[64],
                           int qp, const uint8_t *permutation)
{
    int bias = 0; //FIXME
    unsigned int threshold1;

    threshold1 = qp * ((1<<4) - bias) - 1;

    /*
     * Processes four quadwords (16 coefficients). On entry every word lane of
     * mm4/mm5/mm6 holds threshold1+1 / threshold1+5 / threshold1-4.
     */
#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3)    \
    "movq " #src0 ", %%mm0      \n"                                     \
    "movq " #src1 ", %%mm1      \n"                                     \
    "movq " #src2 ", %%mm2      \n"                                     \
    "movq " #src3 ", %%mm3      \n"                                     \
    /* x - (threshold1+1), wrapping */                                  \
    "psubw %%mm4, %%mm0         \n"                                     \
    "psubw %%mm4, %%mm1         \n"                                     \
    "psubw %%mm4, %%mm2         \n"                                     \
    "psubw %%mm4, %%mm3         \n"                                     \
    /* unsigned saturating add: small magnitudes clamp to 0xFFFF */     \
    "paddusw %%mm5, %%mm0       \n"                                     \
    "paddusw %%mm5, %%mm1       \n"                                     \
    "paddusw %%mm5, %%mm2       \n"                                     \
    "paddusw %%mm5, %%mm3       \n"                                     \
    "paddw %%mm6, %%mm0         \n"                                     \
    "paddw %%mm6, %%mm1         \n"                                     \
    "paddw %%mm6, %%mm2         \n"                                     \
    "paddw %%mm6, %%mm3         \n"                                     \
    /* unsigned saturating subtract: clamped lanes become 0 */          \
    "psubusw %%mm6, %%mm0       \n"                                     \
    "psubusw %%mm6, %%mm1       \n"                                     \
    "psubusw %%mm6, %%mm2       \n"                                     \
    "psubusw %%mm6, %%mm3       \n"                                     \
    "psraw $3, %%mm0            \n"                                     \
    "psraw $3, %%mm1            \n"                                     \
    "psraw $3, %%mm2            \n"                                     \
    "psraw $3, %%mm3            \n"                                     \
    /* shuffle the 16 words into the output order */                    \
    "movq %%mm0, %%mm7          \n"                                     \
    "punpcklwd %%mm2, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm2, %%mm7     \n" /*C*/                               \
    "movq %%mm1, %%mm2          \n"                                     \
    "punpcklwd %%mm3, %%mm1     \n" /*B*/                               \
    "punpckhwd %%mm3, %%mm2     \n" /*D*/                               \
    "movq %%mm0, %%mm3          \n"                                     \
    "punpcklwd %%mm1, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm7, %%mm3     \n" /*C*/                               \
    "punpcklwd %%mm2, %%mm7     \n" /*B*/                               \
    "punpckhwd %%mm2, %%mm1     \n" /*D*/                               \
                                                                        \
    "movq %%mm0, " #dst0 "      \n"                                     \
    "movq %%mm7, " #dst1 "      \n"                                     \
    "movq %%mm3, " #dst2 "      \n"                                     \
    "movq %%mm1, " #dst3 "      \n"

    /*
     * The constants are passed as "rm": movd cannot take an immediate, which
     * the broader "g" constraint would allow. The "memory" clobber tells the
     * compiler that the asm reads src[] and writes dst[], so the dst[0] store
     * below cannot be moved ahead of it.
     */
    __asm__ volatile(
        /* broadcast each 32-bit constant to all four word lanes */
        "movd %2, %%mm4             \n"
        "movd %3, %%mm5             \n"
        "movd %4, %%mm6             \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        "packssdw %%mm6, %%mm6      \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        "packssdw %%mm6, %%mm6      \n"
        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
        : : "r" (src), "r" (dst), "rm" (threshold1+1), "rm" (threshold1+5), "rm" (threshold1-4) //FIXME maybe more accurate than needed?
        : "memory"
    );
    /* The DC coefficient is never thresholded. */
    dst[0] = (src[0] + 4) >> 3;
}
98 | |||
/**
 * Soft-threshold requantization of one block of 64 coefficients using MMX.
 *
 * Each coefficient has its magnitude reduced by threshold1 (= 16*qp - 1 while
 * bias is 0), clamping at zero, then has 4 added and is shifted right by 3.
 * Negative inputs are handled by XOR-ing with their sign mask before and
 * after the unsigned saturating subtract. The results are stored in the
 * shuffled order produced by the punpck sequence in REQUANT_CORE. The DC term
 * is overwritten after the asm with the unthresholded (src[0] + 4) >> 3.
 *
 * @param dst         output coefficients (64 entries, fully overwritten)
 * @param src         input coefficients (64 entries)
 * @param qp          quantizer; scales the threshold
 * @param permutation unused here; the output order is hard-coded in the asm
 *
 * NOTE(review): no emms is issued; presumably the caller restores the x87
 * state after using this routine -- confirm against the call site.
 */
static void softthresh_mmx(int16_t dst[64], const int16_t src[64],
                           int qp, const uint8_t *permutation)
{
    int bias = 0; //FIXME
    unsigned int threshold1;

    threshold1 = qp*((1<<4) - bias) - 1;

    /*
     * Processes four quadwords (16 coefficients). On entry every word lane of
     * mm4 holds threshold1 and every word lane of mm5 holds the rounding
     * constant 4.
     */
#undef REQUANT_CORE
#define REQUANT_CORE(dst0, dst1, dst2, dst3, src0, src1, src2, src3)    \
    "movq " #src0 ", %%mm0      \n"                                     \
    "movq " #src1 ", %%mm1      \n"                                     \
    /* mm6/mm7 = sign masks (all ones where the word is negative) */    \
    "pxor %%mm6, %%mm6          \n"                                     \
    "pxor %%mm7, %%mm7          \n"                                     \
    "pcmpgtw %%mm0, %%mm6       \n"                                     \
    "pcmpgtw %%mm1, %%mm7       \n"                                     \
    "pxor %%mm6, %%mm0          \n"                                     \
    "pxor %%mm7, %%mm1          \n"                                     \
    "psubusw %%mm4, %%mm0       \n"                                     \
    "psubusw %%mm4, %%mm1       \n"                                     \
    "pxor %%mm6, %%mm0          \n"                                     \
    "pxor %%mm7, %%mm1          \n"                                     \
    "movq " #src2 ", %%mm2      \n"                                     \
    "movq " #src3 ", %%mm3      \n"                                     \
    "pxor %%mm6, %%mm6          \n"                                     \
    "pxor %%mm7, %%mm7          \n"                                     \
    "pcmpgtw %%mm2, %%mm6       \n"                                     \
    "pcmpgtw %%mm3, %%mm7       \n"                                     \
    "pxor %%mm6, %%mm2          \n"                                     \
    "pxor %%mm7, %%mm3          \n"                                     \
    "psubusw %%mm4, %%mm2       \n"                                     \
    "psubusw %%mm4, %%mm3       \n"                                     \
    "pxor %%mm6, %%mm2          \n"                                     \
    "pxor %%mm7, %%mm3          \n"                                     \
    /* round and scale down */                                          \
    "paddsw %%mm5, %%mm0        \n"                                     \
    "paddsw %%mm5, %%mm1        \n"                                     \
    "paddsw %%mm5, %%mm2        \n"                                     \
    "paddsw %%mm5, %%mm3        \n"                                     \
    "psraw $3, %%mm0            \n"                                     \
    "psraw $3, %%mm1            \n"                                     \
    "psraw $3, %%mm2            \n"                                     \
    "psraw $3, %%mm3            \n"                                     \
    /* shuffle the 16 words into the output order */                    \
    "movq %%mm0, %%mm7          \n"                                     \
    "punpcklwd %%mm2, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm2, %%mm7     \n" /*C*/                               \
    "movq %%mm1, %%mm2          \n"                                     \
    "punpcklwd %%mm3, %%mm1     \n" /*B*/                               \
    "punpckhwd %%mm3, %%mm2     \n" /*D*/                               \
    "movq %%mm0, %%mm3          \n"                                     \
    "punpcklwd %%mm1, %%mm0     \n" /*A*/                               \
    "punpckhwd %%mm7, %%mm3     \n" /*C*/                               \
    "punpcklwd %%mm2, %%mm7     \n" /*B*/                               \
    "punpckhwd %%mm2, %%mm1     \n" /*D*/                               \
                                                                        \
    "movq %%mm0, " #dst0 "      \n"                                     \
    "movq %%mm7, " #dst1 "      \n"                                     \
    "movq %%mm3, " #dst2 "      \n"                                     \
    "movq %%mm1, " #dst3 "      \n"

    /*
     * Both constants are passed as "rm": movd cannot take an immediate, which
     * the broader "g" constraint would allow. The "memory" clobber tells the
     * compiler that the asm reads src[] and writes dst[], so the dst[0] store
     * below cannot be moved ahead of it.
     */
    __asm__ volatile(
        /* broadcast each 32-bit constant to all four word lanes */
        "movd %2, %%mm4             \n"
        "movd %3, %%mm5             \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        "packssdw %%mm4, %%mm4      \n"
        "packssdw %%mm5, %%mm5      \n"
        REQUANT_CORE(  (%1),  8(%1), 16(%1), 24(%1),  (%0), 8(%0), 64(%0), 72(%0))
        REQUANT_CORE(32(%1), 40(%1), 48(%1), 56(%1),16(%0),24(%0), 48(%0), 56(%0))
        REQUANT_CORE(64(%1), 72(%1), 80(%1), 88(%1),32(%0),40(%0), 96(%0),104(%0))
        REQUANT_CORE(96(%1),104(%1),112(%1),120(%1),80(%0),88(%0),112(%0),120(%0))
        : : "r" (src), "r" (dst), "rm" (threshold1), "rm" (4) //FIXME maybe more accurate than needed?
        : "memory"
    );

    /* The DC coefficient is never thresholded. */
    dst[0] = (src[0] + 4) >> 3;
}
176 | |||
177 | ✗ | static void store_slice_mmx(uint8_t *dst, const int16_t *src, | |
178 | int dst_stride, int src_stride, | ||
179 | int width, int height, int log2_scale, | ||
180 | const uint8_t dither[8][8]) | ||
181 | { | ||
182 | int y; | ||
183 | |||
184 | ✗ | for (y = 0; y < height; y++) { | |
185 | ✗ | uint8_t *dst1 = dst; | |
186 | ✗ | const int16_t *src1 = src; | |
187 | ✗ | __asm__ volatile( | |
188 | "movq (%3), %%mm3 \n" | ||
189 | "movq (%3), %%mm4 \n" | ||
190 | "movd %4, %%mm2 \n" | ||
191 | "pxor %%mm0, %%mm0 \n" | ||
192 | "punpcklbw %%mm0, %%mm3 \n" | ||
193 | "punpckhbw %%mm0, %%mm4 \n" | ||
194 | "psraw %%mm2, %%mm3 \n" | ||
195 | "psraw %%mm2, %%mm4 \n" | ||
196 | "movd %5, %%mm2 \n" | ||
197 | "1: \n" | ||
198 | "movq (%0), %%mm0 \n" | ||
199 | "movq 8(%0), %%mm1 \n" | ||
200 | "paddw %%mm3, %%mm0 \n" | ||
201 | "paddw %%mm4, %%mm1 \n" | ||
202 | "psraw %%mm2, %%mm0 \n" | ||
203 | "psraw %%mm2, %%mm1 \n" | ||
204 | "packuswb %%mm1, %%mm0 \n" | ||
205 | "movq %%mm0, (%1) \n" | ||
206 | "add $16, %0 \n" | ||
207 | "add $8, %1 \n" | ||
208 | "cmp %2, %1 \n" | ||
209 | " jb 1b \n" | ||
210 | : "+r" (src1), "+r"(dst1) | ||
211 | ✗ | : "r"(dst + width), "r"(dither[y]), "g"(log2_scale), "g"(MAX_LEVEL - log2_scale) | |
212 | ); | ||
213 | ✗ | src += src_stride; | |
214 | ✗ | dst += dst_stride; | |
215 | } | ||
216 | ✗ | } | |
217 | |||
218 | #endif /* HAVE_MMX_INLINE */ | ||
219 | |||
220 | 1 | av_cold void ff_spp_init_x86(SPPContext *s) | |
221 | { | ||
222 | #if HAVE_MMX_INLINE | ||
223 | 1 | int cpu_flags = av_get_cpu_flags(); | |
224 | |||
225 |
1/2✗ Branch 0 not taken.
✓ Branch 1 taken 1 times.
|
1 | if (cpu_flags & AV_CPU_FLAG_MMX) { |
226 | static const uint32_t mmx_idct_perm_crc = 0xe5e8adc4; | ||
227 | uint32_t idct_perm_crc = | ||
228 | ✗ | av_crc(av_crc_get_table(AV_CRC_32_IEEE), 0, | |
229 | ✗ | s->dct->idct_permutation, | |
230 | sizeof(s->dct->idct_permutation)); | ||
231 | int64_t bps; | ||
232 | ✗ | s->store_slice = store_slice_mmx; | |
233 | ✗ | av_opt_get_int(s->dct, "bits_per_sample", 0, &bps); | |
234 | ✗ | if (bps <= 8 && idct_perm_crc == mmx_idct_perm_crc) { | |
235 | ✗ | switch (s->mode) { | |
236 | ✗ | case 0: s->requantize = hardthresh_mmx; break; | |
237 | ✗ | case 1: s->requantize = softthresh_mmx; break; | |
238 | } | ||
239 | } | ||
240 | } | ||
241 | #endif | ||
242 | 1 | } | |
243 |