1 |
|
|
/*
 * x86 optimized discrete wavelet transform
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2010 David Conrad
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
22 |
|
|
|
23 |
|
|
#include "libavutil/x86/asm.h" |
24 |
|
|
#include "libavutil/x86/cpu.h" |
25 |
|
|
#include "libavcodec/dirac_dwt.h" |
26 |
|
|
|
27 |
|
|
/**
 * Generate the C wrappers around one family of external ff_*() IDWT kernels.
 *
 * @param ext   suffix pasted onto every generated function name and onto the
 *              name of the external kernel it forwards to (e.g. _sse2)
 * @param align granularity, in int16_t elements, to which the width handed
 *              to a kernel is rounded down; it is used as a bit mask and
 *              therefore has to be a power of two
 *
 * Vertical wrappers: the trailing (width % align) elements are computed in C
 * with the matching COMPOSE_* expression, then the kernel is called on the
 * leading width_align elements.
 *
 * Horizontal Haar wrappers: the kernel is called with the full width w, then
 * a C loop writes the last (w/2 % align) interleaved output pairs.
 * NOTE(review): this presumes the kernel itself only produces the aligned
 * part of the output and leaves tmp[] filled for the C tail -- confirm
 * against the asm.
 *
 * All wrappers take uint8_t pointers and reinterpret them as int16_t.
 */
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(int16_t *b0, int16_t *b1, int16_t *b2, int16_t *b3, int16_t *b4, int width); \
void ff_vertical_compose_haar##ext(int16_t *b0, int16_t *b1, int width); \
void ff_horizontal_compose_haar0i##ext(int16_t *b, int16_t *tmp, int w); \
void ff_horizontal_compose_haar1i##ext(int16_t *b, int16_t *tmp, int w); \
\
static void vertical_compose53iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    /* scalar tail: elements past the last multiple of align */ \
    for (i = width_align; i < width; i++) \
        b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
    ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
\
    for (i = width_align; i < width; i++) \
        b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
    ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                           uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    /* the five-tap filters update the middle row b2 in place */ \
    for (i = width_align; i < width; i++) \
        b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(uint8_t *_b0, uint8_t *_b1, uint8_t *_b2, \
                                          uint8_t *_b3, uint8_t *_b4, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
    int16_t *b2 = (int16_t *)_b2; \
    int16_t *b3 = (int16_t *)_b3; \
    int16_t *b4 = (int16_t *)_b4; \
\
    for (i = width_align; i < width; i++) \
        b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
    ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_haar##ext(uint8_t *_b0, uint8_t *_b1, int width) \
{ \
    int i, width_align = width & ~((align) - 1); \
    int16_t *b0 = (int16_t *)_b0; \
    int16_t *b1 = (int16_t *)_b1; \
\
    for (i = width_align; i < width; i++) { \
        /* order matters: b1 is derived from the already updated b0 */ \
        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
    } \
\
    ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
\
static void horizontal_compose_haar0i##ext(uint8_t *_b, uint8_t *_tmp, int w) \
{ \
    int w2 = w >> 1; \
    int x  = w2 - (w2 & ((align) - 1)); \
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    ff_horizontal_compose_haar0i##ext(b, tmp, w); \
\
    /* C tail: interleave the remaining low/high coefficients */ \
    for (; x < w2; x++) { \
        b[2*x    ] = tmp[x]; \
        b[2*x + 1] = COMPOSE_HAARiH0(b[x + w2], tmp[x]); \
    } \
} \
\
static void horizontal_compose_haar1i##ext(uint8_t *_b, uint8_t *_tmp, int w) \
{ \
    int w2 = w >> 1; \
    int x  = w2 - (w2 & ((align) - 1)); \
    int16_t *b   = (int16_t *)_b; \
    int16_t *tmp = (int16_t *)_tmp; \
\
    ff_horizontal_compose_haar1i##ext(b, tmp, w); \
\
    /* same as haar0i, with an extra rounding shift by one bit */ \
    for (; x < w2; x++) { \
        b[2*x    ] = (tmp[x] + 1) >> 1; \
        b[2*x + 1] = (COMPOSE_HAARiH0(b[x + w2], tmp[x]) + 1) >> 1; \
    } \
}
135 |
|
|
|
136 |
|
|
#if HAVE_X86ASM
/* Instantiate the wrapper family once per kernel flavour; the second
 * argument is the rounding granularity passed to COMPOSE_VERTICAL.
 * The MMX flavour is only built for 32-bit targets. */
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)

void ff_horizontal_compose_dd97i_ssse3(int16_t *_b, int16_t *_tmp, int w);

/**
 * Horizontal DD9/7 compose: call the SSSE3 kernel, then finish the row in C.
 *
 * @param _b   row buffer, reinterpreted as int16_t; receives the interleaved
 *             output and is also read at offset w/2 for the high-pass input
 * @param _tmp scratch buffer, reinterpreted as int16_t
 * @param w    row width in coefficients
 *
 * NOTE(review): the C tail reads tmp[i-1] .. tmp[i+2]; this presumes the
 * kernel leaves tmp[] filled (including past w/2) -- confirm against the asm.
 */
static void horizontal_compose_dd97i_ssse3(uint8_t *_b, uint8_t *_tmp, int w)
{
    int16_t *row     = (int16_t *)_b;
    int16_t *scratch = (int16_t *)_tmp;
    int half = w >> 1;
    int i;

    ff_horizontal_compose_dd97i_ssse3(row, scratch, w);

    /* start at the last multiple of 8 below half */
    for (i = half - (half & 7); i < half; i++) {
        row[2*i    ] = (scratch[i] + 1) >> 1;
        row[2*i + 1] = (COMPOSE_DD97iH0(scratch[i - 1], scratch[i], row[i + half],
                                        scratch[i + 1], scratch[i + 2]) + 1) >> 1;
    }
}
#endif
160 |
|
|
|
161 |
|
348 |
/**
 * Install the x86 SIMD wrappers into a DWT context.
 *
 * The CPU flags returned by av_get_cpu_flags() are checked tier by tier
 * (MMX on non-x86-64 builds only, then SSE2, then SSSE3). The function
 * returns at the first tier the CPU lacks, so pointers set by earlier tiers
 * stay in place; each later tier overwrites the pointers it also provides.
 * Transform types not listed in a tier leave @p d untouched for that tier.
 * Without HAVE_X86ASM the function does nothing.
 *
 * @param d    context whose compose function pointers are replaced
 * @param type wavelet type selecting which pointers are replaced
 */
void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type)
{
#if HAVE_X86ASM
    int mm_flags = av_get_cpu_flags();

#if !ARCH_X86_64
    if (!(mm_flags & AV_CPU_FLAG_MMX))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
        d->horizontal_compose = horizontal_compose_haar0i_mmx;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = (void*)vertical_compose_haar_mmx;
        d->horizontal_compose = horizontal_compose_haar1i_mmx;
        break;
    default:
        /* no MMX wrapper for this type: keep the existing pointers */
        break;
    }
#endif

    if (!(mm_flags & AV_CPU_FLAG_SSE2))
        return;

    switch (type) {
    case DWT_DIRAC_DD9_7:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_LEGALL5_3:
        d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
        break;
    case DWT_DIRAC_DD13_7:
        d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
        d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
        break;
    case DWT_DIRAC_HAAR0:
        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
        d->horizontal_compose = horizontal_compose_haar0i_sse2;
        break;
    case DWT_DIRAC_HAAR1:
        d->vertical_compose   = (void*)vertical_compose_haar_sse2;
        d->horizontal_compose = horizontal_compose_haar1i_sse2;
        break;
    default:
        /* no SSE2 wrapper for this type: keep the existing pointers */
        break;
    }

    if (!(mm_flags & AV_CPU_FLAG_SSSE3))
        return;

    /* SSSE3 only adds a horizontal pass for the DD9/7 wavelet */
    if (type == DWT_DIRAC_DD9_7)
        d->horizontal_compose = horizontal_compose_dd97i_ssse3;
#endif // HAVE_X86ASM
}