LCOV - code coverage report
Current view: top level - libavfilter - vf_fspp.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 344 0.0 %
Date: 2017-12-17 23:02:56 Functions: 0 11 0.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
       3             :  * Copyright (C) 2005 Nikolaj Poroshin <porosh3@psu.ru>
       4             :  * Copyright (c) 2014 Arwa Arif <arwaarif1994@gmail.com>
       5             :  *
       6             :  * This file is part of FFmpeg.
       7             :  *
       8             :  * FFmpeg is free software; you can redistribute it and/or modify
       9             :  * it under the terms of the GNU General Public License as published by
      10             :  * the Free Software Foundation; either version 2 of the License, or
      11             :  * (at your option) any later version.
      12             :  *
      13             :  * FFmpeg is distributed in the hope that it will be useful,
      14             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      15             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      16             :  * GNU General Public License for more details.
      17             :  *
      18             :  * You should have received a copy of the GNU General Public License along
      19             :  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
      20             :  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      21             :  */
      22             : 
      23             : /**
      24             :  * @file
      25             :  * Fast Simple Post-processing filter
      26             :  * This implementation is based on an algorithm described in
      27             :  * "Aria Nosratinia Embedded Post-Processing for
      28             :  * Enhancement of Compressed Images (1999)"
      29             :  * (http://www.utdallas.edu/~aria/papers/vlsisp99.pdf)
      30             :  * Further, with splitting (I)DCT into horizontal/vertical passes, one of
      31             :  * them can be performed once per block, not per pixel. This allows for much
      32             :  * higher speed.
      33             :  *
      34             :  * Originally written by Michael Niedermayer and Nikolaj for the MPlayer
      35             :  * project, and ported by Arwa Arif for FFmpeg.
      36             :  */
      37             : 
      38             : #include "libavutil/avassert.h"
      39             : #include "libavutil/imgutils.h"
      40             : #include "libavutil/opt.h"
      41             : #include "libavutil/pixdesc.h"
      42             : #include "internal.h"
      43             : #include "vf_fspp.h"
      44             : 
      45             : #define OFFSET(x) offsetof(FSPPContext, x)
      46             : #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM
      47             : static const AVOption fspp_options[] = {
      48             :     { "quality",       "set quality",                          OFFSET(log2_count),    AV_OPT_TYPE_INT, {.i64 = 4},   4, MAX_LEVEL, FLAGS },
      49             :     { "qp",            "force a constant quantizer parameter", OFFSET(qp),            AV_OPT_TYPE_INT, {.i64 = 0},   0, 64,        FLAGS },
      50             :     { "strength",      "set filter strength",                  OFFSET(strength),      AV_OPT_TYPE_INT, {.i64 = 0}, -15, 32,        FLAGS },
      51             :     { "use_bframe_qp", "use B-frames' QP",                     OFFSET(use_bframe_qp), AV_OPT_TYPE_BOOL,{.i64 = 0},   0, 1,         FLAGS },
      52             :     { NULL }
      53             : };
      54             : 
      55             : AVFILTER_DEFINE_CLASS(fspp);
      56             : 
      57             : DECLARE_ALIGNED(32, static const uint8_t, dither)[8][8] = {
      58             :     {  0,  48,  12,  60,   3,  51,  15,  63, },
      59             :     { 32,  16,  44,  28,  35,  19,  47,  31, },
      60             :     {  8,  56,   4,  52,  11,  59,   7,  55, },
      61             :     { 40,  24,  36,  20,  43,  27,  39,  23, },
      62             :     {  2,  50,  14,  62,   1,  49,  13,  61, },
      63             :     { 34,  18,  46,  30,  33,  17,  45,  29, },
      64             :     { 10,  58,   6,  54,   9,  57,   5,  53, },
      65             :     { 42,  26,  38,  22,  41,  25,  37,  21, },
      66             : };
      67             : 
      68             : static const short custom_threshold[64] = {
      69             : // values (296) can't be too high
      70             : // -it causes too big quant dependence
      71             : // or maybe overflow(check), which results in some flashing
      72             :      71, 296, 295, 237,  71,  40,  38,  19,
      73             :     245, 193, 185, 121, 102,  73,  53,  27,
      74             :     158, 129, 141, 107,  97,  73,  50,  26,
      75             :     102, 116, 109,  98,  82,  66,  45,  23,
      76             :      71,  94,  95,  81,  70,  56,  38,  20,
      77             :      56,  77,  74,  66,  56,  44,  30,  15,
      78             :      38,  53,  50,  45,  38,  30,  21,  11,
      79             :      20,  27,  26,  23,  20,  15,  11,   5
      80             : };
      81             : 
      82             : //This func reads from 1 slice, 1 and clears 0 & 1
      83           0 : static void store_slice_c(uint8_t *dst, int16_t *src,
      84             :                           ptrdiff_t dst_stride, ptrdiff_t src_stride,
      85             :                           ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
      86             : {
      87             :     int y, x;
      88             : #define STORE(pos)                                                             \
      89             :     temp = (src[x + pos] + (d[pos] >> log2_scale)) >> (6 - log2_scale);        \
      90             :     src[x + pos] = src[x + pos - 8 * src_stride] = 0;                          \
      91             :     if (temp & 0x100) temp = ~(temp >> 31);                                    \
      92             :     dst[x + pos] = temp;
      93             : 
      94           0 :     for (y = 0; y < height; y++) {
      95           0 :         const uint8_t *d = dither[y];
      96           0 :         for (x = 0; x < width; x += 8) {
      97             :             int temp;
      98           0 :             STORE(0);
      99           0 :             STORE(1);
     100           0 :             STORE(2);
     101           0 :             STORE(3);
     102           0 :             STORE(4);
     103           0 :             STORE(5);
     104           0 :             STORE(6);
     105           0 :             STORE(7);
     106             :         }
     107           0 :         src += src_stride;
     108           0 :         dst += dst_stride;
     109             :     }
     110           0 : }
     111             : 
     112             : //This func reads from 2 slices, 0 & 2  and clears 2-nd
     113           0 : static void store_slice2_c(uint8_t *dst, int16_t *src,
     114             :                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
     115             :                            ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
     116             : {
     117             :     int y, x;
     118             : #define STORE2(pos)                                                                                       \
     119             :     temp = (src[x + pos] + src[x + pos + 16 * src_stride] + (d[pos] >> log2_scale)) >> (6 - log2_scale);  \
     120             :     src[x + pos + 16 * src_stride] = 0;                                                                   \
     121             :     if (temp & 0x100) temp = ~(temp >> 31);                                                               \
     122             :     dst[x + pos] = temp;
     123             : 
     124           0 :     for (y = 0; y < height; y++) {
     125           0 :         const uint8_t *d = dither[y];
     126           0 :         for (x = 0; x < width; x += 8) {
     127             :             int temp;
     128           0 :             STORE2(0);
     129           0 :             STORE2(1);
     130           0 :             STORE2(2);
     131           0 :             STORE2(3);
     132           0 :             STORE2(4);
     133           0 :             STORE2(5);
     134           0 :             STORE2(6);
     135           0 :             STORE2(7);
     136             :         }
     137           0 :         src += src_stride;
     138           0 :         dst += dst_stride;
     139             :     }
     140           0 : }
     141             : 
     142           0 : static void mul_thrmat_c(int16_t *thr_adr_noq, int16_t *thr_adr, int q)
     143             : {
     144             :     int a;
     145           0 :     for (a = 0; a < 64; a++)
     146           0 :         thr_adr[a] = q * thr_adr_noq[a];
     147           0 : }
     148             : 
     149           0 : static void filter(FSPPContext *p, uint8_t *dst, uint8_t *src,
     150             :                    int dst_stride, int src_stride,
     151             :                    int width, int height,
     152             :                    uint8_t *qp_store, int qp_stride, int is_luma)
     153             : {
     154             :     int x, x0, y, es, qy, t;
     155             : 
     156           0 :     const int stride = is_luma ? p->temp_stride : (width + 16);
     157           0 :     const int step = 6 - p->log2_count;
     158           0 :     const int qpsh = 4 - p->hsub * !is_luma;
     159           0 :     const int qpsv = 4 - p->vsub * !is_luma;
     160             : 
     161             :     DECLARE_ALIGNED(32, int32_t, block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
     162           0 :     int16_t *block  = (int16_t *)block_align;
     163           0 :     int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
     164             : 
     165           0 :     memset(block3, 0, 4 * 8 * BLOCKSZ);
     166             : 
     167           0 :     if (!src || !dst) return;
     168             : 
     169           0 :     for (y = 0; y < height; y++) {
     170           0 :         int index = 8 + 8 * stride + y * stride;
     171           0 :         memcpy(p->src + index, src + y * src_stride, width);
     172           0 :         for (x = 0; x < 8; x++) {
     173           0 :             p->src[index         - x - 1] = p->src[index +         x    ];
     174           0 :             p->src[index + width + x    ] = p->src[index + width - x - 1];
     175             :         }
     176             :     }
     177             : 
     178           0 :     for (y = 0; y < 8; y++) {
     179           0 :         memcpy(p->src + (     7 - y    ) * stride, p->src + (     y + 8    ) * stride, stride);
     180           0 :         memcpy(p->src + (height + 8 + y) * stride, p->src + (height - y + 7) * stride, stride);
     181             :     }
     182             :     //FIXME (try edge emu)
     183             : 
     184           0 :     for (y = 8; y < 24; y++)
     185           0 :         memset(p->temp + 8 + y * stride, 0, width * sizeof(int16_t));
     186             : 
     187           0 :     for (y = step; y < height + 8; y += step) {    //step= 1,2
     188           0 :         const int y1 = y - 8 + step;                 //l5-7  l4-6;
     189           0 :         qy = y - 4;
     190             : 
     191           0 :         if (qy > height - 1) qy = height - 1;
     192           0 :         if (qy < 0) qy = 0;
     193             : 
     194           0 :         qy = (qy >> qpsv) * qp_stride;
     195           0 :         p->row_fdct(block, p->src + y * stride + 2 - (y&1), stride, 2);
     196             : 
     197           0 :         for (x0 = 0; x0 < width + 8 - 8 * (BLOCKSZ - 1); x0 += 8 * (BLOCKSZ - 1)) {
     198           0 :             p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y&1), stride, 2 * (BLOCKSZ - 1));
     199             : 
     200           0 :             if (p->qp)
     201           0 :                 p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + 0 * 8, block3 + 0 * 8, 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
     202             :             else
     203           0 :                 for (x = 0; x < 8 * (BLOCKSZ - 1); x += 8) {
     204           0 :                     t = x + x0 - 2;                    //correct t=x+x0-2-(y&1), but its the same
     205             : 
     206           0 :                     if (t < 0) t = 0;                   //t always < width-2
     207             : 
     208           0 :                     t = qp_store[qy + (t >> qpsh)];
     209           0 :                     t = ff_norm_qscale(t, p->qscale_type);
     210             : 
     211           0 :                     if (t != p->prev_q) p->prev_q = t, p->mul_thrmat((int16_t *)(&p->threshold_mtx_noq[0]), (int16_t *)(&p->threshold_mtx[0]), t);
     212           0 :                     p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block + x * 8, block3 + x * 8, 8); //yes, this is a HOTSPOT
     213             :                 }
     214           0 :             p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, 2 * (BLOCKSZ - 1));
     215           0 :             memmove(block,  block  + (BLOCKSZ - 1) * 64, 8 * 8 * sizeof(int16_t)); //cycling
     216           0 :             memmove(block3, block3 + (BLOCKSZ - 1) * 64, 6 * 8 * sizeof(int16_t));
     217             :         }
     218             : 
     219           0 :         es = width + 8 - x0; //  8, ...
     220           0 :         if (es > 8)
     221           0 :             p->row_fdct(block + 8 * 8, p->src + y * stride + 8 + x0 + 2 - (y & 1), stride, (es - 4) >> 2);
     222             : 
     223           0 :         p->column_fidct((int16_t *)(&p->threshold_mtx[0]), block, block3, es&(~1));
     224           0 :         if (es > 3)
     225           0 :             p->row_idct(block3 + 0 * 8, p->temp + (y & 15) * stride + x0 + 2 - (y & 1), stride, es >> 2);
     226             : 
     227           0 :         if (!(y1 & 7) && y1) {
     228           0 :             if (y1 & 8)
     229           0 :                 p->store_slice(dst + (y1 - 8) * dst_stride, p->temp + 8 + 8 * stride,
     230           0 :                                dst_stride, stride, width, 8, 5 - p->log2_count);
     231             :             else
     232           0 :                 p->store_slice2(dst + (y1 - 8) * dst_stride, p->temp + 8 + 0 * stride,
     233           0 :                                 dst_stride, stride, width, 8, 5 - p->log2_count);
     234             :         }
     235             :     }
     236             : 
     237           0 :     if (y & 7) {  // height % 8 != 0
     238           0 :         if (y & 8)
     239           0 :             p->store_slice(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 8 * stride,
     240           0 :                            dst_stride, stride, width, y&7, 5 - p->log2_count);
     241             :         else
     242           0 :             p->store_slice2(dst + ((y - 8) & ~7) * dst_stride, p->temp + 8 + 0 * stride,
     243           0 :                             dst_stride, stride, width, y&7, 5 - p->log2_count);
     244             :     }
     245             : }
     246             : 
     247           0 : static void column_fidct_c(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt)
     248             : {
     249             :     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     250             :     int_simd16_t tmp10, tmp11, tmp12, tmp13;
     251             :     int_simd16_t z1,z2,z3,z4,z5, z10, z11, z12, z13;
     252             :     int_simd16_t d0, d1, d2, d3, d4, d5, d6, d7;
     253             : 
     254             :     int16_t *dataptr;
     255             :     int16_t *wsptr;
     256             :     int16_t *threshold;
     257             :     int ctr;
     258             : 
     259           0 :     dataptr = data;
     260           0 :     wsptr = output;
     261             : 
     262           0 :     for (; cnt > 0; cnt -= 2) { //start positions
     263           0 :         threshold = (int16_t *)thr_adr;//threshold_mtx
     264           0 :         for (ctr = DCTSIZE; ctr > 0; ctr--) {
     265             :             // Process columns from input, add to output.
     266           0 :             tmp0 = dataptr[DCTSIZE * 0] + dataptr[DCTSIZE * 7];
     267           0 :             tmp7 = dataptr[DCTSIZE * 0] - dataptr[DCTSIZE * 7];
     268             : 
     269           0 :             tmp1 = dataptr[DCTSIZE * 1] + dataptr[DCTSIZE * 6];
     270           0 :             tmp6 = dataptr[DCTSIZE * 1] - dataptr[DCTSIZE * 6];
     271             : 
     272           0 :             tmp2 = dataptr[DCTSIZE * 2] + dataptr[DCTSIZE * 5];
     273           0 :             tmp5 = dataptr[DCTSIZE * 2] - dataptr[DCTSIZE * 5];
     274             : 
     275           0 :             tmp3 = dataptr[DCTSIZE * 3] + dataptr[DCTSIZE * 4];
     276           0 :             tmp4 = dataptr[DCTSIZE * 3] - dataptr[DCTSIZE * 4];
     277             : 
     278             :             // Even part of FDCT
     279             : 
     280           0 :             tmp10 = tmp0 + tmp3;
     281           0 :             tmp13 = tmp0 - tmp3;
     282           0 :             tmp11 = tmp1 + tmp2;
     283           0 :             tmp12 = tmp1 - tmp2;
     284             : 
     285           0 :             d0 = tmp10 + tmp11;
     286           0 :             d4 = tmp10 - tmp11;
     287             : 
     288           0 :             z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
     289           0 :             d2 = tmp13 + z1;
     290           0 :             d6 = tmp13 - z1;
     291             : 
     292             :             // Even part of IDCT
     293             : 
     294           0 :             THRESHOLD(tmp0, d0, threshold[0 * 8]);
     295           0 :             THRESHOLD(tmp1, d2, threshold[2 * 8]);
     296           0 :             THRESHOLD(tmp2, d4, threshold[4 * 8]);
     297           0 :             THRESHOLD(tmp3, d6, threshold[6 * 8]);
     298           0 :             tmp0 += 2;
     299           0 :             tmp10 = (tmp0 + tmp2) >> 2;
     300           0 :             tmp11 = (tmp0 - tmp2) >> 2;
     301             : 
     302           0 :             tmp13 = (tmp1 + tmp3) >>2; //+2 !  (psnr decides)
     303           0 :             tmp12 = MULTIPLY16H((tmp1 - tmp3), FIX_1_414213562_A) - tmp13; //<<2
     304             : 
     305           0 :             tmp0 = tmp10 + tmp13; //->temps
     306           0 :             tmp3 = tmp10 - tmp13; //->temps
     307           0 :             tmp1 = tmp11 + tmp12; //->temps
     308           0 :             tmp2 = tmp11 - tmp12; //->temps
     309             : 
     310             :             // Odd part of FDCT
     311             : 
     312           0 :             tmp10 = tmp4 + tmp5;
     313           0 :             tmp11 = tmp5 + tmp6;
     314           0 :             tmp12 = tmp6 + tmp7;
     315             : 
     316           0 :             z5 = MULTIPLY16H((tmp10 - tmp12) << 2, FIX_0_382683433);
     317           0 :             z2 = MULTIPLY16H(tmp10 << 2, FIX_0_541196100) + z5;
     318           0 :             z4 = MULTIPLY16H(tmp12 << 2, FIX_1_306562965) + z5;
     319           0 :             z3 = MULTIPLY16H(tmp11 << 2, FIX_0_707106781);
     320             : 
     321           0 :             z11 = tmp7 + z3;
     322           0 :             z13 = tmp7 - z3;
     323             : 
     324           0 :             d5 = z13 + z2;
     325           0 :             d3 = z13 - z2;
     326           0 :             d1 = z11 + z4;
     327           0 :             d7 = z11 - z4;
     328             : 
     329             :             // Odd part of IDCT
     330             : 
     331           0 :             THRESHOLD(tmp4, d1, threshold[1 * 8]);
     332           0 :             THRESHOLD(tmp5, d3, threshold[3 * 8]);
     333           0 :             THRESHOLD(tmp6, d5, threshold[5 * 8]);
     334           0 :             THRESHOLD(tmp7, d7, threshold[7 * 8]);
     335             : 
     336             :             //Simd version uses here a shortcut for the tmp5,tmp6,tmp7 == 0
     337           0 :             z13 = tmp6 + tmp5;
     338           0 :             z10 = (tmp6 - tmp5) << 1;
     339           0 :             z11 = tmp4 + tmp7;
     340           0 :             z12 = (tmp4 - tmp7) << 1;
     341             : 
     342           0 :             tmp7  = (z11 + z13) >> 2; //+2 !
     343           0 :             tmp11 = MULTIPLY16H((z11 - z13) << 1, FIX_1_414213562);
     344           0 :             z5    = MULTIPLY16H(z10 + z12,        FIX_1_847759065);
     345           0 :             tmp10 = MULTIPLY16H(z12,              FIX_1_082392200) - z5;
     346           0 :             tmp12 = MULTIPLY16H(z10,              FIX_2_613125930) + z5; // - !!
     347             : 
     348           0 :             tmp6 = tmp12 - tmp7;
     349           0 :             tmp5 = tmp11 - tmp6;
     350           0 :             tmp4 = tmp10 + tmp5;
     351             : 
     352           0 :             wsptr[DCTSIZE * 0] +=  (tmp0 + tmp7);
     353           0 :             wsptr[DCTSIZE * 1] +=  (tmp1 + tmp6);
     354           0 :             wsptr[DCTSIZE * 2] +=  (tmp2 + tmp5);
     355           0 :             wsptr[DCTSIZE * 3] +=  (tmp3 - tmp4);
     356           0 :             wsptr[DCTSIZE * 4] +=  (tmp3 + tmp4);
     357           0 :             wsptr[DCTSIZE * 5] +=  (tmp2 - tmp5);
     358           0 :             wsptr[DCTSIZE * 6]  =  (tmp1 - tmp6);
     359           0 :             wsptr[DCTSIZE * 7]  =  (tmp0 - tmp7);
     360             :             //
     361           0 :             dataptr++; //next column
     362           0 :             wsptr++;
     363           0 :             threshold++;
     364             :         }
     365           0 :         dataptr += 8; //skip each second start pos
     366           0 :         wsptr   += 8;
     367             :     }
     368           0 : }
     369             : 
     370           0 : static void row_idct_c(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt)
     371             : {
     372             :     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     373             :     int_simd16_t tmp10, tmp11, tmp12, tmp13;
     374             :     int_simd16_t z5, z10, z11, z12, z13;
     375             :     int16_t *outptr;
     376             :     int16_t *wsptr;
     377             : 
     378           0 :     cnt *= 4;
     379           0 :     wsptr = workspace;
     380           0 :     outptr = output_adr;
     381           0 :     for (; cnt > 0; cnt--) {
     382             :         // Even part
     383             :         //Simd version reads 4x4 block and transposes it
     384           0 :         tmp10 = wsptr[2] +  wsptr[3];
     385           0 :         tmp11 = wsptr[2] -  wsptr[3];
     386             : 
     387           0 :         tmp13 = wsptr[0] +  wsptr[1];
     388           0 :         tmp12 = (MULTIPLY16H(wsptr[0] - wsptr[1], FIX_1_414213562_A) << 2) - tmp13;//this shift order to avoid overflow
     389             : 
     390           0 :         tmp0 = tmp10 + tmp13; //->temps
     391           0 :         tmp3 = tmp10 - tmp13; //->temps
     392           0 :         tmp1 = tmp11 + tmp12;
     393           0 :         tmp2 = tmp11 - tmp12;
     394             : 
     395             :         // Odd part
     396             :         //Also transpose, with previous:
     397             :         // ---- ----      ||||
     398             :         // ---- ---- idct ||||
     399             :         // ---- ---- ---> ||||
     400             :         // ---- ----      ||||
     401           0 :         z13 = wsptr[4] + wsptr[5];
     402           0 :         z10 = wsptr[4] - wsptr[5];
     403           0 :         z11 = wsptr[6] + wsptr[7];
     404           0 :         z12 = wsptr[6] - wsptr[7];
     405             : 
     406           0 :         tmp7 = z11 + z13;
     407           0 :         tmp11 = MULTIPLY16H(z11 - z13, FIX_1_414213562);
     408             : 
     409           0 :         z5 =    MULTIPLY16H(z10 + z12, FIX_1_847759065);
     410           0 :         tmp10 = MULTIPLY16H(z12,       FIX_1_082392200) - z5;
     411           0 :         tmp12 = MULTIPLY16H(z10,       FIX_2_613125930) + z5; // - FIX_
     412             : 
     413           0 :         tmp6 = (tmp12 << 3) - tmp7;
     414           0 :         tmp5 = (tmp11 << 3) - tmp6;
     415           0 :         tmp4 = (tmp10 << 3) + tmp5;
     416             : 
     417             :         // Final output stage: descale and write column
     418           0 :         outptr[0 * output_stride] += DESCALE(tmp0 + tmp7, 3);
     419           0 :         outptr[1 * output_stride] += DESCALE(tmp1 + tmp6, 3);
     420           0 :         outptr[2 * output_stride] += DESCALE(tmp2 + tmp5, 3);
     421           0 :         outptr[3 * output_stride] += DESCALE(tmp3 - tmp4, 3);
     422           0 :         outptr[4 * output_stride] += DESCALE(tmp3 + tmp4, 3);
     423           0 :         outptr[5 * output_stride] += DESCALE(tmp2 - tmp5, 3);
     424           0 :         outptr[6 * output_stride] += DESCALE(tmp1 - tmp6, 3); //no += ?
     425           0 :         outptr[7 * output_stride] += DESCALE(tmp0 - tmp7, 3); //no += ?
     426           0 :         outptr++;
     427             : 
     428           0 :         wsptr += DCTSIZE;       // advance pointer to next row
     429             :     }
     430           0 : }
     431             : 
     432           0 : static void row_fdct_c(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt)
     433             : {
     434             :     int_simd16_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
     435             :     int_simd16_t tmp10, tmp11, tmp12, tmp13;
     436             :     int_simd16_t z1, z2, z3, z4, z5, z11, z13;
     437             :     int16_t *dataptr;
     438             : 
     439           0 :     cnt *= 4;
     440             :     // Pass 1: process rows.
     441             : 
     442           0 :     dataptr = data;
     443           0 :     for (; cnt > 0; cnt--) {
     444           0 :         tmp0 = pixels[line_size * 0] + pixels[line_size * 7];
     445           0 :         tmp7 = pixels[line_size * 0] - pixels[line_size * 7];
     446           0 :         tmp1 = pixels[line_size * 1] + pixels[line_size * 6];
     447           0 :         tmp6 = pixels[line_size * 1] - pixels[line_size * 6];
     448           0 :         tmp2 = pixels[line_size * 2] + pixels[line_size * 5];
     449           0 :         tmp5 = pixels[line_size * 2] - pixels[line_size * 5];
     450           0 :         tmp3 = pixels[line_size * 3] + pixels[line_size * 4];
     451           0 :         tmp4 = pixels[line_size * 3] - pixels[line_size * 4];
     452             : 
     453             :         // Even part
     454             : 
     455           0 :         tmp10 = tmp0 + tmp3;
     456           0 :         tmp13 = tmp0 - tmp3;
     457           0 :         tmp11 = tmp1 + tmp2;
     458           0 :         tmp12 = tmp1 - tmp2;
     459             :         //Even columns are written first, this leads to different order of columns
     460             :         //in column_fidct(), but they are processed independently, so all ok.
     461             :         //Later in the row_idct() columns readed at the same order.
     462           0 :         dataptr[2] = tmp10 + tmp11;
     463           0 :         dataptr[3] = tmp10 - tmp11;
     464             : 
     465           0 :         z1 = MULTIPLY16H((tmp12 + tmp13) << 2, FIX_0_707106781);
     466           0 :         dataptr[0] = tmp13 + z1;
     467           0 :         dataptr[1] = tmp13 - z1;
     468             : 
     469             :         // Odd part
     470             : 
     471           0 :         tmp10 = (tmp4 + tmp5) << 2;
     472           0 :         tmp11 = (tmp5 + tmp6) << 2;
     473           0 :         tmp12 = (tmp6 + tmp7) << 2;
     474             : 
     475           0 :         z5 = MULTIPLY16H(tmp10 - tmp12, FIX_0_382683433);
     476           0 :         z2 = MULTIPLY16H(tmp10,         FIX_0_541196100) + z5;
     477           0 :         z4 = MULTIPLY16H(tmp12,         FIX_1_306562965) + z5;
     478           0 :         z3 = MULTIPLY16H(tmp11,         FIX_0_707106781);
     479             : 
     480           0 :         z11 = tmp7 + z3;
     481           0 :         z13 = tmp7 - z3;
     482             : 
     483           0 :         dataptr[4] = z13 + z2;
     484           0 :         dataptr[5] = z13 - z2;
     485           0 :         dataptr[6] = z11 + z4;
     486           0 :         dataptr[7] = z11 - z4;
     487             : 
     488           0 :         pixels++;               // advance pointer to next column
     489           0 :         dataptr += DCTSIZE;
     490             :     }
     491           0 : }
     492             : 
     493           0 : static int query_formats(AVFilterContext *ctx)
     494             : {
     495             :     static const enum AVPixelFormat pix_fmts[] = {
     496             :         AV_PIX_FMT_YUV444P,  AV_PIX_FMT_YUV422P,
     497             :         AV_PIX_FMT_YUV420P,  AV_PIX_FMT_YUV411P,
     498             :         AV_PIX_FMT_YUV410P,  AV_PIX_FMT_YUV440P,
     499             :         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ422P,
     500             :         AV_PIX_FMT_YUVJ420P, AV_PIX_FMT_YUVJ440P,
     501             :         AV_PIX_FMT_GBRP, AV_PIX_FMT_GRAY8,
     502             :         AV_PIX_FMT_NONE
     503             :     };
     504             : 
     505           0 :     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
     506           0 :     if (!fmts_list)
     507           0 :         return AVERROR(ENOMEM);
     508           0 :     return ff_set_common_formats(ctx, fmts_list);
     509             : }
     510             : 
     511           0 : static int config_input(AVFilterLink *inlink)
     512             : {
     513           0 :     AVFilterContext *ctx = inlink->dst;
     514           0 :     FSPPContext *fspp = ctx->priv;
     515           0 :     const int h = FFALIGN(inlink->h + 16, 16);
     516           0 :     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
     517             : 
     518           0 :     fspp->hsub = desc->log2_chroma_w;
     519           0 :     fspp->vsub = desc->log2_chroma_h;
     520             : 
     521           0 :     fspp->temp_stride = FFALIGN(inlink->w + 16, 16);
     522           0 :     fspp->temp = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->temp));
     523           0 :     fspp->src  = av_malloc_array(fspp->temp_stride, h * sizeof(*fspp->src));
     524             : 
     525           0 :     if (!fspp->temp || !fspp->src)
     526           0 :         return AVERROR(ENOMEM);
     527             : 
     528           0 :     if (!fspp->use_bframe_qp && !fspp->qp) {
     529           0 :         fspp->non_b_qp_alloc_size = AV_CEIL_RSHIFT(inlink->w, 4) * AV_CEIL_RSHIFT(inlink->h, 4);
     530           0 :         fspp->non_b_qp_table = av_calloc(fspp->non_b_qp_alloc_size, sizeof(*fspp->non_b_qp_table));
     531           0 :         if (!fspp->non_b_qp_table)
     532           0 :             return AVERROR(ENOMEM);
     533             :     }
     534             : 
     535           0 :     fspp->store_slice  = store_slice_c;
     536           0 :     fspp->store_slice2 = store_slice2_c;
     537           0 :     fspp->mul_thrmat   = mul_thrmat_c;
     538           0 :     fspp->column_fidct = column_fidct_c;
     539           0 :     fspp->row_idct     = row_idct_c;
     540           0 :     fspp->row_fdct     = row_fdct_c;
     541             : 
     542             :     if (ARCH_X86)
     543           0 :         ff_fspp_init_x86(fspp);
     544             : 
     545           0 :     return 0;
     546             : }
     547             : 
     548           0 : static int filter_frame(AVFilterLink *inlink, AVFrame *in)
     549             : {
     550           0 :     AVFilterContext *ctx = inlink->dst;
     551           0 :     FSPPContext *fspp = ctx->priv;
     552           0 :     AVFilterLink *outlink = ctx->outputs[0];
     553           0 :     AVFrame *out = in;
     554             : 
     555           0 :     int qp_stride = 0;
     556           0 :     uint8_t *qp_table = NULL;
     557             :     int i, bias;
     558             :     int custom_threshold_m[64];
     559             : 
     560           0 :     bias = (1 << 4) + fspp->strength;
     561             : 
     562           0 :     for (i = 0; i < 64; i++) //FIXME: tune custom_threshold[] and remove this !
     563           0 :         custom_threshold_m[i] = (int)(custom_threshold[i] * (bias / 71.0) + 0.5);
     564             : 
     565           0 :     for (i = 0; i < 8; i++) {
     566           0 :         fspp->threshold_mtx_noq[2 * i] = (uint64_t)custom_threshold_m[i * 8 + 2]
     567           0 :                                       |(((uint64_t)custom_threshold_m[i * 8 + 6]) << 16)
     568           0 :                                       |(((uint64_t)custom_threshold_m[i * 8 + 0]) << 32)
     569           0 :                                       |(((uint64_t)custom_threshold_m[i * 8 + 4]) << 48);
     570             : 
     571           0 :         fspp->threshold_mtx_noq[2 * i + 1] = (uint64_t)custom_threshold_m[i * 8 + 5]
     572           0 :                                           |(((uint64_t)custom_threshold_m[i * 8 + 3]) << 16)
     573           0 :                                           |(((uint64_t)custom_threshold_m[i * 8 + 1]) << 32)
     574           0 :                                           |(((uint64_t)custom_threshold_m[i * 8 + 7]) << 48);
     575             :     }
     576             : 
     577           0 :     if (fspp->qp)
     578           0 :         fspp->prev_q = fspp->qp, fspp->mul_thrmat((int16_t *)(&fspp->threshold_mtx_noq[0]), (int16_t *)(&fspp->threshold_mtx[0]), fspp->qp);
     579             : 
     580             :     /* if we are not in a constant user quantizer mode and we don't want to use
     581             :      * the quantizers from the B-frames (B-frames often have a higher QP), we
     582             :      * need to save the qp table from the last non B-frame; this is what the
     583             :      * following code block does */
     584           0 :     if (!fspp->qp) {
     585           0 :         qp_table = av_frame_get_qp_table(in, &qp_stride, &fspp->qscale_type);
     586             : 
     587           0 :         if (qp_table && !fspp->use_bframe_qp && in->pict_type != AV_PICTURE_TYPE_B) {
     588             :             int w, h;
     589             : 
     590             :             /* if the qp stride is not set, it means the QP are only defined on
     591             :              * a line basis */
     592           0 :            if (!qp_stride) {
     593           0 :                 w = AV_CEIL_RSHIFT(inlink->w, 4);
     594           0 :                 h = 1;
     595             :             } else {
     596           0 :                 w = qp_stride;
     597           0 :                 h = AV_CEIL_RSHIFT(inlink->h, 4);
     598             :             }
     599           0 :             if (w * h > fspp->non_b_qp_alloc_size) {
     600           0 :                 int ret = av_reallocp_array(&fspp->non_b_qp_table, w, h);
     601           0 :                 if (ret < 0) {
     602           0 :                     fspp->non_b_qp_alloc_size = 0;
     603           0 :                     return ret;
     604             :                 }
     605           0 :                 fspp->non_b_qp_alloc_size = w * h;
     606             :             }
     607             : 
     608           0 :             av_assert0(w * h <= fspp->non_b_qp_alloc_size);
     609           0 :             memcpy(fspp->non_b_qp_table, qp_table, w * h);
     610             :         }
     611             :     }
     612             : 
     613           0 :     if (fspp->log2_count && !ctx->is_disabled) {
     614           0 :         if (!fspp->use_bframe_qp && fspp->non_b_qp_table)
     615           0 :             qp_table = fspp->non_b_qp_table;
     616             : 
     617           0 :         if (qp_table || fspp->qp) {
     618           0 :             const int cw = AV_CEIL_RSHIFT(inlink->w, fspp->hsub);
     619           0 :             const int ch = AV_CEIL_RSHIFT(inlink->h, fspp->vsub);
     620             : 
     621             :             /* get a new frame if in-place is not possible or if the dimensions
     622             :              * are not multiple of 8 */
     623           0 :             if (!av_frame_is_writable(in) || (inlink->w & 7) || (inlink->h & 7)) {
     624           0 :                 const int aligned_w = FFALIGN(inlink->w, 8);
     625           0 :                 const int aligned_h = FFALIGN(inlink->h, 8);
     626             : 
     627           0 :                 out = ff_get_video_buffer(outlink, aligned_w, aligned_h);
     628           0 :                 if (!out) {
     629           0 :                     av_frame_free(&in);
     630           0 :                     return AVERROR(ENOMEM);
     631             :                 }
     632           0 :                 av_frame_copy_props(out, in);
     633           0 :                 out->width = in->width;
     634           0 :                 out->height = in->height;
     635             :             }
     636             : 
     637           0 :             filter(fspp, out->data[0], in->data[0], out->linesize[0], in->linesize[0],
     638             :                    inlink->w, inlink->h, qp_table, qp_stride, 1);
     639           0 :             filter(fspp, out->data[1], in->data[1], out->linesize[1], in->linesize[1],
     640             :                    cw,        ch,        qp_table, qp_stride, 0);
     641           0 :             filter(fspp, out->data[2], in->data[2], out->linesize[2], in->linesize[2],
     642             :                    cw,        ch,        qp_table, qp_stride, 0);
     643           0 :             emms_c();
     644             :         }
     645             :     }
     646             : 
     647           0 :     if (in != out) {
     648           0 :         if (in->data[3])
     649           0 :             av_image_copy_plane(out->data[3], out->linesize[3],
     650           0 :                                 in ->data[3], in ->linesize[3],
     651             :                                 inlink->w, inlink->h);
     652           0 :         av_frame_free(&in);
     653             :     }
     654           0 :     return ff_filter_frame(outlink, out);
     655             : }
     656             : 
     657           0 : static av_cold void uninit(AVFilterContext *ctx)
     658             : {
     659           0 :     FSPPContext *fspp = ctx->priv;
     660           0 :     av_freep(&fspp->temp);
     661           0 :     av_freep(&fspp->src);
     662           0 :     av_freep(&fspp->non_b_qp_table);
     663           0 : }
     664             : 
     665             : static const AVFilterPad fspp_inputs[] = {
     666             :     {
     667             :         .name         = "default",
     668             :         .type         = AVMEDIA_TYPE_VIDEO,
     669             :         .config_props = config_input,
     670             :         .filter_frame = filter_frame,
     671             :     },
     672             :     { NULL }
     673             : };
     674             : 
     675             : static const AVFilterPad fspp_outputs[] = {
     676             :     {
     677             :         .name = "default",
     678             :         .type = AVMEDIA_TYPE_VIDEO,
     679             :     },
     680             :     { NULL }
     681             : };
     682             : 
     683             : AVFilter ff_vf_fspp = {
     684             :     .name            = "fspp",
     685             :     .description     = NULL_IF_CONFIG_SMALL("Apply Fast Simple Post-processing filter."),
     686             :     .priv_size       = sizeof(FSPPContext),
     687             :     .uninit          = uninit,
     688             :     .query_formats   = query_formats,
     689             :     .inputs          = fspp_inputs,
     690             :     .outputs         = fspp_outputs,
     691             :     .priv_class      = &fspp_class,
     692             :     .flags           = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
     693             : };

Generated by: LCOV version 1.13