LCOV - code coverage report
Current view: top level - libavfilter - vf_nnedi.c (source / functions) Hit Total Coverage
Test: coverage.info Lines: 0 641 0.0 %
Date: 2017-12-14 08:27:08 Functions: 0 28 0.0 %

          Line data    Source code
       1             : /*
       2             :  * Copyright (C) 2010-2011 Kevin Stone
       3             :  * Copyright (C) 2016 Paul B Mahol
       4             :  *
       5             :  * This file is part of FFmpeg.
       6             :  *
       7             :  * FFmpeg is free software; you can redistribute it and/or modify
       8             :  * it under the terms of the GNU General Public License as published by
       9             :  * the Free Software Foundation; either version 2 of the License, or
      10             :  * (at your option) any later version.
      11             :  *
      12             :  * FFmpeg is distributed in the hope that it will be useful,
      13             :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      14             :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      15             :  * GNU General Public License for more details.
      16             :  *
      17             :  * You should have received a copy of the GNU General Public License along
      18             :  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
      19             :  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
      20             :  */
      21             : 
      22             : #include <float.h>
      23             : 
      24             : #include "libavutil/common.h"
      25             : #include "libavutil/float_dsp.h"
      26             : #include "libavutil/imgutils.h"
      27             : #include "libavutil/opt.h"
      28             : #include "libavutil/pixdesc.h"
      29             : #include "avfilter.h"
      30             : #include "formats.h"
      31             : #include "internal.h"
      32             : #include "video.h"
      33             : 
      34             : typedef struct FrameData {
      35             :     uint8_t *paddedp[3];
      36             :     int padded_stride[3];
      37             :     int padded_width[3];
      38             :     int padded_height[3];
      39             : 
      40             :     uint8_t *dstp[3];
      41             :     int dst_stride[3];
      42             : 
      43             :     int field[3];
      44             : 
      45             :     int32_t *lcount[3];
      46             :     float *input;
      47             :     float *temp;
      48             : } FrameData;
      49             : 
      50             : typedef struct NNEDIContext {
      51             :     const AVClass *class;
      52             : 
      53             :     char *weights_file;
      54             : 
      55             :     AVFrame *src;
      56             :     AVFrame *second;
      57             :     AVFrame *dst;
      58             :     int eof;
      59             :     int64_t cur_pts;
      60             : 
      61             :     AVFloatDSPContext *fdsp;
      62             :     int nb_planes;
      63             :     int linesize[4];
      64             :     int planeheight[4];
      65             : 
      66             :     float *weights0;
      67             :     float *weights1[2];
      68             :     int asize;
      69             :     int nns;
      70             :     int xdia;
      71             :     int ydia;
      72             : 
      73             :     // Parameters
      74             :     int deint;
      75             :     int field;
      76             :     int process_plane;
      77             :     int nsize;
      78             :     int nnsparam;
      79             :     int qual;
      80             :     int etype;
      81             :     int pscrn;
      82             :     int fapprox;
      83             : 
      84             :     int max_value;
      85             : 
      86             :     void (*copy_pad)(const AVFrame *, FrameData *, struct NNEDIContext *, int);
      87             :     void (*evalfunc_0)(struct NNEDIContext *, FrameData *);
      88             :     void (*evalfunc_1)(struct NNEDIContext *, FrameData *);
      89             : 
      90             :     // Functions used in evalfunc_0
      91             :     void (*readpixels)(const uint8_t *, const int, float *);
      92             :     void (*compute_network0)(struct NNEDIContext *s, const float *, const float *, uint8_t *);
      93             :     int32_t (*process_line0)(const uint8_t *, int, uint8_t *, const uint8_t *, const int, const int, const int);
      94             : 
      95             :     // Functions used in evalfunc_1
      96             :     void (*extract)(const uint8_t *, const int, const int, const int, float *, float *);
      97             :     void (*dot_prod)(struct NNEDIContext *, const float *, const float *, float *, const int, const int, const float *);
      98             :     void (*expfunc)(float *, const int);
      99             :     void (*wae5)(const float *, const int, float *);
     100             : 
     101             :     FrameData frame_data;
     102             : } NNEDIContext;
     103             : 
     104             : #define OFFSET(x) offsetof(NNEDIContext, x)
     105             : #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
     106             : 
     107             : static const AVOption nnedi_options[] = {
     108             :     {"weights",  "set weights file", OFFSET(weights_file),  AV_OPT_TYPE_STRING, {.str="nnedi3_weights.bin"}, 0, 0, FLAGS },
     109             :     {"deint",         "set which frames to deinterlace", OFFSET(deint),         AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS, "deint" },
     110             :         {"all",        "deinterlace all frames",                       0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "deint" },
     111             :         {"interlaced", "only deinterlace frames marked as interlaced", 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "deint" },
     112             :     {"field",  "set mode of operation", OFFSET(field),         AV_OPT_TYPE_INT, {.i64=-1}, -2, 3, FLAGS, "field" },
     113             :         {"af", "use frame flags, both fields",  0, AV_OPT_TYPE_CONST, {.i64=-2}, 0, 0, FLAGS, "field" },
     114             :         {"a",  "use frame flags, single field", 0, AV_OPT_TYPE_CONST, {.i64=-1}, 0, 0, FLAGS, "field" },
     115             :         {"t",  "use top field only",            0, AV_OPT_TYPE_CONST, {.i64=0},  0, 0, FLAGS, "field" },
     116             :         {"b",  "use bottom field only",         0, AV_OPT_TYPE_CONST, {.i64=1},  0, 0, FLAGS, "field" },
     117             :         {"tf", "use both fields, top first",    0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "field" },
     118             :         {"bf", "use both fields, bottom first", 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, FLAGS, "field" },
     119             :     {"planes", "set which planes to process", OFFSET(process_plane), AV_OPT_TYPE_INT, {.i64=7}, 0, 7, FLAGS },
     120             :     {"nsize",  "set size of local neighborhood around each pixel, used by the predictor neural network", OFFSET(nsize), AV_OPT_TYPE_INT, {.i64=6}, 0, 6, FLAGS, "nsize" },
     121             :         {"s8x6",     NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "nsize" },
     122             :         {"s16x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "nsize" },
     123             :         {"s32x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "nsize" },
     124             :         {"s48x6",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, FLAGS, "nsize" },
     125             :         {"s8x4",     NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, FLAGS, "nsize" },
     126             :         {"s16x4",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=5}, 0, 0, FLAGS, "nsize" },
     127             :         {"s32x4",    NULL, 0, AV_OPT_TYPE_CONST, {.i64=6}, 0, 0, FLAGS, "nsize" },
     128             :     {"nns",    "set number of neurons in predictor neural network", OFFSET(nnsparam), AV_OPT_TYPE_INT, {.i64=1}, 0, 4, FLAGS, "nns" },
     129             :         {"n16",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "nns" },
     130             :         {"n32",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "nns" },
     131             :         {"n64",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "nns" },
     132             :         {"n128",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=3}, 0, 0, FLAGS, "nns" },
     133             :         {"n256",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=4}, 0, 0, FLAGS, "nns" },
     134             :     {"qual",  "set quality", OFFSET(qual), AV_OPT_TYPE_INT, {.i64=1}, 1, 2, FLAGS, "qual" },
     135             :         {"fast", NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "qual" },
     136             :         {"slow", NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "qual" },
     137             :     {"etype", "set which set of weights to use in the predictor", OFFSET(etype), AV_OPT_TYPE_INT, {.i64=0}, 0, 1, FLAGS, "etype" },
     138             :         {"a",  "weights trained to minimize absolute error", 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "etype" },
     139             :         {"s",  "weights trained to minimize squared error",  0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "etype" },
     140             :     {"pscrn", "set prescreening", OFFSET(pscrn), AV_OPT_TYPE_INT, {.i64=2}, 0, 2, FLAGS, "pscrn" },
     141             :         {"none",      NULL, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "pscrn" },
     142             :         {"original",  NULL, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "pscrn" },
     143             :         {"new",       NULL, 0, AV_OPT_TYPE_CONST, {.i64=2}, 0, 0, FLAGS, "pscrn" },
     144             :     {"fapprox",       NULL, OFFSET(fapprox),       AV_OPT_TYPE_INT, {.i64=0}, 0, 3, FLAGS },
     145             :     { NULL }
     146             : };
     147             : 
     148             : AVFILTER_DEFINE_CLASS(nnedi);
     149             : 
     150           0 : static int config_input(AVFilterLink *inlink)
     151             : {
     152           0 :     AVFilterContext *ctx = inlink->dst;
     153           0 :     NNEDIContext *s = ctx->priv;
     154           0 :     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(inlink->format);
     155             :     int ret;
     156             : 
     157           0 :     s->nb_planes = av_pix_fmt_count_planes(inlink->format);
     158           0 :     if ((ret = av_image_fill_linesizes(s->linesize, inlink->format, inlink->w)) < 0)
     159           0 :         return ret;
     160             : 
     161           0 :     s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
     162           0 :     s->planeheight[0] = s->planeheight[3] = inlink->h;
     163             : 
     164           0 :     return 0;
     165             : }
     166             : 
     167           0 : static int config_output(AVFilterLink *outlink)
     168             : {
     169           0 :     AVFilterContext *ctx = outlink->src;
     170           0 :     NNEDIContext *s = ctx->priv;
     171             : 
     172           0 :     outlink->time_base.num = ctx->inputs[0]->time_base.num;
     173           0 :     outlink->time_base.den = ctx->inputs[0]->time_base.den * 2;
     174           0 :     outlink->w             = ctx->inputs[0]->w;
     175           0 :     outlink->h             = ctx->inputs[0]->h;
     176             : 
     177           0 :     if (s->field > 1 || s->field == -2)
     178           0 :         outlink->frame_rate = av_mul_q(ctx->inputs[0]->frame_rate,
     179           0 :                                        (AVRational){2, 1});
     180             : 
     181           0 :     return 0;
     182             : }
     183             : 
     184           0 : static int query_formats(AVFilterContext *ctx)
     185             : {
     186             :     static const enum AVPixelFormat pix_fmts[] = {
     187             :         AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
     188             :         AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
     189             :         AV_PIX_FMT_YUV440P, AV_PIX_FMT_YUV444P,
     190             :         AV_PIX_FMT_YUVJ444P, AV_PIX_FMT_YUVJ440P,
     191             :         AV_PIX_FMT_YUVJ422P, AV_PIX_FMT_YUVJ420P,
     192             :         AV_PIX_FMT_YUVJ411P,
     193             :         AV_PIX_FMT_GBRP,
     194             :         AV_PIX_FMT_GRAY8,
     195             :         AV_PIX_FMT_NONE
     196             :     };
     197             : 
     198           0 :     AVFilterFormats *fmts_list = ff_make_format_list(pix_fmts);
     199           0 :     if (!fmts_list)
     200           0 :         return AVERROR(ENOMEM);
     201           0 :     return ff_set_common_formats(ctx, fmts_list);
     202             : }
     203             : 
     204           0 : static void copy_pad(const AVFrame *src, FrameData *frame_data, NNEDIContext *s, int fn)
     205             : {
     206           0 :     const int off = 1 - fn;
     207             :     int plane, y, x;
     208             : 
     209           0 :     for (plane = 0; plane < s->nb_planes; plane++) {
     210           0 :         const uint8_t *srcp = (const uint8_t *)src->data[plane];
     211           0 :         uint8_t *dstp = (uint8_t *)frame_data->paddedp[plane];
     212             : 
     213           0 :         const int src_stride = src->linesize[plane];
     214           0 :         const int dst_stride = frame_data->padded_stride[plane];
     215             : 
     216           0 :         const int src_height = s->planeheight[plane];
     217           0 :         const int dst_height = frame_data->padded_height[plane];
     218             : 
     219           0 :         const int src_width = s->linesize[plane];
     220           0 :         const int dst_width = frame_data->padded_width[plane];
     221             : 
     222           0 :         int c = 4;
     223             : 
     224           0 :         if (!(s->process_plane & (1 << plane)))
     225           0 :             continue;
     226             : 
     227             :         // Copy.
     228           0 :         for (y = off; y < src_height; y += 2)
     229           0 :             memcpy(dstp + 32 + (6 + y) * dst_stride,
     230           0 :                    srcp + y * src_stride,
     231             :                    src_width * sizeof(uint8_t));
     232             : 
     233             :         // And pad.
     234           0 :         dstp += (6 + off) * dst_stride;
     235           0 :         for (y = 6 + off; y < dst_height - 6; y += 2) {
     236           0 :             int c = 2;
     237             : 
     238           0 :             for (x = 0; x < 32; x++)
     239           0 :                 dstp[x] = dstp[64 - x];
     240             : 
     241           0 :             for (x = dst_width - 32; x < dst_width; x++, c += 2)
     242           0 :                 dstp[x] = dstp[x - c];
     243             : 
     244           0 :             dstp += dst_stride * 2;
     245             :         }
     246             : 
     247           0 :         dstp = (uint8_t *)frame_data->paddedp[plane];
     248           0 :         for (y = off; y < 6; y += 2)
     249           0 :             memcpy(dstp + y * dst_stride,
     250           0 :                    dstp + (12 + 2 * off - y) * dst_stride,
     251             :                    dst_width * sizeof(uint8_t));
     252             : 
     253           0 :         for (y = dst_height - 6 + off; y < dst_height; y += 2, c += 4)
     254           0 :             memcpy(dstp + y * dst_stride,
     255           0 :                    dstp + (y - c) * dst_stride,
     256             :                    dst_width * sizeof(uint8_t));
     257             :     }
     258           0 : }
     259             : 
     260           0 : static void elliott(float *data, const int n)
     261             : {
     262             :     int i;
     263             : 
     264           0 :     for (i = 0; i < n; i++)
     265           0 :         data[i] = data[i] / (1.0f + FFABS(data[i]));
     266           0 : }
     267             : 
     268           0 : static void dot_prod(NNEDIContext *s, const float *data, const float *weights, float *vals, const int n, const int len, const float *scale)
     269             : {
     270             :     int i;
     271             : 
     272           0 :     for (i = 0; i < n; i++) {
     273             :         float sum;
     274             : 
     275           0 :         sum = s->fdsp->scalarproduct_float(data, &weights[i * len], len);
     276             : 
     277           0 :         vals[i] = sum * scale[0] + weights[n * len + i];
     278             :     }
     279           0 : }
     280             : 
     281           0 : static void dot_prods(NNEDIContext *s, const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *scale)
     282             : {
     283           0 :     const int16_t *data = (int16_t *)dataf;
     284           0 :     const int16_t *weights = (int16_t *)weightsf;
     285           0 :     const float *wf = (float *)&weights[n * len];
     286             :     int i, j;
     287             : 
     288           0 :     for (i = 0; i < n; i++) {
     289           0 :         int sum = 0, off = ((i >> 2) << 3) + (i & 3);
     290           0 :         for (j = 0; j < len; j++)
     291           0 :             sum += data[j] * weights[i * len + j];
     292             : 
     293           0 :         vals[i] = sum * wf[off] * scale[0] + wf[off + 4];
     294             :     }
     295           0 : }
     296             : 
     297           0 : static void compute_network0(NNEDIContext *s, const float *input, const float *weights, uint8_t *d)
     298             : {
     299           0 :     float t, temp[12], scale = 1.0f;
     300             : 
     301           0 :     dot_prod(s, input, weights, temp, 4, 48, &scale);
     302           0 :     t = temp[0];
     303           0 :     elliott(temp, 4);
     304           0 :     temp[0] = t;
     305           0 :     dot_prod(s, temp, weights + 4 * 49, temp + 4, 4, 4, &scale);
     306           0 :     elliott(temp + 4, 4);
     307           0 :     dot_prod(s, temp, weights + 4 * 49 + 4 * 5, temp + 8, 4, 8, &scale);
     308           0 :     if (FFMAX(temp[10], temp[11]) <= FFMAX(temp[8], temp[9]))
     309           0 :         d[0] = 1;
     310             :     else
     311           0 :         d[0] = 0;
     312           0 : }
     313             : 
     314           0 : static void compute_network0_i16(NNEDIContext *s, const float *inputf, const float *weightsf, uint8_t *d)
     315             : {
     316           0 :     const float *wf = weightsf + 2 * 48;
     317           0 :     float t, temp[12], scale = 1.0f;
     318             : 
     319           0 :     dot_prods(s, inputf, weightsf, temp, 4, 48, &scale);
     320           0 :     t = temp[0];
     321           0 :     elliott(temp, 4);
     322           0 :     temp[0] = t;
     323           0 :     dot_prod(s, temp, wf + 8, temp + 4, 4, 4, &scale);
     324           0 :     elliott(temp + 4, 4);
     325           0 :     dot_prod(s, temp, wf + 8 + 4 * 5, temp + 8, 4, 8, &scale);
     326           0 :     if (FFMAX(temp[10], temp[11]) <= FFMAX(temp[8], temp[9]))
     327           0 :         d[0] = 1;
     328             :     else
     329           0 :         d[0] = 0;
     330           0 : }
     331             : 
     332           0 : static void pixel2float48(const uint8_t *t8, const int pitch, float *p)
     333             : {
     334           0 :     const uint8_t *t = (const uint8_t *)t8;
     335             :     int y, x;
     336             : 
     337           0 :     for (y = 0; y < 4; y++)
     338           0 :         for (x = 0; x < 12; x++)
     339           0 :             p[y * 12 + x] = t[y * pitch * 2 + x];
     340           0 : }
     341             : 
     342           0 : static void byte2word48(const uint8_t *t, const int pitch, float *pf)
     343             : {
     344           0 :     int16_t *p = (int16_t *)pf;
     345             :     int y, x;
     346             : 
     347           0 :     for (y = 0; y < 4; y++)
     348           0 :         for (x = 0; x < 12; x++)
     349           0 :             p[y * 12 + x] = t[y * pitch * 2 + x];
     350           0 : }
     351             : 
     352           0 : static int32_t process_line0(const uint8_t *tempu, int width, uint8_t *dstp8, const uint8_t *src3p8, const int src_pitch, const int max_value, const int chroma)
     353             : {
     354           0 :     uint8_t *dstp = (uint8_t *)dstp8;
     355           0 :     const uint8_t *src3p = (const uint8_t *)src3p8;
     356           0 :     int minimum = 0;
     357           0 :     int maximum = max_value - 1; // Technically the -1 is only needed for 8 and 16 bit input.
     358           0 :     int count = 0, x;
     359           0 :     for (x = 0; x < width; x++) {
     360           0 :         if (tempu[x]) {
     361           0 :             int tmp = 19 * (src3p[x + src_pitch * 2] + src3p[x + src_pitch * 4]) - 3 * (src3p[x] + src3p[x + src_pitch * 6]);
     362           0 :             tmp /= 32;
     363           0 :             dstp[x] = FFMAX(FFMIN(tmp, maximum), minimum);
     364             :         } else {
     365           0 :             dstp[x] = 255;
     366           0 :             count++;
     367             :         }
     368             :     }
     369           0 :     return count;
     370             : }
     371             : 
     372             : // new prescreener functions
     373           0 : static void byte2word64(const uint8_t *t, const int pitch, float *p)
     374             : {
     375           0 :     int16_t *ps = (int16_t *)p;
     376             :     int y, x;
     377             : 
     378           0 :     for (y = 0; y < 4; y++)
     379           0 :         for (x = 0; x < 16; x++)
     380           0 :             ps[y * 16 + x] = t[y * pitch * 2 + x];
     381           0 : }
     382             : 
     383           0 : static void compute_network0new(NNEDIContext *s, const float *datai, const float *weights, uint8_t *d)
     384             : {
     385           0 :     int16_t *data = (int16_t *)datai;
     386           0 :     int16_t *ws = (int16_t *)weights;
     387           0 :     float *wf = (float *)&ws[4 * 64];
     388             :     float vals[8];
     389             :     int mask, i, j;
     390             : 
     391           0 :     for (i = 0; i < 4; i++) {
     392           0 :         int sum = 0;
     393             :         float t;
     394             : 
     395           0 :         for (j = 0; j < 64; j++)
     396           0 :             sum += data[j] * ws[(i << 3) + ((j >> 3) << 5) + (j & 7)];
     397           0 :         t = sum * wf[i] + wf[4 + i];
     398           0 :         vals[i] = t / (1.0f + FFABS(t));
     399             :     }
     400             : 
     401           0 :     for (i = 0; i < 4; i++) {
     402           0 :         float sum = 0.0f;
     403             : 
     404           0 :         for (j = 0; j < 4; j++)
     405           0 :             sum += vals[j] * wf[8 + i + (j << 2)];
     406           0 :         vals[4 + i] = sum + wf[8 + 16 + i];
     407             :     }
     408             : 
     409           0 :     mask = 0;
     410           0 :     for (i = 0; i < 4; i++) {
     411           0 :         if (vals[4 + i] > 0.0f)
     412           0 :             mask |= (0x1 << (i << 3));
     413             :     }
     414             : 
     415           0 :     ((int *)d)[0] = mask;
     416           0 : }
     417             : 
     418           0 : static void evalfunc_0(NNEDIContext *s, FrameData *frame_data)
     419             : {
     420           0 :     float *input = frame_data->input;
     421           0 :     const float *weights0 = s->weights0;
     422           0 :     float *temp = frame_data->temp;
     423           0 :     uint8_t *tempu = (uint8_t *)temp;
     424             :     int plane, x, y;
     425             : 
     426             :     // And now the actual work.
     427           0 :     for (plane = 0; plane < s->nb_planes; plane++) {
     428           0 :         const uint8_t *srcp = (const uint8_t *)frame_data->paddedp[plane];
     429           0 :         const int src_stride = frame_data->padded_stride[plane] / sizeof(uint8_t);
     430             : 
     431           0 :         const int width = frame_data->padded_width[plane];
     432           0 :         const int height = frame_data->padded_height[plane];
     433             : 
     434           0 :         uint8_t *dstp = (uint8_t *)frame_data->dstp[plane];
     435           0 :         const int dst_stride = frame_data->dst_stride[plane] / sizeof(uint8_t);
     436             :         const uint8_t *src3p;
     437             :         int ystart, ystop;
     438             :         int32_t *lcount;
     439             : 
     440           0 :         if (!(s->process_plane & (1 << plane)))
     441           0 :             continue;
     442             : 
     443           0 :         for (y = 1 - frame_data->field[plane]; y < height - 12; y += 2) {
     444           0 :             memcpy(dstp + y * dst_stride,
     445           0 :                    srcp + 32 + (6 + y) * src_stride,
     446           0 :                    (width - 64) * sizeof(uint8_t));
     447             : 
     448             :         }
     449             : 
     450           0 :         ystart = 6 + frame_data->field[plane];
     451           0 :         ystop = height - 6;
     452           0 :         srcp += ystart * src_stride;
     453           0 :         dstp += (ystart - 6) * dst_stride - 32;
     454           0 :         src3p = srcp - src_stride * 3;
     455           0 :         lcount = frame_data->lcount[plane] - 6;
     456             : 
     457           0 :         if (s->pscrn == 1) { // original
     458           0 :             for (y = ystart; y < ystop; y += 2) {
     459           0 :                 for (x = 32; x < width - 32; x++) {
     460           0 :                     s->readpixels((const uint8_t *)(src3p + x - 5), src_stride, input);
     461           0 :                     s->compute_network0(s, input, weights0, tempu+x);
     462             :                 }
     463           0 :                 lcount[y] += s->process_line0(tempu + 32, width - 64, (uint8_t *)(dstp + 32), (const uint8_t *)(src3p + 32), src_stride, s->max_value, plane);
     464           0 :                 src3p += src_stride * 2;
     465           0 :                 dstp += dst_stride * 2;
     466             :             }
     467           0 :         } else if (s->pscrn > 1) { // new
     468           0 :             for (y = ystart; y < ystop; y += 2) {
     469           0 :                 for (x = 32; x < width - 32; x += 4) {
     470           0 :                     s->readpixels((const uint8_t *)(src3p + x - 6), src_stride, input);
     471           0 :                     s->compute_network0(s, input, weights0, tempu + x);
     472             :                 }
     473           0 :                 lcount[y] += s->process_line0(tempu + 32, width - 64, (uint8_t *)(dstp + 32), (const uint8_t *)(src3p + 32), src_stride, s->max_value, plane);
     474           0 :                 src3p += src_stride * 2;
     475           0 :                 dstp += dst_stride * 2;
     476             :             }
     477             :         } else { // no prescreening
     478           0 :             for (y = ystart; y < ystop; y += 2) {
     479           0 :                 memset(dstp + 32, 255, (width - 64) * sizeof(uint8_t));
     480           0 :                 lcount[y] += width - 64;
     481           0 :                 dstp += dst_stride * 2;
     482             :             }
     483             :         }
     484             :     }
     485           0 : }
     486             : 
     487           0 : static void extract_m8(const uint8_t *srcp8, const int stride, const int xdia, const int ydia, float *mstd, float *input)
     488             : {
     489             :     // uint8_t or uint16_t or float
     490           0 :     const uint8_t *srcp = (const uint8_t *)srcp8;
     491             :     float scale;
     492             :     double tmp;
     493             : 
     494             :     // int32_t or int64_t or double
     495           0 :     int64_t sum = 0, sumsq = 0;
     496             :     int y, x;
     497             : 
     498           0 :     for (y = 0; y < ydia; y++) {
     499           0 :         const uint8_t *srcpT = srcp + y * stride * 2;
     500             : 
     501           0 :         for (x = 0; x < xdia; x++) {
     502           0 :             sum += srcpT[x];
     503           0 :             sumsq += (uint32_t)srcpT[x] * (uint32_t)srcpT[x];
     504           0 :             input[x] = srcpT[x];
     505             :         }
     506           0 :         input += xdia;
     507             :     }
     508           0 :     scale = 1.0f / (xdia * ydia);
     509           0 :     mstd[0] = sum * scale;
     510           0 :     tmp = (double)sumsq * scale - (double)mstd[0] * mstd[0];
     511           0 :     mstd[3] = 0.0f;
     512           0 :     if (tmp <= FLT_EPSILON)
     513           0 :         mstd[1] = mstd[2] = 0.0f;
     514             :     else {
     515           0 :         mstd[1] = sqrt(tmp);
     516           0 :         mstd[2] = 1.0f / mstd[1];
     517             :     }
     518           0 : }
     519             : 
     520           0 : static void extract_m8_i16(const uint8_t *srcp, const int stride, const int xdia, const int ydia, float *mstd, float *inputf)
     521             : {
     522           0 :     int16_t *input = (int16_t *)inputf;
     523             :     float scale;
     524           0 :     int sum = 0, sumsq = 0;
     525             :     int y, x;
     526             : 
     527           0 :     for (y = 0; y < ydia; y++) {
     528           0 :         const uint8_t *srcpT = srcp + y * stride * 2;
     529           0 :         for (x = 0; x < xdia; x++) {
     530           0 :             sum += srcpT[x];
     531           0 :             sumsq += srcpT[x] * srcpT[x];
     532           0 :             input[x] = srcpT[x];
     533             :         }
     534           0 :         input += xdia;
     535             :     }
     536           0 :     scale = 1.0f / (float)(xdia * ydia);
     537           0 :     mstd[0] = sum * scale;
     538           0 :     mstd[1] = sumsq * scale - mstd[0] * mstd[0];
     539           0 :     mstd[3] = 0.0f;
     540           0 :     if (mstd[1] <= FLT_EPSILON)
     541           0 :         mstd[1] = mstd[2] = 0.0f;
     542             :     else {
     543           0 :         mstd[1] = sqrt(mstd[1]);
     544           0 :         mstd[2] = 1.0f / mstd[1];
     545             :     }
     546           0 : }
     547             : 
     548             : 
     549             : static const float exp_lo = -80.0f;
     550             : static const float exp_hi = +80.0f;
     551             : 
     552           0 : static void e2_m16(float *s, const int n)
     553             : {
     554             :     int i;
     555             : 
     556           0 :     for (i = 0; i < n; i++)
     557           0 :         s[i] = exp(av_clipf(s[i], exp_lo, exp_hi));
     558           0 : }
     559             : 
     560             : const float min_weight_sum = 1e-10f;
     561             : 
     562           0 : static void weighted_avg_elliott_mul5_m16(const float *w, const int n, float *mstd)
     563             : {
     564           0 :     float vsum = 0.0f, wsum = 0.0f;
     565             :     int i;
     566             : 
     567           0 :     for (i = 0; i < n; i++) {
     568           0 :         vsum += w[i] * (w[n + i] / (1.0f + FFABS(w[n + i])));
     569           0 :         wsum += w[i];
     570             :     }
     571           0 :     if (wsum > min_weight_sum)
     572           0 :         mstd[3] += ((5.0f * vsum) / wsum) * mstd[1] + mstd[0];
     573             :     else
     574           0 :         mstd[3] += mstd[0];
     575           0 : }
     576             : 
     577             : 
     578           0 : static void evalfunc_1(NNEDIContext *s, FrameData *frame_data)
     579             : {
     580           0 :     float *input = frame_data->input;
     581           0 :     float *temp = frame_data->temp;
     582           0 :     float **weights1 = s->weights1;
     583           0 :     const int qual = s->qual;
     584           0 :     const int asize = s->asize;
     585           0 :     const int nns = s->nns;
     586           0 :     const int xdia = s->xdia;
     587           0 :     const int xdiad2m1 = (xdia / 2) - 1;
     588           0 :     const int ydia = s->ydia;
     589           0 :     const float scale = 1.0f / (float)qual;
     590             :     int plane, y, x, i;
     591             : 
     592           0 :     for (plane = 0; plane < s->nb_planes; plane++) {
     593           0 :         const uint8_t *srcp = (const uint8_t *)frame_data->paddedp[plane];
     594           0 :         const int src_stride = frame_data->padded_stride[plane] / sizeof(uint8_t);
     595             : 
     596           0 :         const int width = frame_data->padded_width[plane];
     597           0 :         const int height = frame_data->padded_height[plane];
     598             : 
     599           0 :         uint8_t *dstp = (uint8_t *)frame_data->dstp[plane];
     600           0 :         const int dst_stride = frame_data->dst_stride[plane] / sizeof(uint8_t);
     601             : 
     602           0 :         const int ystart = frame_data->field[plane];
     603           0 :         const int ystop = height - 12;
     604             :         const uint8_t *srcpp;
     605             : 
     606           0 :         if (!(s->process_plane & (1 << plane)))
     607           0 :             continue;
     608             : 
     609           0 :         srcp += (ystart + 6) * src_stride;
     610           0 :         dstp += ystart * dst_stride - 32;
     611           0 :         srcpp = srcp - (ydia - 1) * src_stride - xdiad2m1;
     612             : 
     613           0 :         for (y = ystart; y < ystop; y += 2) {
     614           0 :             for (x = 32; x < width - 32; x++) {
     615             :                 float mstd[4];
     616             : 
     617           0 :                 if (dstp[x] != 255)
     618           0 :                     continue;
     619             : 
     620           0 :                 s->extract((const uint8_t *)(srcpp + x), src_stride, xdia, ydia, mstd, input);
     621           0 :                 for (i = 0; i < qual; i++) {
     622           0 :                     s->dot_prod(s, input, weights1[i], temp, nns * 2, asize, mstd + 2);
     623           0 :                     s->expfunc(temp, nns);
     624           0 :                     s->wae5(temp, nns, mstd);
     625             :                 }
     626             : 
     627           0 :                 dstp[x] = FFMIN(FFMAX((int)(mstd[3] * scale + 0.5f), 0), s->max_value);
     628             :             }
     629           0 :             srcpp += src_stride * 2;
     630           0 :             dstp += dst_stride * 2;
     631             :         }
     632             :     }
     633           0 : }
     634             : 
     635             : #define NUM_NSIZE 7
     636             : #define NUM_NNS 5
     637             : 
     638           0 : static int roundds(const double f)
     639             : {
     640           0 :     if (f - floor(f) >= 0.5)
     641           0 :         return FFMIN((int)ceil(f), 32767);
     642           0 :     return FFMAX((int)floor(f), -32768);
     643             : }
     644             : 
     645           0 : static void select_functions(NNEDIContext *s)
     646             : {
     647           0 :     s->copy_pad = copy_pad;
     648           0 :     s->evalfunc_0 = evalfunc_0;
     649           0 :     s->evalfunc_1 = evalfunc_1;
     650             : 
     651             :     // evalfunc_0
     652           0 :     s->process_line0 = process_line0;
     653             : 
     654           0 :     if (s->pscrn < 2) { // original prescreener
     655           0 :         if (s->fapprox & 1) { // int16 dot products
     656           0 :             s->readpixels = byte2word48;
     657           0 :             s->compute_network0 = compute_network0_i16;
     658             :         } else {
     659           0 :             s->readpixels = pixel2float48;
     660           0 :             s->compute_network0 = compute_network0;
     661             :         }
     662             :     } else { // new prescreener
     663             :         // only int16 dot products
     664           0 :         s->readpixels = byte2word64;
     665           0 :         s->compute_network0 = compute_network0new;
     666             :     }
     667             : 
     668             :     // evalfunc_1
     669           0 :     s->wae5 = weighted_avg_elliott_mul5_m16;
     670             : 
     671           0 :     if (s->fapprox & 2) { // use int16 dot products
     672           0 :         s->extract = extract_m8_i16;
     673           0 :         s->dot_prod = dot_prods;
     674             :     } else { // use float dot products
     675           0 :         s->extract = extract_m8;
     676           0 :         s->dot_prod = dot_prod;
     677             :     }
     678             : 
     679           0 :     s->expfunc = e2_m16;
     680           0 : }
     681             : 
     682           0 : static int modnpf(const int m, const int n)
     683             : {
     684           0 :     if ((m % n) == 0)
     685           0 :         return m;
     686           0 :     return m + n - (m % n);
     687             : }
     688             : 
     689           0 : static int get_frame(AVFilterContext *ctx, int is_second)
     690             : {
     691           0 :     NNEDIContext *s = ctx->priv;
     692           0 :     AVFilterLink *outlink = ctx->outputs[0];
     693           0 :     AVFrame *src = s->src;
     694             :     FrameData *frame_data;
     695           0 :     int effective_field = s->field;
     696             :     size_t temp_size;
     697             :     int field_n;
     698             :     int plane;
     699             : 
     700           0 :     if (effective_field > 1)
     701           0 :         effective_field -= 2;
     702           0 :     else if (effective_field < 0)
     703           0 :         effective_field += 2;
     704             : 
     705           0 :     if (s->field < 0 && src->interlaced_frame && src->top_field_first == 0)
     706           0 :         effective_field = 0;
     707           0 :     else if (s->field < 0 && src->interlaced_frame && src->top_field_first == 1)
     708           0 :         effective_field = 1;
     709             :     else
     710           0 :         effective_field = !effective_field;
     711             : 
     712           0 :     if (s->field > 1 || s->field == -2) {
     713           0 :         if (is_second) {
     714           0 :             field_n = (effective_field == 0);
     715             :         } else {
     716           0 :             field_n = (effective_field == 1);
     717             :         }
     718             :     } else {
     719           0 :         field_n = effective_field;
     720             :     }
     721             : 
     722           0 :     s->dst = ff_get_video_buffer(outlink, outlink->w, outlink->h);
     723           0 :     if (!s->dst)
     724           0 :         return AVERROR(ENOMEM);
     725           0 :     av_frame_copy_props(s->dst, src);
     726           0 :     s->dst->interlaced_frame = 0;
     727             : 
     728           0 :     frame_data = &s->frame_data;
     729             : 
     730           0 :     for (plane = 0; plane < s->nb_planes; plane++) {
     731           0 :         int dst_height = s->planeheight[plane];
     732           0 :         int dst_width = s->linesize[plane];
     733             : 
     734           0 :         const int min_alignment = 16;
     735           0 :         const int min_pad = 10;
     736             : 
     737           0 :         if (!(s->process_plane & (1 << plane))) {
     738           0 :             av_image_copy_plane(s->dst->data[plane], s->dst->linesize[plane],
     739           0 :                                 src->data[plane], src->linesize[plane],
     740             :                                 s->linesize[plane],
     741             :                                 s->planeheight[plane]);
     742           0 :             continue;
     743             :         }
     744             : 
     745           0 :         frame_data->padded_width[plane]  = dst_width + 64;
     746           0 :         frame_data->padded_height[plane] = dst_height + 12;
     747           0 :         frame_data->padded_stride[plane] = modnpf(frame_data->padded_width[plane] + min_pad, min_alignment); // TODO: maybe min_pad is in pixels too?
     748           0 :         if (!frame_data->paddedp[plane]) {
     749           0 :             frame_data->paddedp[plane] = av_malloc_array(frame_data->padded_stride[plane], frame_data->padded_height[plane]);
     750           0 :             if (!frame_data->paddedp[plane])
     751           0 :                 return AVERROR(ENOMEM);
     752             :         }
     753             : 
     754           0 :         frame_data->dstp[plane] = s->dst->data[plane];
     755           0 :         frame_data->dst_stride[plane] = s->dst->linesize[plane];
     756             : 
     757           0 :         if (!frame_data->lcount[plane]) {
     758           0 :             frame_data->lcount[plane] = av_calloc(dst_height, sizeof(int32_t) * 16);
     759           0 :             if (!frame_data->lcount[plane])
     760           0 :                 return AVERROR(ENOMEM);
     761             :         } else {
     762           0 :             memset(frame_data->lcount[plane], 0, dst_height * sizeof(int32_t) * 16);
     763             :         }
     764             : 
     765           0 :         frame_data->field[plane] = field_n;
     766             :     }
     767             : 
     768           0 :     if (!frame_data->input) {
     769           0 :         frame_data->input = av_malloc(512 * sizeof(float));
     770           0 :         if (!frame_data->input)
     771           0 :             return AVERROR(ENOMEM);
     772             :     }
     773             :     // evalfunc_0 requires at least padded_width[0] bytes.
     774             :     // evalfunc_1 requires at least 512 floats.
     775           0 :     if (!frame_data->temp) {
     776           0 :         temp_size = FFMAX(frame_data->padded_width[0], 512 * sizeof(float));
     777           0 :         frame_data->temp = av_malloc(temp_size);
     778           0 :         if (!frame_data->temp)
     779           0 :             return AVERROR(ENOMEM);
     780             :     }
     781             : 
     782             :     // Copy src to a padded "frame" in frame_data and mirror the edges.
     783           0 :     s->copy_pad(src, frame_data, s, field_n);
     784             : 
     785             :     // Handles prescreening and the cubic interpolation.
     786           0 :     s->evalfunc_0(s, frame_data);
     787             : 
     788             :     // The rest.
     789           0 :     s->evalfunc_1(s, frame_data);
     790             : 
     791           0 :     return 0;
     792             : }
     793             : 
     794           0 : static int filter_frame(AVFilterLink *inlink, AVFrame *src)
     795             : {
     796           0 :     AVFilterContext *ctx = inlink->dst;
     797           0 :     AVFilterLink *outlink = ctx->outputs[0];
     798           0 :     NNEDIContext *s = ctx->priv;
     799             :     int ret;
     800             : 
     801           0 :     if ((s->field > 1 ||
     802           0 :          s->field == -2) && !s->second) {
     803             :         goto second;
     804           0 :     } else if (s->field > 1 ||
     805           0 :                s->field == -2) {
     806             :         AVFrame *dst;
     807             : 
     808           0 :         s->src = s->second;
     809           0 :         ret = get_frame(ctx, 1);
     810           0 :         if (ret < 0) {
     811           0 :             av_frame_free(&s->dst);
     812           0 :             av_frame_free(&s->src);
     813           0 :             av_frame_free(&s->second);
     814           0 :             return ret;
     815             :         }
     816           0 :         dst = s->dst;
     817             : 
     818           0 :         if (src->pts != AV_NOPTS_VALUE &&
     819           0 :             dst->pts != AV_NOPTS_VALUE)
     820           0 :             dst->pts += src->pts;
     821             :         else
     822           0 :             dst->pts = AV_NOPTS_VALUE;
     823             : 
     824           0 :         ret = ff_filter_frame(outlink, dst);
     825           0 :         if (ret < 0)
     826           0 :             return ret;
     827           0 :         if (s->eof)
     828           0 :             return 0;
     829           0 :         s->cur_pts = s->second->pts;
     830           0 :         av_frame_free(&s->second);
     831           0 : second:
     832           0 :         if ((s->deint && src->interlaced_frame &&
     833           0 :              !ctx->is_disabled) ||
     834           0 :             (!s->deint && !ctx->is_disabled)) {
     835           0 :             s->second = src;
     836             :         }
     837             :     }
     838             : 
     839           0 :     if ((s->deint && !src->interlaced_frame) || ctx->is_disabled) {
     840           0 :         AVFrame *dst = av_frame_clone(src);
     841           0 :         if (!dst) {
     842           0 :             av_frame_free(&src);
     843           0 :             av_frame_free(&s->second);
     844           0 :             return AVERROR(ENOMEM);
     845             :         }
     846             : 
     847           0 :         if (s->field > 1 || s->field == -2) {
     848           0 :             av_frame_free(&s->second);
     849           0 :             if ((s->deint && src->interlaced_frame) ||
     850           0 :                 (!s->deint))
     851           0 :                 s->second = src;
     852             :         } else {
     853           0 :             av_frame_free(&src);
     854             :         }
     855           0 :         if (dst->pts != AV_NOPTS_VALUE)
     856           0 :             dst->pts *= 2;
     857           0 :         return ff_filter_frame(outlink, dst);
     858             :     }
     859             : 
     860           0 :     s->src = src;
     861           0 :     ret = get_frame(ctx, 0);
     862           0 :     if (ret < 0) {
     863           0 :         av_frame_free(&s->dst);
     864           0 :         av_frame_free(&s->src);
     865           0 :         av_frame_free(&s->second);
     866           0 :         return ret;
     867             :     }
     868             : 
     869           0 :     if (src->pts != AV_NOPTS_VALUE)
     870           0 :         s->dst->pts = src->pts * 2;
     871           0 :     if (s->field <= 1 && s->field > -2) {
     872           0 :         av_frame_free(&src);
     873           0 :         s->src = NULL;
     874             :     }
     875             : 
     876           0 :     return ff_filter_frame(outlink, s->dst);
     877             : }
     878             : 
     879           0 : static int request_frame(AVFilterLink *link)
     880             : {
     881           0 :     AVFilterContext *ctx = link->src;
     882           0 :     NNEDIContext *s = ctx->priv;
     883             :     int ret;
     884             : 
     885           0 :     if (s->eof)
     886           0 :         return AVERROR_EOF;
     887             : 
     888           0 :     ret  = ff_request_frame(ctx->inputs[0]);
     889             : 
     890           0 :     if (ret == AVERROR_EOF && s->second) {
     891           0 :         AVFrame *next = av_frame_clone(s->second);
     892             : 
     893           0 :         if (!next)
     894           0 :             return AVERROR(ENOMEM);
     895             : 
     896           0 :         next->pts = s->second->pts * 2 - s->cur_pts;
     897           0 :         s->eof = 1;
     898             : 
     899           0 :         filter_frame(ctx->inputs[0], next);
     900           0 :     } else if (ret < 0) {
     901           0 :         return ret;
     902             :     }
     903             : 
     904           0 :     return 0;
     905             : }
     906             : 
     907           0 : static av_cold int init(AVFilterContext *ctx)
     908             : {
     909           0 :     NNEDIContext *s = ctx->priv;
     910           0 :     FILE *weights_file = NULL;
     911           0 :     int64_t expected_size = 13574928;
     912             :     int64_t weights_size;
     913             :     float *bdata;
     914             :     size_t bytes_read;
     915           0 :     const int xdia_table[NUM_NSIZE] = { 8, 16, 32, 48, 8, 16, 32 };
     916           0 :     const int ydia_table[NUM_NSIZE] = { 6, 6, 6, 6, 4, 4, 4 };
     917           0 :     const int nns_table[NUM_NNS] = { 16, 32, 64, 128, 256 };
     918           0 :     const int dims0 = 49 * 4 + 5 * 4 + 9 * 4;
     919           0 :     const int dims0new = 4 * 65 + 4 * 5;
     920           0 :     const int dims1 = nns_table[s->nnsparam] * 2 * (xdia_table[s->nsize] * ydia_table[s->nsize] + 1);
     921           0 :     int dims1tsize = 0;
     922           0 :     int dims1offset = 0;
     923           0 :     int ret = 0, i, j, k;
     924             : 
     925           0 :     weights_file = fopen(s->weights_file, "rb");
     926           0 :     if (!weights_file) {
     927           0 :         av_log(ctx, AV_LOG_ERROR, "No weights file provided, aborting!\n");
     928           0 :         return AVERROR(EINVAL);
     929             :     }
     930             : 
     931           0 :     if (fseek(weights_file, 0, SEEK_END)) {
     932           0 :         av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the end of weights file.\n");
     933           0 :         fclose(weights_file);
     934           0 :         return AVERROR(EINVAL);
     935             :     }
     936             : 
     937           0 :     weights_size = ftell(weights_file);
     938             : 
     939           0 :     if (weights_size == -1) {
     940           0 :         fclose(weights_file);
     941           0 :         av_log(ctx, AV_LOG_ERROR, "Couldn't get size of weights file.\n");
     942           0 :         return AVERROR(EINVAL);
     943           0 :     } else if (weights_size != expected_size) {
     944           0 :         fclose(weights_file);
     945           0 :         av_log(ctx, AV_LOG_ERROR, "Unexpected weights file size.\n");
     946           0 :         return AVERROR(EINVAL);
     947             :     }
     948             : 
     949           0 :     if (fseek(weights_file, 0, SEEK_SET)) {
     950           0 :         fclose(weights_file);
     951           0 :         av_log(ctx, AV_LOG_ERROR, "Couldn't seek to the start of weights file.\n");
     952           0 :         return AVERROR(EINVAL);
     953             :     }
     954             : 
     955           0 :     bdata = (float *)av_malloc(expected_size);
     956           0 :     if (!bdata) {
     957           0 :         fclose(weights_file);
     958           0 :         return AVERROR(ENOMEM);
     959             :     }
     960             : 
     961           0 :     bytes_read = fread(bdata, 1, expected_size, weights_file);
     962             : 
     963           0 :     if (bytes_read != (size_t)expected_size) {
     964           0 :         fclose(weights_file);
     965           0 :         ret = AVERROR_INVALIDDATA;
     966           0 :         av_log(ctx, AV_LOG_ERROR, "Couldn't read weights file.\n");
     967           0 :         goto fail;
     968             :     }
     969             : 
     970           0 :     fclose(weights_file);
     971             : 
     972           0 :     for (j = 0; j < NUM_NNS; j++) {
     973           0 :         for (i = 0; i < NUM_NSIZE; i++) {
     974           0 :             if (i == s->nsize && j == s->nnsparam)
     975           0 :                 dims1offset = dims1tsize;
     976           0 :             dims1tsize += nns_table[j] * 2 * (xdia_table[i] * ydia_table[i] + 1) * 2;
     977             :         }
     978             :     }
     979             : 
     980           0 :     s->weights0 = av_malloc_array(FFMAX(dims0, dims0new), sizeof(float));
     981           0 :     if (!s->weights0) {
     982           0 :         ret = AVERROR(ENOMEM);
     983           0 :         goto fail;
     984             :     }
     985             : 
     986           0 :     for (i = 0; i < 2; i++) {
     987           0 :         s->weights1[i] = av_malloc_array(dims1, sizeof(float));
     988           0 :         if (!s->weights1[i]) {
     989           0 :             ret = AVERROR(ENOMEM);
     990           0 :             goto fail;
     991             :         }
     992             :     }
     993             : 
     994             :     // Adjust prescreener weights
     995           0 :     if (s->pscrn >= 2) {// using new prescreener
     996             :         const float *bdw;
     997             :         int16_t *ws;
     998             :         float *wf;
     999           0 :         double mean[4] = { 0.0, 0.0, 0.0, 0.0 };
    1000           0 :         int *offt = av_calloc(4 * 64, sizeof(int));
    1001             : 
    1002           0 :         if (!offt) {
    1003           0 :             ret = AVERROR(ENOMEM);
    1004           0 :             goto fail;
    1005             :         }
    1006             : 
    1007           0 :         for (j = 0; j < 4; j++)
    1008           0 :             for (k = 0; k < 64; k++)
    1009           0 :                 offt[j * 64 + k] = ((k >> 3) << 5) + ((j & 3) << 3) + (k & 7);
    1010             : 
    1011           0 :         bdw = bdata + dims0 + dims0new * (s->pscrn - 2);
    1012           0 :         ws = (int16_t *)s->weights0;
    1013           0 :         wf = (float *)&ws[4 * 64];
    1014             :         // Calculate mean weight of each first layer neuron
    1015           0 :         for (j = 0; j < 4; j++) {
    1016           0 :             double cmean = 0.0;
    1017           0 :             for (k = 0; k < 64; k++)
    1018           0 :                 cmean += bdw[offt[j * 64 + k]];
    1019           0 :             mean[j] = cmean / 64.0;
    1020             :         }
    1021             :         // Factor mean removal and 1.0/127.5 scaling
    1022             :         // into first layer weights. scale to int16 range
    1023           0 :         for (j = 0; j < 4; j++) {
    1024           0 :             double scale, mval = 0.0;
    1025             : 
    1026           0 :             for (k = 0; k < 64; k++)
    1027           0 :                 mval = FFMAX(mval, FFABS((bdw[offt[j * 64 + k]] - mean[j]) / 127.5));
    1028           0 :             scale = 32767.0 / mval;
    1029           0 :             for (k = 0; k < 64; k++)
    1030           0 :                 ws[offt[j * 64 + k]] = roundds(((bdw[offt[j * 64 + k]] - mean[j]) / 127.5) * scale);
    1031           0 :             wf[j] = (float)(mval / 32767.0);
    1032             :         }
    1033           0 :         memcpy(wf + 4, bdw + 4 * 64, (dims0new - 4 * 64) * sizeof(float));
    1034           0 :         av_free(offt);
    1035             :     } else { // using old prescreener
    1036           0 :         double mean[4] = { 0.0, 0.0, 0.0, 0.0 };
    1037             :         // Calculate mean weight of each first layer neuron
    1038           0 :         for (j = 0; j < 4; j++) {
    1039           0 :             double cmean = 0.0;
    1040           0 :             for (k = 0; k < 48; k++)
    1041           0 :                 cmean += bdata[j * 48 + k];
    1042           0 :             mean[j] = cmean / 48.0;
    1043             :         }
    1044           0 :         if (s->fapprox & 1) {// use int16 dot products in first layer
    1045           0 :             int16_t *ws = (int16_t *)s->weights0;
    1046           0 :             float *wf = (float *)&ws[4 * 48];
    1047             :             // Factor mean removal and 1.0/127.5 scaling
    1048             :             // into first layer weights. scale to int16 range
    1049           0 :             for (j = 0; j < 4; j++) {
    1050           0 :                 double scale, mval = 0.0;
    1051           0 :                 for (k = 0; k < 48; k++)
    1052           0 :                     mval = FFMAX(mval, FFABS((bdata[j * 48 + k] - mean[j]) / 127.5));
    1053           0 :                 scale = 32767.0 / mval;
    1054           0 :                 for (k = 0; k < 48; k++)
    1055           0 :                     ws[j * 48 + k] = roundds(((bdata[j * 48 + k] - mean[j]) / 127.5) * scale);
    1056           0 :                 wf[j] = (float)(mval / 32767.0);
    1057             :             }
    1058           0 :             memcpy(wf + 4, bdata + 4 * 48, (dims0 - 4 * 48) * sizeof(float));
    1059             :         } else {// use float dot products in first layer
    1060           0 :             double half = (1 << 8) - 1;
    1061             : 
    1062           0 :             half /= 2;
    1063             : 
    1064             :             // Factor mean removal and 1.0/half scaling
    1065             :             // into first layer weights.
    1066           0 :             for (j = 0; j < 4; j++)
    1067           0 :                 for (k = 0; k < 48; k++)
    1068           0 :                     s->weights0[j * 48 + k] = (float)((bdata[j * 48 + k] - mean[j]) / half);
    1069           0 :             memcpy(s->weights0 + 4 * 48, bdata + 4 * 48, (dims0 - 4 * 48) * sizeof(float));
    1070             :         }
    1071             :     }
    1072             : 
    1073             :     // Adjust prediction weights
    1074           0 :     for (i = 0; i < 2; i++) {
    1075           0 :         const float *bdataT = bdata + dims0 + dims0new * 3 + dims1tsize * s->etype + dims1offset + i * dims1;
    1076           0 :         const int nnst = nns_table[s->nnsparam];
    1077           0 :         const int asize = xdia_table[s->nsize] * ydia_table[s->nsize];
    1078           0 :         const int boff = nnst * 2 * asize;
    1079           0 :         double *mean = (double *)av_calloc(asize + 1 + nnst * 2, sizeof(double));
    1080             : 
    1081           0 :         if (!mean) {
    1082           0 :             ret = AVERROR(ENOMEM);
    1083           0 :             goto fail;
    1084             :         }
    1085             : 
    1086             :         // Calculate mean weight of each neuron (ignore bias)
    1087           0 :         for (j = 0; j < nnst * 2; j++) {
    1088           0 :             double cmean = 0.0;
    1089           0 :             for (k = 0; k < asize; k++)
    1090           0 :                 cmean += bdataT[j * asize + k];
    1091           0 :             mean[asize + 1 + j] = cmean / (double)asize;
    1092             :         }
    1093             :         // Calculate mean softmax neuron
    1094           0 :         for (j = 0; j < nnst; j++) {
    1095           0 :             for (k = 0; k < asize; k++)
    1096           0 :                 mean[k] += bdataT[j * asize + k] - mean[asize + 1 + j];
    1097           0 :             mean[asize] += bdataT[boff + j];
    1098             :         }
    1099           0 :         for (j = 0; j < asize + 1; j++)
    1100           0 :             mean[j] /= (double)(nnst);
    1101             : 
    1102           0 :         if (s->fapprox & 2) { // use int16 dot products
    1103           0 :             int16_t *ws = (int16_t *)s->weights1[i];
    1104           0 :             float *wf = (float *)&ws[nnst * 2 * asize];
    1105             :             // Factor mean removal into weights, remove global offset from
    1106             :             // softmax neurons, and scale weights to int16 range.
    1107           0 :             for (j = 0; j < nnst; j++) { // softmax neurons
    1108           0 :                 double scale, mval = 0.0;
    1109           0 :                 for (k = 0; k < asize; k++)
    1110           0 :                     mval = FFMAX(mval, FFABS(bdataT[j * asize + k] - mean[asize + 1 + j] - mean[k]));
    1111           0 :                 scale = 32767.0 / mval;
    1112           0 :                 for (k = 0; k < asize; k++)
    1113           0 :                     ws[j * asize + k] = roundds((bdataT[j * asize + k] - mean[asize + 1 + j] - mean[k]) * scale);
    1114           0 :                 wf[(j >> 2) * 8 + (j & 3)] = (float)(mval / 32767.0);
    1115           0 :                 wf[(j >> 2) * 8 + (j & 3) + 4] = (float)(bdataT[boff + j] - mean[asize]);
    1116             :             }
    1117           0 :             for (j = nnst; j < nnst * 2; j++) { // elliott neurons
    1118           0 :                 double scale, mval = 0.0;
    1119           0 :                 for (k = 0; k < asize; k++)
    1120           0 :                     mval = FFMAX(mval, FFABS(bdataT[j * asize + k] - mean[asize + 1 + j]));
    1121           0 :                 scale = 32767.0 / mval;
    1122           0 :                 for (k = 0; k < asize; k++)
    1123           0 :                     ws[j * asize + k] = roundds((bdataT[j * asize + k] - mean[asize + 1 + j]) * scale);
    1124           0 :                 wf[(j >> 2) * 8 + (j & 3)] = (float)(mval / 32767.0);
    1125           0 :                 wf[(j >> 2) * 8 + (j & 3) + 4] = bdataT[boff + j];
    1126             :             }
    1127             :         } else { // use float dot products
    1128             :             // Factor mean removal into weights, and remove global
    1129             :             // offset from softmax neurons.
    1130           0 :             for (j = 0; j < nnst * 2; j++) {
    1131           0 :                 for (k = 0; k < asize; k++) {
    1132           0 :                     const double q = j < nnst ? mean[k] : 0.0;
    1133           0 :                     s->weights1[i][j * asize + k] = (float)(bdataT[j * asize + k] - mean[asize + 1 + j] - q);
    1134             :                 }
    1135           0 :                 s->weights1[i][boff + j] = (float)(bdataT[boff + j] - (j < nnst ? mean[asize] : 0.0));
    1136             :             }
    1137             :         }
    1138           0 :         av_free(mean);
    1139             :     }
    1140             : 
    1141           0 :     s->nns = nns_table[s->nnsparam];
    1142           0 :     s->xdia = xdia_table[s->nsize];
    1143           0 :     s->ydia = ydia_table[s->nsize];
    1144           0 :     s->asize = xdia_table[s->nsize] * ydia_table[s->nsize];
    1145             : 
    1146           0 :     s->max_value = 65535 >> 8;
    1147             : 
    1148           0 :     select_functions(s);
    1149             : 
    1150           0 :     s->fdsp = avpriv_float_dsp_alloc(0);
    1151           0 :     if (!s->fdsp)
    1152           0 :         ret = AVERROR(ENOMEM);
    1153             : 
    1154           0 : fail:
    1155           0 :     av_free(bdata);
    1156           0 :     return ret;
    1157             : }
    1158             : 
    1159           0 : static av_cold void uninit(AVFilterContext *ctx)
    1160             : {
    1161           0 :     NNEDIContext *s = ctx->priv;
    1162             :     int i;
    1163             : 
    1164           0 :     av_freep(&s->weights0);
    1165             : 
    1166           0 :     for (i = 0; i < 2; i++)
    1167           0 :         av_freep(&s->weights1[i]);
    1168             : 
    1169           0 :     for (i = 0; i < s->nb_planes; i++) {
    1170           0 :         av_freep(&s->frame_data.paddedp[i]);
    1171           0 :         av_freep(&s->frame_data.lcount[i]);
    1172             :     }
    1173             : 
    1174           0 :     av_freep(&s->frame_data.input);
    1175           0 :     av_freep(&s->frame_data.temp);
    1176           0 :     av_freep(&s->fdsp);
    1177           0 :     av_frame_free(&s->second);
    1178           0 : }
    1179             : 
    1180             : static const AVFilterPad inputs[] = {
    1181             :     {
    1182             :         .name          = "default",
    1183             :         .type          = AVMEDIA_TYPE_VIDEO,
    1184             :         .filter_frame  = filter_frame,
    1185             :         .config_props  = config_input,
    1186             :     },
    1187             :     { NULL }
    1188             : };
    1189             : 
    1190             : static const AVFilterPad outputs[] = {
    1191             :     {
    1192             :         .name          = "default",
    1193             :         .type          = AVMEDIA_TYPE_VIDEO,
    1194             :         .config_props  = config_output,
    1195             :         .request_frame = request_frame,
    1196             :     },
    1197             :     { NULL }
    1198             : };
    1199             : 
    1200             : AVFilter ff_vf_nnedi = {
    1201             :     .name          = "nnedi",
    1202             :     .description   = NULL_IF_CONFIG_SMALL("Apply neural network edge directed interpolation intra-only deinterlacer."),
    1203             :     .priv_size     = sizeof(NNEDIContext),
    1204             :     .priv_class    = &nnedi_class,
    1205             :     .init          = init,
    1206             :     .uninit        = uninit,
    1207             :     .query_formats = query_formats,
    1208             :     .inputs        = inputs,
    1209             :     .outputs       = outputs,
    1210             :     .flags         = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL,
    1211             : };

Generated by: LCOV version 1.13