/* denoise.c (C) nejik 2003 */

/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * References:
 * mjpegtools    http://mjpeg.sourceforge.net/
 * yuvdenoise/denoise.c
 * yuvdenoise/motion.c
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <inttypes.h>

#include "util.h"
#include "csp.h"
#include "denoise.h"

#ifdef ARCH_X86
#include "attributes.h"
#include "mmx.h"
#include "mm_accel.h"
#else
#undef emms
#define emms()
#endif /* ARCH_X86 */

/* Plane indices into the 3-entry frame pointer arrays; note V precedes U. */
#define Y 0
#define V 1
#define U 2

/*
 * Per-block motion vector together with its matching cost.
 * The scale of x/y depends on which Mb_search_* stage wrote them last
 * (each stage scales the previous result up before refining it).
 */
typedef struct {
  int x;           /* horizontal displacement of the best match */
  int y;           /* vertical displacement of the best match */
	int x2;          /* copy of x taken at the end of Mb_search_22() */
	int y2;          /* copy of y taken at the end of Mb_search_22() */
  uint32_t SAD;    /* cost of the best match found by the last search stage */
  uint32_t SAD2;   /* copy of SAD taken at the end of Mb_search_22() */
} VECTOR;

/*
 * Kernel entry points. Every kernel is called through one of these
 * pointers; the file provides *_mmxext and *_c implementations with
 * matching signatures.
 * NOTE(review): the code that assigns these pointers is elsewhere in the
 * file (presumably chosen at init time from the CPU capabilities) - confirm.
 */

/* Block matching costs (sum of absolute differences). */
static uint32_t (*Calc_SAD)  (uint8_t *, uint8_t *, int );
static uint32_t (*Calc_SAD_uv)  (uint8_t *, uint8_t *, int );
static uint32_t (*Calc_SAD_half)  (uint8_t *, uint8_t *, uint8_t *, int );

/* Frame and block level filters. */
static int (*Low_contrast_block)(uint8_t *[], uint8_t *[], int, int);
static void (*Subsample_frame)(uint8_t *[], uint8_t *[], int , int );
static void (*Sharpen_frame)(uint8_t *[], uint8_t *[]);
static void (*UV_deflick_block)(uint8_t *[], uint8_t *[], uint8_t *[], int , int , int , VECTOR *);
static void (*Blur_frame)(uint8_t *, uint8_t *, int , int );
static void (*Mv_blocks)(uint8_t *[], uint8_t *[], int , int , int , VECTOR *);
static void (*Avg_frame)(uint8_t *[], uint8_t *[], uint8_t *[], int , int );
static void (*Correct_frame2)(uint8_t *[], uint8_t *[], uint8_t *[], int , int );
static void (*Denoise_frame_pass2)(uint8_t *[], uint8_t *[]);

/* Colorspace of the processed frames; CSP_* presumably comes from csp.h. */
static int csp = CSP_UNKNOWN;

/*
 * Working buffers. The [3] arrays hold one pointer per plane and are
 * indexed with Y/V/U.
 * NOTE(review): allocation and the exact role of each buffer are set up
 * outside this part of the file - confirm there before relying on names.
 */
static uint8_t *bufp[10];
static uint8_t *avg[3];
static uint8_t *avg2[3];
static uint8_t *tmp[3];
static uint8_t *sub2ref[3];
static uint8_t *sub2avg[3];
static uint8_t *sub4ref[3];
static uint8_t *sub4avg[3];
static uint8_t *tfrm[3];

static uint8_t *ref[3];
static uint8_t *refer[3];
static uint8_t *dfrm[3];

static uint8_t *uv_frm[3];
static uint8_t *uv_tfrm[3];

/*
 * State of the chroma deflicker. UV_deflick_block_mmxext() reads pre_v /
 * pre_u and entry [1] of the flg_* / dif_* pairs, and writes entry [0].
 */
static uint8_t *prev_uv = NULL;
static uint8_t *pre_v = NULL;
static uint8_t *pre_u = NULL;
static short *flag_v = NULL;
static short *flag_u = NULL;
static short *diff_v = NULL;
static short *diff_u = NULL;
static short *flg_v[2];
static short *flg_u[2];
static short *dif_v[2];
static short *dif_u[2];

/* Motion vector map and its dimensions (filled in elsewhere). */
static VECTOR *vect_map = NULL;
static int vect_col;
static int vect_lin;

static int Uninitialized = 1;
static int size_y;
static int size_uv;
static int offs_v;
static int offs_u;
static int border = 32;
/* Full-resolution luma dimensions; used as the luma stride by the kernels. */
static int Width;
static int Height;
/* Stride/height used with the x>>1, y>>1 coordinates in Mb_search_22(). */
static int Width2;
static int Height2;
/* Stride/height used with the x>>2, y>>2 coordinates in Mb_search_44(). */
static int Width4;
static int Height4;
/* Search radius in full-resolution pixels; Mb_search_44() uses Radius>>2. */
static int Radius = 8;
/* Threshold capability 0-127 */
static int Threshold = 5;
static int Threshold_pp = 4;
/* Avg_frame_mmxext() divides by Delay+1. */
static int Delay = 3;
/* Sharpening amount in percent (the scalar path computes d*Sharpen/100). */
static int Sharpen = 125;

static int Denoise_uv_flag = 1;
/* Chroma deflicker threshold; also scales the SAD2 cut-off (96*Threshold_uv). */
static int Threshold_uv = 10;

static int test_mode = 0;
static int ymask = 0;
static int vmask = 0;
static int umask = 0;

/* Fixed-point weight scale; not referenced by the kernels in this part of the file. */
#define WEIGHTBITS 15
#define WEIGHTONE  (1<<WEIGHTBITS)

#ifdef ARCH_X86
/* Packed 4x16-bit constants used by the MMX kernels below. */
static mmx_t whi_lim = {0x00eb00eb00eb00ebLL};  /* 0xeb (235) per word: upper clamp in Sharpen_frame_mmxext */
static mmx_t wlo_lim = {0x0010001000100010LL};  /* 0x10 (16) per word: lower clamp in Sharpen_frame_mmxext */
static mmx_t w256 = {0x0100010001000100LL};     /* 256 per word: full blend weight, and sentinel in the deflicker diff map */
static mmx_t w2   = {0x0002000200020002LL};     /* flag value 2 in UV_deflick_block_mmxext */
static mmx_t w1   = {0x0001000100010001LL};     /* flag value 1 in UV_deflick_block_mmxext */
static mmx_t w10  = {0x000a000a000a000aLL};     /* dst weight (doubled to 20) in Denoise_frame_pass2_mmxext */
static mmx_t w12  = {0x000c000c000c000cLL};     /* src weight 12 in Denoise_frame_pass2_mmxext */
/*
 * The following are only loaded by the kernels here; their values are
 * written elsewhere in the file (presumably when parameters are set).
 */
static mmx_t low_threshold;
static mmx_t sharp;
static mmx_t mdelay;
static mmx_t thre;
static mmx_t per_thre;
//static mmx_t coef_max = {0x0100010001000100LL};
static mmx_t thre_pp;
static mmx_t per_thre_pp;
static mmx_t thre_uv = {0LL};
#endif /* ARCH_X86 */

/*
 * Coarse exhaustive search on the 4x-subsampled frames.
 *
 * Every displacement in [-Radius/4, +Radius/4]^2 is scored as
 * luma SAD + (V SAD >> 2) + (U SAD >> 2) + dx*dx + dy*dy, the last term
 * favouring matches near the centre. The chroma costs are evaluated only
 * on even rows / even columns of the search grid and reused from a small
 * cache for the odd positions. The best displacement goes to vect->x/y
 * and its cost to vect->SAD; ties are won by the later candidate.
 */
static void 
Mb_search_44 (uint8_t *avg[], uint8_t *ref[], int x, int y, VECTOR *vect)
{
  const int range = Radius>>2;       /* search radius /4 in pixels */
  const int stride = Width4;
  const int stride_uv = stride / 2;
  const int32_t base = stride * ((y>>2)-3) + ((x>>2)-3);
  const int32_t base_uv = stride_uv * ((y>>3)-3) + ((x>>3)-3);
  uint32_t cache_v[range+1];
  uint32_t cache_u[range+1];
  uint32_t lowest = 0x00ffffff;
  int dx, dy;
  int col, row, slot;

  for (dy = -range, row = 0; dy <= range; dy++, row++) {
    slot = 0;
    for (dx = -range, col = 0; dx <= range; dx++, col++) {
      uint32_t cost, cost_v, cost_u;
      int32_t cand = base + (dx) + (dy * stride);

      cost = Calc_SAD(ref[Y] + base, avg[Y] + cand, stride);

      if (row & 1) {
        /* odd row: reuse the chroma cost of the row above */
        cost_v = cache_v[col>>1];
        cost_u = cache_u[col>>1];
      } else if (col & 1) {
        /* odd column: reuse the chroma cost of the column to the left */
        cost_v = cache_v[slot];
        cost_u = cache_u[slot];
        slot++;
      } else {
        int32_t cand_uv = base_uv + (dx>>1) + ((dy>>1) * stride_uv);

        cost_v = Calc_SAD(ref[V] + base_uv, avg[V] + cand_uv, stride_uv);
        cost_u = Calc_SAD(ref[U] + base_uv, avg[U] + cand_uv, stride_uv);
        cost_v >>= 2;
        cost_u >>= 2;
        cache_v[slot] = cost_v;
        cache_u[slot] = cost_u;
      }

      cost += cost_v + cost_u;
      cost += dx*dx + dy*dy; /* favour center matches... */

      if (cost <= lowest) {
        lowest = cost;
        vect->x = dx;
        vect->y = dy;
      }
    }
  }
  vect->SAD = lowest;
}

/*
 * Refinement search on the 2x-subsampled frames.
 *
 * The incoming vector (from the 4x stage) is doubled and a 5x5 window
 * around it is scored as luma SAD + (V SAD >> 2) + (U SAD >> 2), with the
 * chroma costs cached for odd grid rows/columns. Afterwards the zero
 * displacement is scored too and wins on ties. The result is stored in
 * vect->x/y/SAD and mirrored into vect->x2/y2/SAD2.
 */
static void 
Mb_search_22 (uint8_t *avg[], uint8_t *ref[], int x, int y, VECTOR *vect)
{
  const int stride = Width2;
  const int stride_uv = stride / 2;
  const int seed_x = vect->x<<1;
  const int seed_y = vect->y<<1;
  const int32_t base = stride * ((y>>1)-2) + ((x>>1)-2);
  const int32_t base_uv = stride_uv * ((y>>2)-3) + ((x>>2)-3);
  uint32_t cache_v[4];
  uint32_t cache_u[4];
  uint32_t lowest = 0x00ffffff;
  uint32_t cost, cost_v, cost_u;
  int dx, dy;
  int col, row, slot;

  for (dy = -2, row = 0; dy <= 2; dy++, row++) {
    slot = 0;
    for (dx = -2, col = 0; dx <= 2; dx++, col++) {
      int32_t cand = base + (dx+seed_x) + ((dy+seed_y) * stride);

      cost = Calc_SAD(ref[Y] + base, avg[Y] + cand, stride);

      if (row & 1) {
        /* odd row: reuse the chroma cost of the row above */
        cost_v = cache_v[col>>1];
        cost_u = cache_u[col>>1];
      } else if (col & 1) {
        /* odd column: reuse the chroma cost of the column to the left */
        cost_v = cache_v[slot];
        cost_u = cache_u[slot];
        slot++;
      } else {
        int32_t cand_uv = base_uv + ((dx+seed_x)>>1) + (((dy+seed_y)>>1) * stride_uv);

        cost_v = Calc_SAD(ref[V] + base_uv, avg[V] + cand_uv, stride_uv);
        cost_u = Calc_SAD(ref[U] + base_uv, avg[U] + cand_uv, stride_uv);
        cost_v >>= 2;
        cost_u >>= 2;
        cache_v[slot] = cost_v;
        cache_u[slot] = cost_u;
      }

      cost += cost_v + cost_u;

      if (cost <= lowest) {
        lowest = cost;
        vect->x = dx+seed_x;
        vect->y = dy+seed_y;
      }
    }
  }

  /* Finally give the zero displacement a chance; it wins on equal cost. */
  cost = Calc_SAD(ref[Y] + base, avg[Y] + base, stride);
  cost_v = Calc_SAD(ref[V] + base_uv, avg[V] + base_uv, stride_uv);
  cost_u = Calc_SAD(ref[U] + base_uv, avg[U] + base_uv, stride_uv);
  cost_v >>= 2;
  cost_u >>= 2;
  cost += cost_v + cost_u;
  if (cost <= lowest) {
    lowest = cost;
    vect->x = 0;
    vect->y = 0;
  }

  vect->SAD = lowest;
  vect->SAD2 = lowest;
  vect->x2 = vect->x;
  vect->y2 = vect->y;
}

/*
 * Full-resolution refinement (luma only).
 *
 * Doubles the incoming vector, scores the 5x5 window around it with
 * Calc_SAD and then scores the zero displacement, which wins on ties.
 * The winner is written to vect->x/y and its cost to vect->SAD.
 */
static void 
Mb_search_11 (uint8_t *avg[], uint8_t *ref[], int x, int y, VECTOR *vect)
{
  const int stride = Width;
  const int seed_x = vect->x<<1;
  const int seed_y = vect->y<<1;
  const int32_t base = stride * (y) + (x);
  uint32_t lowest = 0x00ffffff;
  uint32_t cost;
  int dx, dy;

  for (dy = -2; dy <= 2; dy++) {
    for (dx = -2; dx <= 2; dx++) {
      int32_t cand = base + (dx+seed_x) + ((dy+seed_y) * stride);

      cost = Calc_SAD(ref[Y] + base, avg[Y] + cand, stride);
      if (cost > lowest)
        continue;
      lowest = cost;
      vect->x = dx+seed_x;
      vect->y = dy+seed_y;
    }
  }

  /* Zero displacement is tested last so that it is preferred on a tie. */
  cost = Calc_SAD(ref[Y] + base, avg[Y] + base, stride);
  if (cost <= lowest) {
    lowest = cost;
    vect->x = 0;
    vect->y = 0;
  }
  vect->SAD = lowest;
}

/*
 * Final refinement step around the full-resolution vector.
 *
 * For each of the nine neighbours (dx, dy in -1..1) the block at the
 * incoming vector is averaged with the block one step further via
 * Calc_SAD_half and compared with the reference block. The stored vector
 * is 2*incoming + (dx, dy), i.e. it has twice the resolution of the
 * input. Only a strictly lower cost replaces the current best.
 */
static void 
Mb_search_00 (uint8_t *avg[], uint8_t *ref[], int x, int y, VECTOR *vect)
{
  const int stride = Width;
  const int base = stride * (y) + (x);
  const int seed_x = vect->x;
  const int seed_y = vect->y;
  const int anchor = base+(seed_x)+((seed_y)*stride);
  uint32_t lowest = 0x00ffffff;
  int dx, dy;

  for (dy = -1; dy <= 1; dy++) {
    for (dx = -1; dx <= 1; dx++) {
      int neighbour = base+(seed_x+dx)+((seed_y+dy)*stride);
      uint32_t cost = Calc_SAD_half(ref[Y] + base,
                                    avg[Y] + anchor,
                                    avg[Y] + neighbour, stride);

      if (cost < lowest) {
        lowest = cost;
        vect->x = dx+seed_x*2;
        vect->y = dy+seed_y*2;
      }
    }
  }
  vect->SAD = lowest;
}

#ifdef ARCH_X86

/*
 * Sum of absolute differences over an 8x8 block.
 *
 * src0, src1: top-left samples of the two blocks.
 * stride:     distance in bytes between consecutive rows of both blocks.
 * Returns the accumulated SAD of the eight 8-byte rows.
 * Leaves the MMX state dirty (no emms here).
 */
static uint32_t
Calc_SAD_mmxext(uint8_t *src0, uint8_t *src1, int stride)
{
  register uint32_t a;
  int y;
  
  pxor_r2r(mm0, mm0);           // mm0: running total
  for (y = 0; y < 8; y++) {
    movq_m2r(*src0, mm1);       // 8 bytes of the first block
    psadbw_m2r(*src1, mm1);     // mm1 = SAD of this row
    paddd_r2r(mm1, mm0);
    src0 += stride;
    src1 += stride;
  }
  movd_r2v(mm0, a);             // low dword of the total -> a

  return a;
}

/*
 * Sum of absolute differences over a 4x4 block.
 *
 * src0, src1: top-left samples of the two blocks.
 * stride:     distance in bytes between consecutive rows of both blocks.
 * Returns the accumulated SAD of the four 4-byte rows.
 * Leaves the MMX state dirty (no emms here).
 */
static uint32_t
Calc_SAD_uv_mmxext(uint8_t *src0, uint8_t *src1, int stride)
{
  uint32_t a;
  int y;
  
  pxor_r2r(mm0, mm0);           // mm0: running total
  pxor_r2r(mm1, mm1);
  pxor_r2r(mm2, mm2);
  for (y = 0; y < 4; y++) {
    movd_m2r(*src0, mm1);       // 4 bytes per row from each block
    movd_m2r(*src1, mm2);
    psadbw_r2r(mm2, mm1);       // mm1 = SAD of this row
    paddd_r2r(mm1, mm0);
    src0 += stride;
    src1 += stride;
  }
  movd_r2m(mm0, a);             // low dword of the total -> a

  return a;
}

/*
 * SAD of an 8x8 reference block against the byte-wise average of two
 * candidate blocks.
 *
 * ref:        top-left sample of the reference block.
 * avg1, avg2: top-left samples of the two blocks that are averaged (pavgb).
 * stride:     distance in bytes between consecutive rows of all three blocks.
 * Returns the accumulated SAD of the eight rows.
 * Leaves the MMX state dirty (no emms here).
 */
static uint32_t
Calc_SAD_half_mmxext(uint8_t *ref, uint8_t *avg1, uint8_t *avg2, int stride) 
{
  register uint32_t a;
  int y;

  pxor_r2r(mm0, mm0);           // mm0: running total
  for (y = 0; y < 8; y++) {
    movq_m2r(*avg1, mm1);
    pavgb_m2r(*avg2, mm1);      // mm1 = average of the two candidates
    psadbw_m2r(*ref, mm1);      // mm1 = SAD of the averaged row vs ref
    paddd_r2r(mm1, mm0);
    avg1 += stride;
    avg2 += stride;
    ref += stride;
  }
  movd_r2v(mm0, a);             // low dword of the total -> a
  return a;
}

/*
 * Tell whether a block differs only slightly between two frames.
 *
 * Counts, over the 8x8 luma block at (x, y) and the 4x4 V and U blocks at
 * (x/2, y/2), the samples whose absolute difference between re[] and av[]
 * is greater than the per-word value held in low_threshold.
 *
 * re, av: plane pointer arrays indexed by Y/V/U; the luma stride is Width.
 * Returns 1 when at most 8 samples exceed the threshold, otherwise 0.
 * low_threshold is filled in elsewhere in the file.
 */
static int
Low_contrast_block_mmxext(uint8_t *re[], uint8_t *av[], int x, int y)
{
  int yy, d;
  uint8_t *rp, *ap;
  int W = Width;
  int uv;
  int max = 0;

  rp = re[Y] + x + W * y;
  ap = av[Y] + x + W * y;
  movq_m2r(low_threshold, mm3); // mm3: threshold (kept for the whole function)
  pxor_r2r(mm4, mm4);           // mm4: zero (kept for the whole function)
  pxor_r2r(mm5, mm5);           // mm5: per-word counters
  for (yy=0; yy < 8; yy++) {
	movq_m2r(*ap, mm0);
	movq_m2r(*rp, mm1);
	movq_r2r(mm0, mm2);
	psubusb_r2r(mm1, mm0);
	psubusb_r2r(mm2, mm1);
	por_r2r(mm1, mm0);        // mm0 = |ap - rp| per byte
	movq_r2r(mm0, mm1);
	punpcklbw_r2r(mm4, mm0);  // widen to words
	punpckhbw_r2r(mm4, mm1);

	pcmpgtw_r2r(mm3, mm0);    // all-ones where diff > threshold
	pcmpgtw_r2r(mm3, mm1);
	psrlw_i2r(15, mm0);       // mask -> 0 or 1
	psrlw_i2r(15, mm1);
	paddw_r2r(mm1, mm0);

	paddw_r2r(mm0, mm5);

	rp += W;
	ap += W;
  }
  // horizontal sum of the four word counters into max
  movq_r2r(mm5, mm0);
  punpcklwd_r2r(mm4, mm5);
  punpckhwd_r2r(mm4, mm0);
  paddd_r2r(mm5, mm0);
  movq_r2r(mm0, mm1);
  psrlq_i2r(32, mm1);
  paddd_r2r(mm1, mm0);
  movd_r2m(mm0, max);

  // chroma planes: half stride, half coordinates
  W >>= 1;
  x >>= 1;
  y >>= 1;

  rp = re[V] + x + y * W;
  ap = av[V] + x + y * W;
  for (uv = 2; uv > 0; uv--) {
    pxor_r2r(mm5, mm5);
    // each pass packs two 4-byte rows into one register: 2 passes = 4 rows
    for (yy=0; yy < 2; yy++) {
	movd_m2r(*ap, mm0);
	ap += W;
	movd_m2r(*ap, mm1);
	ap += W;
	punpckldq_r2r(mm1, mm0);

	movd_m2r(*rp, mm1);
	rp += W;
	movd_m2r(*rp, mm2);
	rp += W;
	punpckldq_r2r(mm2, mm1);

	movq_r2r(mm0, mm2);
	psubusb_r2r(mm1, mm0);
	psubusb_r2r(mm2, mm1);
	por_r2r(mm1, mm0);        // mm0 = |ap - rp| per byte
	movq_r2r(mm0, mm1);
	punpcklbw_r2r(mm4, mm0);
	punpckhbw_r2r(mm4, mm1);

	pcmpgtw_r2r(mm3, mm0);
	pcmpgtw_r2r(mm3, mm1);
	psrlw_i2r(15, mm0);
	psrlw_i2r(15, mm1);
	paddw_r2r(mm1, mm0);

	paddw_r2r(mm0, mm5);
    }
    movq_r2r(mm5, mm0);
    punpcklwd_r2r(mm4, mm5);
    punpckhwd_r2r(mm4, mm0);
    paddd_r2r(mm5, mm0);
    movq_r2r(mm0, mm1);
    psrlq_i2r(32, mm1);
    paddd_r2r(mm1, mm0);
    movd_r2m(mm0, d);

    max += d;

    // second pass works on the U plane
    rp = re[U] + x + y * W;
    ap = av[U] + x + y * W;
  }

  return ((max>8) ? 0:1);
}

/*
 * Halve a frame in both directions by averaging 2x2 sample groups.
 *
 * dst, src: plane pointer arrays indexed by Y/V/U.
 * width, height: luma size of src; the chroma planes are processed with
 *                width/2 x height/2.
 * Output rows are written with a stride of half the input row width.
 * Each inner step consumes 8 input bytes from two rows and stores 4
 * output bytes, so row widths are expected to be multiples of 8
 * (NOTE(review): not checked here - confirm callers guarantee it).
 */
static void
Subsample_frame_mmxext(uint8_t *dst[], uint8_t *src[], int width, int height)
{
  int x, y;
  int w = width;
  int h = height;
  int uv;
  static mmx_t mask = {0x00ff00ff00ff00ffLL};
 
  uint8_t *s  = src[Y];
  uint8_t *s2 = src[Y]+w;
  uint8_t *d  = dst[Y];
  uint8_t *dp, *sp0, *sp1;

  h >>= 1;                      // number of output rows
  movq_m2r(mask, mm3);          // mm3: keeps the even bytes of each word
  pxor_r2r(mm4, mm4);           // mm4: zero, used when packing
  for (y = 0; y < h; y++) {
    dp = d;
    sp0 = s;
    sp1 = s2;
    for (x = 0; x < w; x += 8) {
	movq_m2r(*sp0, mm0);
	movq_r2r(mm0, mm1);
	pand_r2r(mm3, mm0);       // even samples of the upper row
	psrlw_i2r(8, mm1);        // odd samples of the upper row
	pavgw_r2r(mm0, mm1);      // mm1 = horizontal average, upper row

	movq_m2r(*sp1, mm0);
	movq_r2r(mm0, mm2);
	pand_r2r(mm3, mm0);
	psrlw_i2r(8, mm2);
	pavgw_r2r(mm2, mm0);      // mm0 = horizontal average, lower row

	pavgw_r2r(mm1, mm0);      // vertical average of the two rows
	packuswb_r2r(mm4, mm0);
	movd_r2m(mm0, *dp);

	sp0 += 8;
	sp1 += 8;
	dp += 4;
    }
    s+=w<<1;                    // skip two input rows
    s2+=w<<1;
    d+=w>>1;
  }

  // chroma planes: same filter at half the luma size
  w >>= 1;
  h >>= 1;

  s  = src[V];
  s2 = src[V]+w;
  d  = dst[V];

  for (uv = 2; uv > 0; uv--) {
    for (y = 0; y < h; y += 1) {
      dp = d;
      sp0 = s;
      sp1 = s2;
      for (x = 0; x < w; x += 8) {
	movq_m2r(*sp0, mm0);
	movq_r2r(mm0, mm1);
	pand_r2r(mm3, mm0);
	psrlw_i2r(8, mm1);
	pavgw_r2r(mm0, mm1);

	movq_m2r(*sp1, mm0);
	movq_r2r(mm0, mm2);
	pand_r2r(mm3, mm0);
	psrlw_i2r(8, mm2);
	pavgw_r2r(mm2, mm0);

	pavgw_r2r(mm1, mm0);
	packuswb_r2r(mm4, mm0);
	movd_r2m(mm0, *dp);

	sp0 += 8;
	sp1 += 8;
	dp += 4;
      }
      s+=w<<1;
      s2+=w<<1;
      d+=w>>1;
    }
    // second pass works on the U plane
    s  = src[U];
    s2 = src[U]+w;
    d  = dst[U];
  }
}

/*
 * Sharpen the luma plane (Width x Height) from src[Y] into dst[Y].
 *
 * For each sample p: m = mean of p, its right neighbour and the two
 * samples below them; result = m + (p - m) * Sharpen / 100, clamped.
 * The MMX path handles four samples at a time and clamps to
 * wlo_lim..whi_lim; the scalar tail clamps to Y_LO_LIMIT..Y_HI_LIMIT.
 * The last column uses its left neighbour instead of the right one.
 * The multiplier is read from `sharp`, which is filled in elsewhere
 * (presumably with Sharpen in every word - confirm).
 * Chroma planes are not touched.
 *
 * NOTE(review): the `y < (H-1)` guard at the bottom is evaluated before
 * y is incremented, so while the last row is processed sp1 points one
 * row past the frame. If the guard is meant to keep sp1 inside the
 * frame it probably should be `y < (H-2)` - confirm against the buffer
 * layout and the C implementation before changing.
 */
static void
Sharpen_frame_mmxext(uint8_t *dst[], uint8_t *src[])
{
  int H = Height;
  int W = Width;
  int x, y;
  int m, d;
  int sharpen = Sharpen;
  uint8_t *dp, *sp0, *sp1, *sp01, *sp11;

  dp = dst[Y];
  sp0 = src[Y];
  sp1 = src[Y] + W;
  sp01 = sp0+1;
  sp11 = sp1+1;
  movq_m2r(sharp, mm7);         // mm7: sharpen multiplier
  movq_m2r(whi_lim, mm6);       // mm6: upper clamp
  movq_m2r(wlo_lim, mm5);       // mm5: lower clamp
  for (y = 0; y < H; y++) {
    pxor_r2r(mm4, mm4);         // mm4: zero
    for (x = 0; x < (W-4); x+=4) {
	register int tv;
	movd_m2r(sp0[x], mm0);
	movd_m2r(sp01[x], mm1);
	movd_m2r(sp1[x], mm2);
	movd_m2r(sp11[x], mm3);

	punpcklbw_r2r(mm4, mm0);
	punpcklbw_r2r(mm4, mm1);
	punpcklbw_r2r(mm4, mm2);
	punpcklbw_r2r(mm4, mm3);

	paddw_r2r(mm0, mm1);
	paddw_r2r(mm2, mm1);
	paddw_r2r(mm3, mm1);
	psrlw_i2r(2, mm1);        // mm1 = m, mean of the 2x2 group

	psubsw_r2r(mm1, mm0);     // mm0 = d = p - m

	// 32-bit products d * sharp: low halves in mm0, high halves in mm2
	movq_r2r(mm0, mm2);
	pmullw_r2r(mm7, mm0);
	pmulhw_r2r(mm7, mm2);
	movq_r2r(mm0, mm3);
	punpcklwd_r2r(mm2, mm0);  // products 0,1 as dwords
	punpckhwd_r2r(mm2, mm3);  // products 2,3 as dwords

	// divide each product by 100 in scalar code, then reassemble
	movd_r2v(mm0, tv);
	tv /= 100;
	movd_v2r(tv, mm2);
	psrlq_i2r(32, mm0);
	movd_r2v(mm0, tv);
	tv /= 100;
	movd_v2r(tv, mm0);
	psllq_i2r(32, mm0);
	por_r2r(mm2, mm0);

	movd_r2v(mm3, tv);
	tv /= 100;
	movd_v2r(tv, mm2);
	psrlq_i2r(32, mm3);
	movd_r2v(mm3, tv);
	tv /= 100;
	movd_v2r(tv, mm3);
	psllq_i2r(32, mm3);
	por_r2r(mm2, mm3);

	packssdw_r2r(mm3, mm0);

	paddsw_r2r(mm1, mm0);     // m + d * sharp / 100

	pminsw_r2r(mm6, mm0);
	pmaxsw_r2r(mm5, mm0);

	packuswb_r2r(mm4, mm0);

	movd_r2m(mm0, dp[x]);
    }

    // scalar tail for the remaining columns except the last one
    for (; x < (W-1); x++) {
      m = (sp0[x] + sp0[x+1] + sp1[x] + sp1[x+1]) >> 2;
      d = sp0[x] - m;
      d *= sharpen;
      d /= 100;
      m = m+d;
      m = (m > Y_HI_LIMIT) ? Y_HI_LIMIT : m;
      m = (m < Y_LO_LIMIT) ? Y_LO_LIMIT : m;
      dp[x] = m;
    }
    // last column: no right neighbour, use the left one
    m = (sp0[x-1] + sp0[x] + sp1[x-1] + sp1[x]) >> 2;
    d = sp0[x] - m;
    d *= sharpen;
    d /= 100;
    m = m+d;
    m = (m > Y_HI_LIMIT) ? Y_HI_LIMIT : m;
    m = (m < Y_LO_LIMIT) ? Y_LO_LIMIT : m;
    dp[x] = m;
    dp += W;
    sp0 = sp1;
    if (y < (H-1))
      sp1 += W;
    sp01 = sp0+1;
    sp11 = sp1+1;
  }
}

/*
 * Chroma deflicker for one block: processes a 4x4 area of V, then of U.
 *
 * dst, avg, src: plane pointer arrays indexed by Y/V/U.
 * x, y:  luma coordinates of the block; chroma offsets use x/2, y/2.
 * width: luma width; the chroma stride is width/2.
 * vec:   motion vector of the block; x2/y2 displace the "previous" data
 *        (pre_v/pre_u, avg, dif_*[1], flg_*[1]) and SAD2 gates the filter.
 *
 * If vec->SAD2 > 96 * Threshold_uv the source samples are copied through,
 * the current diff map (dif_*[0]) is set to 256 and the current flag map
 * (flg_*[0]) to 0.
 * Otherwise, per sample, with cv = pp - sp and q = pdf + cv (q forced to
 * 0 where pdf == 256), the masks below select, as far as the code reads:
 *   |cv| <= thr : dst = ap if pfl == 1, else (sp + pp + 1) / 2
 *   else |q| <= thr : dst = sp if pdf == 256, else (sp + pp + 1) / 2
 *   else        : dst = sp
 * and the current diff/flag maps are updated alongside.
 * NOTE(review): this summary is derived from the inline mask comments;
 * verify against the C implementation before relying on it.
 *
 * The threshold is read from thre_uv (filled in elsewhere). The static
 * mmx_t scratch variables make this function non-reentrant.
 */
static void
//UV_deflick_block_mmxext(uint8_t *dst[], uint8_t *avg[], uint8_t *src[], int x, int y, int width, int height, VECTOR *vec)
UV_deflick_block_mmxext(uint8_t *dst[], uint8_t *avg[], uint8_t *src[], int x, int y, int width, VECTOR *vec)
{
  int W = width / 2;
  int yy, uv;
  uint8_t *dp, *sp, *pp, *ap;
  short *pdf, *cdf, *pfl, *cfl;
  int thr = Threshold_uv;
  // poff: motion-displaced offset for previous data, soff: offset for current data
  int poff = ((x+(vec->x2*2))/2) + ((y+(vec->y2*2))/2) * W;
  int soff = x/2 + y/2*W;

  // scratch storage for intermediate masks/values that do not fit in registers
  static mmx_t mask_dst = {0xffffffffffffffffLL};
  static mmx_t mq   = {0LL};
  static mmx_t pv256_mask = {0LL};
  static mmx_t mcfl = {0LL};
  static mmx_t mdst = {0LL};

  dp = dst[V] + soff;
  sp = src[V] + soff;
  pp = pre_v + poff;
  ap = avg[V] + poff;
  pdf = dif_v[1];
  cdf = dif_v[0];
  pfl = flg_v[1];
  cfl = flg_v[0];

  pdf += poff;
  cdf += soff;
  pfl += poff;
  cfl += soff;
  if ((int)(vec->SAD2) > (96*thr)) {
    // poor match: pass the source through and reset the per-sample state
    movq_m2r(w256, mm3);
    pxor_r2r(mm2, mm2);
    for (uv = 2; uv > 0; uv--) {
      for (yy = 0; yy < 4; yy++) {
	movd_m2r(*sp, mm0);
	movd_r2m(mm0, *dp);
	movq_r2m(mm3, *cdf);
	movq_r2m(mm2, *cfl);

	dp += W;
	sp += W;
	cdf += W;
	cfl += W;
      }
      // second pass works on the U plane
      dp = dst[U] + soff;
      sp = src[U] + soff;
      cdf = dif_u[0];
      cfl = flg_u[0];
      cdf += soff;
      cfl += soff;
    }
  } else {
    for (uv = 2; uv > 0; uv--) {
      for (yy = 0; yy < 4; yy++) {
	movq_m2r(mask_dst, mm7);    // mm7: samples whose dst is still undecided

	movd_m2r(*sp, mm0);
	movd_m2r(*pp, mm1);
	pxor_r2r(mm2, mm2);
	punpcklbw_r2r(mm2, mm0);   // mm0 sp
	punpcklbw_r2r(mm2, mm1);

	movq_r2r(mm1, mm2);
	psubsw_r2r(mm0, mm2);       // mm2 cdf values

	movq_m2r(*pdf, mm3);        // mm3 pdf
	movq_r2r(mm3, mm4);         // copy pdf
	pcmpeqw_m2r(w256, mm4);     // mm4 pdf == 256 mask
		movq_r2m(mm4, pv256_mask); // hold pdf == 256 mask
	movq_r2r(mm3, mm5);         // copy pdf
	paddsw_r2r(mm2, mm5);       // mm5 pdf + cdf
	pandn_r2r(mm5, mm4);        // mm4 q values
		movq_r2m(mm4, mq);        // hold q values

	pavgw_r2r(mm0, mm1);        // mm1 (sp + pp + 1) / 2

	// if (cv <= thr && cv >= -thr)
	movq_r2r(mm2, mm5);				  // mm2 cdf
	paddsw_m2r(thre_uv, mm5);      // mm5 cdf + thre
	pxor_r2r(mm4, mm4);         // mm4 0
	movq_r2r(mm5, mm6);         // copy cdf + thre
	pcmpgtw_r2r(mm4, mm5);      // mm5 (cv > -thr) mask
	pcmpeqw_r2r(mm4, mm6);      // mm6 (cv == -thr) mask
	por_r2r(mm6, mm5);          // mm5 (cv >= -thr) mask
	movq_r2r(mm2, mm4);         // copy cdf
	pcmpgtw_m2r(thre_uv, mm4);     // mm4 (cv > thr) mask
	pandn_r2r(mm5, mm4);        // mm4 (cv <= thr) && (cv >= -thr) mask

	movq_r2r(mm4, mm5);         // copy (cv <= thr) && (cv >= -thr) mask
	pandn_r2r(mm7, mm5);        // mm5 !((cv <= thr) && (cv >= -thr)) mask
	movq_r2r(mm5, mm7);         // mm7 update dst mask

	movq_m2r(*pfl, mm5);        // mm5 pfl
	pcmpeqw_m2r(w1, mm5);       // mm5 pfl==1 mask
	pand_r2r(mm4, mm5);         // mm5 (cv<=thr)&&(cv>=-thr) && pfl==1, mask
	movq_r2r(mm5, mm6);
	pand_m2r(w2, mm6);          // mm6 if (pfl==1) mm6 = 2;
		movq_r2m(mm6, mcfl);      // hold cfl
	movq_r2r(mm5, mm6);         // copy (cv<=thr)&&(cv>=-thr)&&pfl==1, mask
	pand_r2r(mm3, mm5);         // mm3 pdf, mm5 if (pfl==1) mm5 = pdf;
	movq_r2r(mm6, mm3);         // hold (cv<=thr)&&(cv>=-thr)&&pfl==1, mask
	pandn_r2r(mm2, mm6);        // mm6 if !(pfl==1) mm6 = cdf;
	por_r2r(mm5, mm6);          // made cdf
		movq_r2m(mm6, *cdf);      // put cdf
//		movq_r2m(mm6, mcdf);      // hold cdf

	movd_m2r(*ap, mm5);         // mm5 avg pixels in byte
	pxor_r2r(mm6, mm6);         // mm6 0
	punpcklbw_r2r(mm6, mm5);    // mm5 avg pixels in word
	pand_r2r(mm3, mm5);         // (cv<=thr)&&(cv>=-thr)&&pfl==1 mask & ap
	pandn_r2r(mm4, mm3);        // mm3 (pp+sp+1)/2 mask
	movq_r2r(mm1, mm6);         // copy (pp+sp+1)/2
	pand_r2r(mm3, mm6);         // mm6 (pp+sp+1)/2 values
	por_r2r(mm5, mm6);          // made (cv<=thr)&&(cv>=-thr) dst
		movq_r2m(mm6, mdst);      // hold dst

	// if (q <= thr && q >= -thr)
	movq_m2r(mq, mm2);          // mm2 q values
	movq_r2r(mm2, mm3);         // copy q values
	paddsw_m2r(thre_uv, mm3);      // mm3 q + thr
	movq_r2r(mm3, mm5);         // copy q + thr values
	pxor_r2r(mm4, mm4);         // mm4 0
	pcmpgtw_r2r(mm4, mm3);      // mm3 (q > -thr) mask
	pcmpeqw_r2r(mm4, mm5);      // mm5 (q == -thr) mask
	por_r2r(mm5, mm3);          // mm3 (q >= -thr) mask
	pcmpgtw_m2r(thre_uv, mm2);     // mm2 (q > thr) mask
	pandn_r2r(mm3, mm2);        // mm2 (q<=thr) && (q>=-thr) mask
	pand_r2r(mm7, mm2);         // mm2 (q <= thr && q >= -thr) mask

	movq_r2r(mm2, mm5);
	pandn_r2r(mm7, mm5);
	movq_r2r(mm5, mm7);         // update dst mask

	movq_r2r(mm2, mm3);         // copy (q<=thr && q>=-thr) mask
	pand_m2r(w1, mm3);          // mm3 cfl = 1
	por_m2r(mcfl, mm3);         // update cfl
		movq_r2m(mm3, *cfl);      // put cfl
//		movq_r2m(mm3, mcfl);    // hold cfl

	movq_m2r(pv256_mask, mm3);  // mm3 pdf == 256 mask
	pand_r2r(mm2, mm3);         // mm3 (q<=thr && q>=-thr) && pdf==256 mask
	movq_r2r(mm3, mm4);         // copy (q<=thr && q>=-thr) && pdf==256 mask
	pand_r2r(mm0, mm4);         // mm4 sp
	pandn_r2r(mm2, mm3);        // mm3 (q<=thr&&q>=-thr) && !(pdf==256) mask
	pand_r2r(mm1, mm3);         // mm3 (pp+sp+1)/2
	por_r2r(mm4, mm3);          // mm3 (q<=thr&&q>=-thr) dst values
	por_m2r(mdst, mm3);         // mm3 dst values
//	movq_r2m(mm3, mdst);        // update dst values

	// else
	pand_r2r(mm7, mm0);         // mm0 sp & dst mask
	por_r2r(mm3, mm0);          // mm0 dst values
	pxor_r2r(mm1, mm1);
	packuswb_r2r(mm1, mm0);     // word to byte
	movd_r2m(mm0, *dp);         // put dst

	dp += W;
	sp += W;
	pp += W;
	ap += W;
	pdf += W;
	pfl += W;
	cdf += W;
	cfl += W;
      }
      // second pass works on the U plane
      dp = dst[U] + soff;
      sp = src[U] + soff;
      ap = avg[U] + poff;
      pp = pre_u + poff;
      pdf = dif_u[1];
      cdf = dif_u[0];
      pfl = flg_u[1];
      cfl = flg_u[0];
      pdf += poff;
      cdf += soff;
      pfl += poff;
      cfl += soff;
    }
  }
//  emms();
}

/*
 * Blur one plane with a separable 3-tap [1 2 1]/4 filter.
 *
 * dst, src:      destination and source plane, both `width` bytes per row.
 * width, height: plane size in samples.
 *
 * Each source row is first filtered horizontally into a 16-bit line
 * buffer (the first and last column use a 2-tap average with their only
 * neighbour). Three line buffers are rotated while the vertical pass
 * combines them; the first and last output rows are the average of the
 * two nearest filtered rows.
 *
 * The vertical passes move four samples per step, so width is expected
 * to be a multiple of 4 (NOTE(review): not checked here - confirm that
 * callers guarantee it). The line buffers live on the stack
 * (3 * width shorts). Leaves the MMX state dirty (no emms here).
 */
static void
Blur_frame_mmxext(uint8_t *dst, uint8_t *src, int width, int height)
{
  int x, y;
  uint8_t *dp;
  uint8_t *sp;
  int W = width;
  int H = height;
  short *l[3], *tp;
  short	*lp0, *lp1, *lp2;
  short lbuf[width * 3];
  uint8_t *sp0, *sp1, *sp2;
  uint8_t *tdp;

  l[0] = lbuf;
  l[1] = lbuf + width;
  l[2] = lbuf + width * 2;

  W -= 1;                       // index of the last column
  sp = src;
  pxor_r2r(mm3, mm3);           // mm3: zero (kept for the whole function)

  // horizontal pass for source rows 0 and 1 into l[0] and l[1]
  for (y = 0; y < 2; y++) {
    sp0 = sp;
    sp1 = sp0+1;
    sp2 = sp1+1;
    lp0 = l[y];
    x = 0;
    *lp0 = (*sp0 + *sp1) >> 1;  // first column: 2-tap average
    x++;
    lp0++;
    for (; x < (W-4); x += 4) {
	movd_m2r(*sp0, mm0);
	movd_m2r(*sp1, mm1);
	movd_m2r(*sp2, mm2);
	punpcklbw_r2r(mm3, mm0);
	punpcklbw_r2r(mm3, mm1);
	punpcklbw_r2r(mm3, mm2);
	paddusw_r2r(mm2, mm0);
	paddusw_r2r(mm1, mm0);
	paddusw_r2r(mm1, mm0);    // left + 2*centre + right
	psrlw_i2r(2, mm0);
	movq_r2m(mm0, *lp0);
	sp0 += 4;
	sp1 += 4;
	sp2 += 4;
	lp0 += 4;
    }
    for (; x < W; x++) {
      *lp0 = (*sp0 + *sp1 * 2 + *sp2) >> 2;
      sp0++;
      sp1++;
      sp2++;
      lp0++;
    }
    *lp0 = (*sp0 + *sp1) >> 1;  // last column: 2-tap average
    sp += width;
  }

  // first output row: average of filtered rows 0 and 1
  dp = dst;
  tdp = dp;
  lp0 = l[0];
  lp1 = l[1];
  for (x = 0; x < width; x+=4) {
    movq_m2r(*lp0, mm0);
    paddusw_m2r(*lp1, mm0);
    psrlw_i2r(1, mm0);
    packuswb_r2r(mm3, mm0);
    movd_r2m(mm0, *tdp);
    tdp += 4;
    lp0 += 4;
    lp1 += 4;
  }
  dp += width;

  H -= 1;
  for (y = 1; y < H; y++) {
    // horizontal pass for source row y+1 into l[2].
    // (The original wrapped this in a `for (x = 0; x < W; x++)` loop whose
    // counter was reset inside the body, so it always ran exactly once;
    // the wrapper has been removed.)
    sp0 = sp;
    sp1 = sp0+1;
    sp2 = sp1+1;
    lp2 = l[2];
    x = 0;
    *lp2 = (*sp0 + *sp1) >> 1;
    x++;
    lp2++;
    for (; x < (W-4); x += 4) {
	movd_m2r(*sp0, mm0);
	movd_m2r(*sp1, mm1);
	movd_m2r(*sp2, mm2);
	punpcklbw_r2r(mm3, mm0);
	punpcklbw_r2r(mm3, mm1);
	punpcklbw_r2r(mm3, mm2);
	paddusw_r2r(mm2, mm0);
	paddusw_r2r(mm1, mm0);
	paddusw_r2r(mm1, mm0);
	psrlw_i2r(2, mm0);
	movq_r2m(mm0, *lp2);
	sp0 += 4;
	sp1 += 4;
	sp2 += 4;
	lp2 += 4;
    }
    for (; x < W; x++) {
      *lp2 = (*sp0 + *sp1 * 2 + *sp2) >> 2;
      sp0++;
      sp1++;
      sp2++;
      lp2++;
    }
    *lp2 = (*sp0 + *sp1) >> 1;
    sp += width;

    // vertical pass: (above + 2*current + below) / 4
    lp0 = l[0];
    lp1 = l[1];
    lp2 = l[2];
    tdp = dp;
    for (x = 0; x < width; x+=4) {
	movq_m2r(*lp0, mm0);
	movq_m2r(*lp1, mm1);
	paddusw_m2r(*lp2, mm0);
	paddusw_r2r(mm1, mm0);
	paddusw_r2r(mm1, mm0);
	psrlw_i2r(2, mm0);
	packuswb_r2r(mm3, mm0);
	movd_r2m(mm0, *tdp);
	lp0 += 4;
	lp1 += 4;
	lp2 += 4;
	tdp += 4;
    }
    dp += width;
    // rotate the line buffers: the oldest one is reused for the next row
    tp = l[0]; l[0] = l[1]; l[1] = l[2]; l[2] = tp;
  }

  // last output row: average of the two most recent filtered rows
  tdp = dp;
  lp0 = l[0];
  lp1 = l[1];
  for (x = 0; x < width; x+=4) {
    movq_m2r(*lp0, mm0);
    paddusw_m2r(*lp1, mm0);
    psrlw_i2r(1, mm0);
    packuswb_r2r(mm3, mm0);
    movd_r2m(mm0, *tdp);
    tdp += 4;
    lp0 += 4;
    lp1 += 4;
  }
}

/*
 * Build a motion-compensated frame from avg[] using one vector per block.
 *
 * dst, avg:      plane pointer arrays indexed by Y/V/U.
 * width, height: luma size; blocks are visited in raster order and the
 *                vectors in vec[] are consumed in the same order.
 * size:          ignored - overwritten with 8 below (chroma blocks use 4).
 * vec:           array of block vectors.
 *
 * Each vector component is split into v/2 (qx, qy) and the remainder
 * (sx, sy); the output block is the byte-wise average (pavgb) of the
 * block at (qx, qy) and the block at (qx+sx, qy+sy). With a zero
 * remainder both blocks coincide.
 * Leaves the MMX state dirty (no emms here).
 */
static void
Mv_blocks_mmxext(uint8_t *dst[], uint8_t *avg[], int width, int height, int size, VECTOR *vec)
{
  int W = width;
  int H = height;
  int x, y, yy, uv;
  VECTOR *v = vec;
  uint8_t *ap0, *ap1, *dp;
  int qx, qy, sx, sy;
  int UV;
  int W2, H2, size2;

  // NOTE(review): the caller-supplied size is discarded here
  size = 8;
  size2 = 4;

  // luma: 8x8 blocks
  for (y = 0; y < H; y+=size) {
    for (x = 0; x < W; x+=size, v++) {
      qx = v->x/2;
      qy = v->y/2;
      sx = v->x - (qx<<1);
      sy = v->y - (qy<<1);
      dp = dst[Y] + (x) + (y) * W;
      ap0 = avg[Y] + (x+qx) + (y+qy) * W;
      ap1 = avg[Y] + (x+qx+sx) + (y+qy+sy) * W;
      for (yy = 0; yy < size; yy++) {
	movq_m2r(*ap0, mm0);
	pavgb_m2r(*ap1, mm0);
	movq_r2m(mm0, *dp);
	dp += W;
	ap0 += W;
	ap1 += W;
      }
    }
  }
  // chroma: 4x4 blocks at half the luma coordinates, V first and then U
  W2 = W/2;
  H2 = H/2;                     // not used afterwards
  size2 = size/2;
  UV = V;
  for (uv = 2; uv > 0; uv--) {
    v = vec;
    for (y = 0; y < H; y+=size) {
      for (x = 0; x < W; x+=size, v++) {
	qx = v->x/2;
	qy = v->y/2;
	sx = v->x - (qx<<1);
	sy = v->y - (qy<<1);
	ap0 = avg[UV] + (x+qx)/2 + (y+qy)/2 * W2;
	ap1 = avg[UV] + (x+(qx+sx))/2 + (y+(qy+sy))/2 * W2;
	dp = dst[UV] + (x)/2 + (y)/2 * W2;
	for (yy = 0; yy < size2; yy++) {
	  movd_m2r(*ap0, mm0);
	  movd_m2r(*ap1, mm1);
	  pavgb_r2r(mm1, mm0);
	  movd_r2m(mm0, *dp);
	  dp += W2;
	  ap0 += W2;
	  ap1 += W2;
	}
      }
    }
    UV = U;
  }
}

/*
 * Update the running average frame.
 *
 * dst, avg, ref: plane pointer arrays indexed by Y/V/U.
 * width, height: luma size; chroma planes use width/2 x height/2.
 *
 * Per sample: dst = (ref + ((avg << 8) * mdelay >> 16)) / (Delay + 1).
 * mdelay is filled in elsewhere (presumably Delay * 256 per word, which
 * makes this (ref + avg * Delay) / (Delay + 1) - confirm).
 * Four samples are processed per step, so row widths are expected to be
 * multiples of 4. Leaves the MMX state dirty (no emms here).
 *
 * The four result bytes are packed with unsigned arithmetic and stored
 * with memcpy: the previous `tv << 24` on a signed int overflowed for
 * samples >= 128, and the `*(int *)` store was not alias/alignment safe.
 */
static void
Avg_frame_mmxext(uint8_t *dst[], uint8_t *avg[], uint8_t *ref[], int width, int height)
{
  int W = width;
  int H = height;
  int x, y, uv;
  uint8_t *ap, *dp, *rp;
  int t1 = Delay+1;
  uint8_t *trp, *tap, *tdp;
  int tv;
  uint32_t v;

  dp = dst[Y];
  ap = avg[Y];
  rp = ref[Y];
  movq_m2r(mdelay, mm7);        // mm7: averaging weight
  pxor_r2r(mm6, mm6);           // mm6: zero
  for (y = 0; y < H; y++) {
    trp = rp;
    tap = ap;
    tdp = dp;
    for (x = 0; x < W; x+=4) {
	movd_m2r(*trp, mm0);
	trp += 4;
	movd_m2r(*tap, mm2);
	tap += 4;
	punpcklbw_r2r(mm6, mm0);  // ref samples as words
	pxor_r2r(mm1, mm1);
	punpcklbw_r2r(mm2, mm1);  // avg samples in the high byte of each word
	pmulhuw_r2r(mm7, mm1);    // weighted avg
	paddusw_r2r(mm1, mm0);    // ref + weighted avg
	movq_r2r(mm0, mm1);
	punpcklwd_r2r(mm6, mm0);  // sums 0,1 as dwords
	punpckhwd_r2r(mm6, mm1);  // sums 2,3 as dwords

	// divide each sum in scalar code and pack the four bytes
	movd_r2m(mm0, tv);
	tv /= t1;
	v = (uint32_t)tv;
	psrlq_i2r(32, mm0);
	movd_r2m(mm0, tv);
	tv /= t1;
	v |= ((uint32_t)tv<<8);

	movd_r2m(mm1, tv);
	tv /= t1;
	v |= ((uint32_t)tv<<16);
	psrlq_i2r(32, mm1);
	movd_r2m(mm1, tv);
	tv /= t1;
	v |= ((uint32_t)tv<<24);

	memcpy(tdp, &v, sizeof v);
	tdp += 4;
    }
    dp += W;
    ap += W;
    rp += W;
  }

  // chroma planes: V first, then U
  W /= 2;
  H /= 2;
  dp = dst[V];
  ap = avg[V];
  rp = ref[V];
  for (uv = 2; uv > 0; uv--) {
    for (y = 0; y < H; y++) {
      trp = rp;
      tap = ap;
      tdp = dp;
      for (x = 0; x < W; x+=4) {
	movd_m2r(*trp, mm0);
	trp += 4;
	movd_m2r(*tap, mm2);
	tap += 4;
	punpcklbw_r2r(mm6, mm0);
	pxor_r2r(mm1, mm1);
	punpcklbw_r2r(mm2, mm1);
	pmulhuw_r2r(mm7, mm1);
	paddusw_r2r(mm1, mm0);
	movq_r2r(mm0, mm1);
	punpcklwd_r2r(mm6, mm0);
	punpckhwd_r2r(mm6, mm1);

	movd_r2m(mm0, tv);
	tv /= t1;
	v = (uint32_t)tv;
	psrlq_i2r(32, mm0);
	movd_r2m(mm0, tv);
	tv /= t1;
	v |= ((uint32_t)tv<<8);

	movd_r2m(mm1, tv);
	tv /= t1;
	v |= ((uint32_t)tv<<16);
	psrlq_i2r(32, mm1);
	movd_r2m(mm1, tv);
	tv /= t1;
	v |= ((uint32_t)tv<<24);

	memcpy(tdp, &v, sizeof v);
	tdp += 4;
      }
      dp += W;
      ap += W;
      rp += W;
    }
    dp = dst[U];
    ap = avg[U];
    rp = ref[U];
  }
}  

/*
 * Blend the reference frame back into the averaged frame where the two
 * differ by more than a threshold.
 *
 * dst, avg, ref: plane pointer arrays indexed by Y/V/U.
 * width, height: luma size; chroma planes use width/2 x height/2.
 *
 * Per sample, with d = |ref - avg|:
 *   c   = min(256, max(0, d - thre) * per_thre)
 *   dst = (ref * c + avg * (256 - c)) >> 8
 * thre and per_thre are filled in elsewhere (the disabled code below
 * suggests per_thre = 256 / Threshold - confirm).
 * Four samples are processed per step, so row widths are expected to be
 * multiples of 4. Leaves the MMX state dirty (no emms here).
 */
static void
Correct_frame2_mmxext(uint8_t *dst[], uint8_t *avg[], uint8_t *ref[], int width, int height)
{
  int W = width;
  int H = height;
  int x, y, uv;
  uint8_t *dp, *ap, *rp, *r, *a, *d;

#if 0
  for (x = 0; x < 4; x++) {
    per_thre.uw[x] = 256/Threshold;
  }
#endif

  dp = dst[Y];
  ap = avg[Y];
  rp = ref[Y];
  movq_m2r(thre, mm7);          // mm7: threshold
  movq_m2r(per_thre, mm6);      // mm6: slope of the blend weight
  movq_m2r(w256, mm5);          // mm5: maximum weight (256)
  for (y = 0; y < H; y++) {
    r = rp;
    a = ap;
    d = dp;
    for (x = 0; x < W; x+=4) {
	movd_m2r(*r, mm0);    // mm0 ref 4 pixels
	r += 4;
	pxor_r2r(mm4, mm4);
	movd_m2r(*a, mm1);    // mm1 avg 4 pixels
	a += 4;
	punpcklbw_r2r(mm4, mm0); // byte to word
	punpcklbw_r2r(mm4, mm1); // byte to word

	movq_r2r(mm0, mm2);
	movq_r2r(mm1, mm3);
	psubusw_r2r(mm0, mm3);
	psubusw_r2r(mm1, mm2);
	por_r2r(mm3, mm2);       // mm2 4 sad values

	psubusw_r2r(mm7, mm2);   // under threshold to 0
	pmullw_r2r(mm6, mm2);    // mul coefs
	pminsw_r2r(mm5, mm2);

	movq_r2r(mm5, mm3);      // copy coef_max to mm3
	psubusw_r2r(mm2, mm3);   // mm3 avg coefficents

	pmullw_r2r(mm2, mm0);    // mul ref values
	pmullw_r2r(mm3, mm1);    // mul avg values

	paddusw_r2r(mm1, mm0);   // add ref avg
	psrlw_i2r(8, mm0);       // div 256, mm0 result

	pxor_r2r(mm4, mm4);
	packuswb_r2r(mm4, mm0);  // mm0 low dw result 4 pixels in byte

	movd_r2m(mm0, *d);
	d += 4;
    }
    rp += W;
    ap += W;
    dp += W;
  }

  // chroma planes: V first, then U
  W /= 2;
  H /= 2;
  dp = dst[V];
  ap = avg[V];
  rp = ref[V];
  for (uv = 2; uv > 0; uv--) {
    for (y = 0; y < H; y++) {
      r = rp;
      a = ap;
      d = dp;
      for (x = 0; x < W; x+=4) {
	movd_m2r(*r, mm0);    // mm0 ref 4 pixels
	r += 4;
	pxor_r2r(mm4, mm4);
	movd_m2r(*a, mm1);    // mm1 avg 4 pixels
	a += 4;
	punpcklbw_r2r(mm4, mm0); // byte to word
	punpcklbw_r2r(mm4, mm1); // byte to word

	movq_r2r(mm0, mm2);
	movq_r2r(mm1, mm3);
	psubusw_r2r(mm0, mm3);
	psubusw_r2r(mm1, mm2);
	por_r2r(mm3, mm2);       // mm2 4 sad values

	psubusw_r2r(mm7, mm2);   // under threshold to 0
	pmullw_r2r(mm6, mm2);    // mul coefs
	pminsw_r2r(mm5, mm2);

	movq_r2r(mm5, mm3);      // copy coef_max to mm3
	psubusw_r2r(mm2, mm3);   // mm3 avg coefficents

	pmullw_r2r(mm2, mm0);    // mul ref values
	pmullw_r2r(mm3, mm1);    // mul avg values

	paddusw_r2r(mm1, mm0);   // add ref avg
	psrlw_i2r(8, mm0);       // div 256, mm0 result

	pxor_r2r(mm4, mm4);
	packuswb_r2r(mm4, mm0);  // mm0 low dw result 4 pixels in byte

	movd_r2m(mm0, *d);
	d += 4;
      }
      rp += W;
      ap += W;
      dp += W;
    }
    dp = dst[U];
    ap = avg[U];
    rp = ref[U];
  }
}

/*
 * Second denoise pass: blend src into dst in place.
 *
 * dst, src: plane pointer arrays indexed by Y/V/U; the luma plane is
 *           Width x Height, chroma planes are (Width/2) x (Height/2).
 *           Each plane is walked as one contiguous run of samples.
 *
 * Per sample:
 *   b   = (dst * 20 + src * 12) >> 5
 *   c   = min(256, |src - b| * per_thre_pp)
 *   dst = (src * c + b * (256 - c)) >> 8
 * per_thre_pp is filled in elsewhere (the disabled code below suggests
 * 256 / pp_threshold - confirm). thre_pp is loaded into mm7 but not
 * referenced by the loops.
 * Four samples are processed per step. Leaves the MMX state dirty.
 */
static void
Denoise_frame_pass2_mmxext (uint8_t *dst[], uint8_t *src[])
{
  int W = Width;
  int H = Height;
  int c, uv;
  uint8_t *dp, *sp;

#if 0
  static mmx_t thre_pp;
  static mmx_t per_thre_pp;

  for (c = 0; c < 4; c++) {
    thre_pp.w[c] = pp_threshold;
  }
  for (c = 0; c < 4; c++) {
    per_thre_pp.w[c] = 256/pp_threshold;
  }
#endif

  /* blend frame with error threshold */

  /* Y */
  dp = dst[Y];
  sp = src[Y];
  movq_m2r(thre_pp, mm7);
  movq_m2r(per_thre_pp, mm6);   // mm6: slope of the blend weight
  movq_m2r(w256, mm5);          // mm5: maximum weight (256)
  for (c = 0; c < (W*H); c+=4)
  {
	movd_m2r(*sp, mm0);    // mm0 ref 4 pixels
	sp += 4;
	pxor_r2r(mm4, mm4);
	punpcklbw_r2r(mm4, mm0); // byte to word
	movd_m2r(*dp, mm1);    // mm1 avg 4 pixels
	punpcklbw_r2r(mm4, mm1); // byte to word

	pmullw_m2r(w10, mm1);
	paddusw_r2r(mm1, mm1);   // dst value * 20
	movq_r2r(mm0, mm2);
	pmullw_m2r(w12, mm2);   // src value * 12
	paddusw_r2r(mm2, mm1);
	psrlw_i2r(5, mm1);       // (dst + src) / 32

	movq_r2r(mm0, mm2);
	movq_r2r(mm1, mm3);
	psubusw_r2r(mm0, mm3);
	psubusw_r2r(mm1, mm2);
	por_r2r(mm3, mm2);       // mm2 4 sad values

	pmullw_r2r(mm6, mm2);
	pminsw_r2r(mm5, mm2);
	movq_r2r(mm5, mm3);      // copy coef_max to mm3
	psubusw_r2r(mm2, mm3);   // mm3 avg coefficents

	pmullw_r2r(mm2, mm0);    // mul ref values
	pmullw_r2r(mm3, mm1);    // mul avg values

	paddusw_r2r(mm1, mm0);   // add ref avg
	psrlw_i2r(8, mm0);       // div 256, mm0 result

	pxor_r2r(mm4, mm4);
	packuswb_r2r(mm4, mm0);  // mm0 low dw result 4 pixels in byte

	movd_r2m(mm0, *dp);

	dp += 4;
  }
  
  W >>= 1;
  H >>= 1;

  /* Cr and Cb */
  dp = dst[V];
  sp = src[V];
  for (uv = 2; uv > 0; uv--) {
    for (c = 0; c < (W*H); c+=4) {
	movd_m2r(*sp, mm0);    // mm0 ref 4 pixels
	sp += 4;
	pxor_r2r(mm4, mm4);
	punpcklbw_r2r(mm4, mm0); // byte to word
	movd_m2r(*dp, mm1);    // mm1 avg 4 pixels
	punpcklbw_r2r(mm4, mm1); // byte to word

	pmullw_m2r(w10, mm1);
	paddusw_r2r(mm1, mm1);   // dst value * 20
	movq_r2r(mm0, mm2);
	pmullw_m2r(w12, mm2);   // src value * 12
	paddusw_r2r(mm2, mm1);
	psrlw_i2r(5, mm1);       // (dst + src) / 32

	movq_r2r(mm0, mm2);
	movq_r2r(mm1, mm3);
	psubusw_r2r(mm0, mm3);
	psubusw_r2r(mm1, mm2);
	por_r2r(mm3, mm2);       // mm2 4 sad values

	pmullw_r2r(mm6, mm2);
	pminsw_r2r(mm5, mm2);
	movq_r2r(mm5, mm3);      // copy coef_max to mm3
	psubusw_r2r(mm2, mm3);   // mm3 avg coefficents

	pmullw_r2r(mm2, mm0);    // mul ref values
	pmullw_r2r(mm3, mm1);    // mul avg values

	paddusw_r2r(mm1, mm0);   // add ref avg
	psrlw_i2r(8, mm0);       // div 256, mm0 result

	pxor_r2r(mm4, mm4);
	packuswb_r2r(mm4, mm0);  // mm0 low dw result 4 pixels in byte

	movd_r2m(mm0, *dp);

	dp += 4;
    }
    dp = dst[U];
    sp = src[U];
  }
}

#endif /* ARCH_X86 */

/*
 * Sum of absolute differences between two 8x8 pixel blocks.
 *
 * src0, src1: top-left pixel of each block.
 * stride:     distance in bytes between consecutive rows (shared by both).
 * Returns the sum of |src0 - src1| over all 64 pixels.
 */
static uint32_t
Calc_SAD_c(uint8_t *src0, uint8_t *src1, int stride)
{
  uint32_t sum = 0;
  int row, col;

  for (row = 8; row > 0; row--, src0 += stride, src1 += stride) {
    for (col = 0; col < 8; col++) {
      int p = src0[col];
      int r = src1[col];

      sum += (p > r) ? (p - r) : (r - p);
    }
  }
  return sum;
}

/*
 * Sum of absolute differences between two 4x4 pixel blocks
 * (the chroma-sized counterpart of the 8x8 SAD).
 *
 * src0, src1: top-left pixel of each block.
 * stride:     distance in bytes between consecutive rows (shared by both).
 * Returns the sum of |src0 - src1| over all 16 pixels.
 */
static uint32_t
Calc_SAD_uv_c(uint8_t *src0, uint8_t *src1, int stride)
{
  uint32_t sum = 0;
  int row, col;

  for (row = 4; row > 0; row--, src0 += stride, src1 += stride) {
    for (col = 0; col < 4; col++) {
      int p = src0[col];
      int r = src1[col];

      sum += (p > r) ? (p - r) : (r - p);
    }
  }
  return sum;
}

/*
 * 8x8 SAD between a reference block and the per-pixel average of two
 * candidate blocks (used to score a position halfway between them).
 *
 * ref:        top-left pixel of the reference block.
 * avg0, avg1: top-left pixels of the two blocks that are averaged.
 * stride:     distance in bytes between consecutive rows (shared by all).
 * Returns the sum of |((avg0 + avg1) >> 1) - ref| over all 64 pixels.
 */
static uint32_t
Calc_SAD_half_c(uint8_t *ref, uint8_t *avg0, uint8_t *avg1, int stride)
{
  uint32_t sum = 0;
  int row, col;

  for (row = 0; row < 8; row++) {
    for (col = 0; col < 8; col++) {
      int mid = (avg0[col] + avg1[col]) >> 1;
      int r = ref[col];

      sum += (mid > r) ? (mid - r) : (r - mid);
    }
    ref  += stride;
    avg0 += stride;
    avg1 += stride;
  }
  return sum;
}

/*
 * Decide whether the 8x8 luma block at (x, y), together with its two
 * 4x4 chroma blocks, differs only slightly between the two frames.
 *
 * A pixel counts as "different" when its absolute difference exceeds
 * 2*Threshold/3.  Luma and both chroma planes feed the same counter.
 *
 * re, av: plane pointer arrays (indexed by Y, V, U) of the two frames.
 * x, y:   luma coordinates of the block's top-left pixel.
 * Returns 1 when at most 8 pixels are different, 0 otherwise.
 */
static int
Low_contrast_block_c(uint8_t *re[], uint8_t *av[], int x, int y)
{
  const int threshold = Threshold;
  const int limit = 2 * threshold / 3;
  int stride = Width;
  int over = 0;
  int row, col, pass, diff;
  uint8_t *r, *a;

  /* luma: 8x8 */
  r = re[Y] + x + stride * y;
  a = av[Y] + x + stride * y;
  for (row = 0; row < 8; row++, r += stride, a += stride) {
    for (col = 0; col < 8; col++) {
      diff = r[col] - a[col];
      if (diff < 0)
        diff = -diff;
      if (diff > limit)
        over++;
    }
  }

  /* chroma planes are half size in both directions: 4x4, V then U */
  stride >>= 1;
  x >>= 1;
  y >>= 1;

  for (pass = 0; pass < 2; pass++) {
    const int idx = pass ? U : V;

    r = re[idx] + x + y * stride;
    a = av[idx] + x + y * stride;
    for (row = 0; row < 4; row++, r += stride, a += stride) {
      for (col = 0; col < 4; col++) {
        diff = r[col] - a[col];
        if (diff < 0)
          diff = -diff;
        if (diff > limit)
          over++;
      }
    }
  }

  if (over > 8)
    return 0;
  return 1;
}

/*
 * Produce a half-resolution copy of a planar 4:2:0 frame by averaging
 * every 2x2 pixel group (sum of four pixels >> 2).
 *
 * dst, src: plane pointer arrays indexed by Y, V, U.
 * width, height: luma dimensions of the source frame; the chroma planes
 *                are treated as width/2 wide.
 */
static void
Subsample_frame_c(uint8_t *dst[], uint8_t *src[], int width, int height)
{
  int w = width;
  int h = height >> 1;          /* number of output rows for luma */
  int pass;

  for (pass = 0; pass < 3; pass++) {
    const int idx = (pass == 0) ? Y : ((pass == 1) ? V : U);
    uint8_t *top, *bottom, *out;
    int row, col;

    if (pass == 1) {
      /* switch from luma to chroma geometry (applies to V and U) */
      w >>= 1;
      h >>= 1;
    }

    top    = src[idx];
    bottom = src[idx] + w;
    out    = dst[idx];

    for (row = 0; row < h; row++) {
      for (col = 0; col < w; col += 2) {
        out[col >> 1] = (top[col] + top[col + 1] +
                         bottom[col] + bottom[col + 1]) >> 2;
      }
      top    += w << 1;         /* skip the two source rows just used */
      bottom += w << 1;
      out    += w >> 1;
    }
  }
}

/*
 * Sharpen the luma plane (only dst[Y]/src[Y] are touched).
 *
 * For every pixel the mean m of the 2x2 group formed with its right,
 * lower and lower-right neighbours is computed; the output is
 * m + (pixel - m) * Sharpen / 100, clamped to [Y_LO_LIMIT, Y_HI_LIMIT].
 * Frame dimensions come from the file-level Width/Height.
 *
 * The pixel itself is read before it is written and the neighbours
 * lie ahead of the write position, so dst[Y] may alias src[Y].
 *
 * Fix: the lower-row pointer used to advance once too often (guard was
 * y < H-1), so the last row was averaged with the row *after* the
 * plane.  It now stops on the last row, which is then paired with
 * itself.  A one-row frame is handled the same way.
 *
 * NOTE: on the last column the "right" neighbour (x+1) is still the
 * first pixel of the following row, as in the original code.
 */
static void
Sharpen_frame_c(uint8_t *dst[], uint8_t *src[])
{
  int H = Height;
  int W = Width;
  int x, y;
  int m, d;
  int sharpen = Sharpen;
  uint8_t *dp, *sp0, *sp1;

  dp = dst[Y];
  sp0 = src[Y];
  /* with a single row there is no lower row: pair the row with itself */
  sp1 = (H > 1) ? src[Y] + W : src[Y];
  for (y = 0; y < H; y++) {
    for (x = 0; x < W; x++) {
      m = (sp0[x] + sp0[x+1] + sp1[x] + sp1[x+1]) / 4;
      d = sp0[x] - m;
      d *= sharpen;
      d /= 100;
      m = m+d;
      m = (m > Y_HI_LIMIT) ? Y_HI_LIMIT : m;
      m = (m < Y_LO_LIMIT) ? Y_LO_LIMIT : m;
      dp[x] = m;
    }
    dp += W;
    sp0 = sp1;
    /* sp1 is row y+1 here; only advance while row y+2 still exists */
    if (y < (H-2))
      sp1 += W;
  }
}

/*
 * Chroma de-flicker for one block.
 *
 * Processes the 4x4 V block and then the 4x4 U block that correspond to
 * the luma block at (x, y).  Besides the output pixels it maintains two
 * per-pixel state planes for each chroma plane:
 *   dif_v/dif_u[0] (current) and [1] (previous): stored difference value,
 *     where 256 is used as a "no valid difference" marker;
 *   flg_v/flg_u[0] (current) and [1] (previous): small state flag (0/1/2).
 *
 * dst:   output chroma planes (indexed by V, U).
 * avg:   chroma planes read at the motion-displaced offset.
 * src:   current input chroma planes.
 * x, y:  luma coordinates of the block.
 * width: luma width; the chroma stride is width/2.
 * vec:   block vector; x2/y2 give the displacement used for the
 *        "previous" data and SAD2 selects between the two paths below.
 *
 * "Previous" data (pre_v/pre_u, avg, dif[1], flg[1]) is read at the
 * displaced offset poff; "current" data is read/written at soff.
 */
static void
//UV_deflick_block_c(uint8_t *dst[], uint8_t *avg[], uint8_t *src[], int x, int y, int width, int height, VECTOR *vec)
UV_deflick_block_c(uint8_t *dst[], uint8_t *avg[], uint8_t *src[], int x, int y, int width, VECTOR *vec)
{
  int W = width / 2;
  int xx, yy, uv, cv, pv;
  uint8_t *dp, *sp, *pp, *ap;
  short *pdf, *cdf, *pfl, *cfl;
  int thr = Threshold_uv;
  int q;
  /* chroma offset of the displaced (previous-frame) block */
  int poff = ((x+(vec->x2*2))/2) + ((y+(vec->y2*2))/2) * W;
  /* chroma offset of the block in the current frame */
  int soff = x/2 + y/2*W;

  dp = dst[V] + soff;
  sp = src[V] + soff;
  pp = pre_v + poff;
  ap = avg[V] + poff;
  pdf = dif_v[1];
  cdf = dif_v[0];
  pfl = flg_v[1];
  cfl = flg_v[0];

  pdf += poff;
  cdf += soff;
  pfl += poff;
  cfl += soff;
  if ((int)(vec->SAD2) > (96*thr)) {
    /* SAD2 too large: pass the source through and reset the state
     * (difference = 256 marker, flag = 0) for V, then U. */
    for (uv = 2; uv > 0; uv--) {
      for (yy = 0; yy < 4; yy++) {
        for (xx = 0; xx < 4; xx++) {
	  dp[xx] = sp[xx];
	  cdf[xx] = 256;
	  cfl[xx] = 0;
	}
	dp += W;
	sp += W;
	cdf += W;
	cfl += W;
      }
      /* second iteration works on the U plane */
      dp = dst[U] + soff;
      sp = src[U] + soff;
      cdf = dif_u[0];
      cfl = flg_u[0];
      cdf += soff;
      cfl += soff;
    }
  } else {
    for (uv = 2; uv > 0; uv--) {
      for (yy = 0; yy < 4; yy++) {
	for (xx = 0; xx < 4; xx++) {
	  /* cv: previous pixel minus current pixel */
	  cv = (int)pp[xx] - (int)sp[xx];
	  /* pv: difference stored for the previous frame */
	  pv = pdf[xx];
	  /* q: previous and current differences combined; 0 when the
	   * previous difference is the 256 marker */
	  if (pv == 256)
	    q = 0;
	  else
	    q = pv + cv;

	  if (cv <= thr && cv >= -thr) {
	    /* current change is within the threshold */
	    if (pfl[xx] == 1) {
	      /* previous flag was 1: output the avg pixel, set flag 2
	       * and carry the previous difference forward */
	      dp[xx] = ap[xx];
	      cfl[xx] = 2;
	      cv = pv;
	    } else {
	      /* rounded mean of previous and current pixel */
	      dp[xx] = (pp[xx] + sp[xx] + 1) / 2;
	      cfl[xx] = 0;
	    }
	  } else if ((q <= thr) && (q >= -thr)) {
	    /* change is large, but combined with the previous
	     * difference it is within the threshold: set flag 1 */
	    if (pv == 256) {
	      dp[xx] = sp[xx];
	    } else {
	      dp[xx] = (pp[xx] + sp[xx] + 1) / 2;
	    }
	    cfl[xx] = 1;
	  } else {
	    /* neither condition holds: keep the source pixel */
	    dp[xx] = sp[xx];
	    cfl[xx] = 0;
	  }
	  cdf[xx] = cv;
	}
	dp += W;
	sp += W;
	pp += W;
	ap += W;
	pdf += W;
	pfl += W;
	cdf += W;
	cfl += W;
      }
      /* second iteration works on the U plane */
      dp = dst[U] + soff;
      sp = src[U] + soff;
      ap = avg[U] + poff;
      pp = pre_u + poff;
      pdf = dif_u[1];
      cdf = dif_u[0];
      pfl = flg_u[1];
      cfl = flg_u[0];
      pdf += poff;
      cdf += soff;
      pfl += poff;
      cfl += soff;
    }
  }
}

#if 0
/*
 * Disabled: clears the whole flag_* / diff_* state buffers used by the
 * chroma de-flicker.  The element count (buf_size*2) matches the one
 * used when the buffers are allocated and cleared in Initialize().
 */
static void
UV_deflick_reset(int width, int height)
{
	int W2 = width/2;
	int H2 = height/2;
	int c;
	int buf_size = W2 * (border + H2);

	for (c = 0; c < (buf_size*2); c++) {
		flag_v[c] = 0;
		flag_u[c] = 0;
		diff_v[c] = 0;
		diff_u[c] = 0;
	}
}
#endif

/*
 * Horizontal pass of the blur for one row of `width` (>= 2) pixels:
 * [1 2 1]/4 for interior pixels, [1 1]/2 at both ends.
 */
static void
Blur_row_c(int *out, const uint8_t *in, int width)
{
  int last = width - 1;
  int x;

  out[0] = (in[0] + in[1]) >> 1;
  for (x = 1; x < last; x++) {
    out[x] = (in[x-1] + in[x]*2 + in[x+1]) >> 2;
  }
  out[last] = (in[last-1] + in[last]) >> 1;
}

/*
 * Separable 3x3 blur of a single plane.
 *
 * Each row is filtered horizontally with [1 2 1]/4 ([1 1]/2 at the row
 * ends); the filtered rows are then combined vertically with [1 2 1]/4
 * ([1 1]/2 for the first and last output row).  Three row buffers are
 * rotated so every source row is filtered exactly once.
 *
 * dst, src:      destination and source planes, both width*height
 *                bytes with a row stride of `width`; must not overlap.
 * width, height: plane dimensions.
 *
 * Changes from the previous version: the stray outer `for (x ...)` loop
 * around the per-row filter (which only ran once by accident) is gone,
 * and planes narrower or shorter than 2 pixels are copied unchanged
 * instead of reading outside the plane.
 */
static void
Blur_frame_c(uint8_t *dst, uint8_t *src, int width, int height)
{
  int x, y;
  uint8_t *dp;
  uint8_t *sp;
  int *l[3], *tp;

  if (width < 2 || height < 2) {
    /* nothing to blur with; the filter needs two pixels each way */
    if (width > 0 && height > 0)
      memcpy(dst, src, (size_t)width * (size_t)height);
    return;
  }

  int lbuf[width * 3];

  l[0] = lbuf;
  l[1] = lbuf + width;
  l[2] = lbuf + width * 2;

  /* prime the buffers with the first two filtered rows */
  sp = src;
  Blur_row_c(l[0], sp, width);
  sp += width;
  Blur_row_c(l[1], sp, width);
  sp += width;

  /* first output row: only two rows available */
  dp = dst;
  for (x = 0; x < width; x++) {
    dp[x] = (l[0][x] + l[1][x] ) >> 1;
  }
  dp += width;

  /* interior rows: l[0] = row above, l[1] = this row, l[2] = row below */
  for (y = 1; y < height - 1; y++) {
    Blur_row_c(l[2], sp, width);
    sp += width;
    for (x = 0; x < width; x++) {
      dp[x] = (l[0][x] + l[1][x]*2 + l[2][x] ) >> 2;
    }
    dp += width;
    tp = l[0]; l[0] = l[1]; l[1] = l[2]; l[2] = tp;
  }

  /* last output row: l[0]/l[1] now hold the final two filtered rows */
  for (x = 0; x < width; x++) {
    dp[x] = (l[0][x] + l[1][x] ) >> 1;
  }
}

/*
 * Build a motion-compensated frame block by block.
 *
 * One VECTOR is consumed per size x size luma block, in raster order.
 * Each component of the vector is split into v/2 and the remainder
 * v%2; the block is fetched from avg at offset v/2 and again at offset
 * v/2 + v%2, and the two fetches are averaged into dst.
 *
 * The V and U planes are handled the same way with size/2 blocks, a
 * stride of width/2 and the summed luma position halved.
 *
 * dst, avg:      plane pointer arrays indexed by Y, V, U.
 * width, height: luma dimensions.
 * size:          luma block size.
 * vec:           first element of the per-block vector array.
 */
static void
Mv_blocks_c(uint8_t *dst[], uint8_t *avg[], int width, int height, int size, VECTOR *vec)
{
  const int half_w = width / 2;
  const int half_size = size / 2;
  VECTOR *mv;
  int bx, by, row, col, pass;

  /* luma */
  mv = vec;
  for (by = 0; by < height; by += size) {
    for (bx = 0; bx < width; bx += size) {
      const int fx = mv->x / 2;
      const int fy = mv->y / 2;
      const int rx = mv->x % 2;   /* remainder left after fx*2 */
      const int ry = mv->y % 2;
      uint8_t *out = dst[Y] + bx + by * width;
      uint8_t *a = avg[Y] + (bx + fx) + (by + fy) * width;
      uint8_t *b = avg[Y] + (bx + fx + rx) + (by + fy + ry) * width;

      for (row = 0; row < size; row++) {
        for (col = 0; col < size; col++) {
          out[col] = (a[col] + b[col]) >> 1;
        }
        out += width;
        a += width;
        b += width;
      }
      mv++;
    }
  }

  /* chroma: V first, then U, re-walking the same vector array */
  for (pass = 0; pass < 2; pass++) {
    const int idx = pass ? U : V;

    mv = vec;
    for (by = 0; by < height; by += size) {
      for (bx = 0; bx < width; bx += size) {
        const int fx = mv->x / 2;
        const int fy = mv->y / 2;
        const int rx = mv->x % 2;
        const int ry = mv->y % 2;
        uint8_t *a = avg[idx] + (bx + fx) / 2 + ((by + fy) / 2) * half_w;
        uint8_t *b = avg[idx] + (bx + (fx + rx)) / 2 +
                     ((by + (fy + ry)) / 2) * half_w;
        uint8_t *out = dst[idx] + bx / 2 + (by / 2) * half_w;

        for (row = 0; row < half_size; row++) {
          for (col = 0; col < half_size; col++) {
            out[col] = (a[col] + b[col]) >> 1;
          }
          out += half_w;
          a += half_w;
          b += half_w;
        }
        mv++;
      }
    }
  }
}

/*
 * Weighted temporal average of two frames:
 *   dst = (avg * Delay + ref) / (Delay + 1)
 * applied to every pixel of the Y plane and of the half-size V and U
 * planes.  Each output pixel depends only on the input pixels at the
 * same position, so dst may be the same frame as avg.
 *
 * dst, avg, ref: plane pointer arrays indexed by Y, V, U.
 * width, height: luma dimensions.
 */
static void
Avg_frame_c(uint8_t *dst[], uint8_t *avg[], uint8_t *ref[], int width, int height)
{
  const int weight = Delay;
  const int total = weight + 1;
  int pass;

  for (pass = 0; pass < 3; pass++) {
    const int idx = (pass == 0) ? Y : ((pass == 1) ? V : U);
    const int w = (pass == 0) ? width : width / 2;
    const int h = (pass == 0) ? height : height / 2;
    uint8_t *out = dst[idx];
    uint8_t *past = avg[idx];
    uint8_t *cur = ref[idx];
    int row, col;

    for (row = 0; row < h; row++) {
      for (col = 0; col < w; col++) {
        out[col] = (past[col] * weight + cur[col]) / total;
      }
      out += w;
      past += w;
      cur += w;
    }
  }
}

/*
 * Blend the motion-compensated average back towards the reference
 * wherever the two disagree too much.
 *
 * With q = |ref - avg| for a pixel:
 *   q >  2*Threshold : take the reference
 *   q >    Threshold : cross-fade, the weight of ref rising linearly
 *                      from 0 to 256 as q goes from Threshold to
 *                      2*Threshold
 *   otherwise        : take the average
 *
 * For the chroma planes the "reference" (and, when cross-fading, the
 * average) is replaced by the mean of the pixel with its upper and
 * lower neighbours, but only on rows where both neighbours exist
 * inside the plane (1 < y < H-1, the guard the cross-fade branch
 * already used).
 *
 * Fix: the q > 2*Threshold chroma branch used to read rp[x-W] and
 * rp[x+W] on every row, so the first and last chroma rows pulled in
 * bytes from outside the plane; it now falls back to the plain
 * reference pixel on those rows.
 *
 * dst, avg, ref: plane pointer arrays indexed by Y, V, U.
 * width, height: luma dimensions.
 */
static void
Correct_frame(uint8_t *dst[], uint8_t *avg[], uint8_t *ref[], int width, int height)
{
  int W = width;
  int H = height;
  int threshold = Threshold;
  int threshold2 = threshold * 2;
  int f1, f2, av, rv, x, y, uv, d, q;
  int interior;
  uint8_t *dp, *ap, *rp;

  dp = dst[Y];
  ap = avg[Y];
  rp = ref[Y];
  for (y = 0; y < H; y++) {
    for (x = 0; x < W; x++) {
      rv = rp[x];
      av = ap[x];
      d = rv - av;
      q = (d<0) ? -d:d;

      if (q > threshold2) {
        dp[x] = rv;
      } else if (q > threshold) {
        /* f1: weight of the reference, 0..256 */
        f1 = (256*(q-threshold))/threshold;
        f1 = (f1>256) ? 256:f1;
        f1 = (f1<0)   ?   0:f1;
        f2 = 256-f1;
        dp[x] = (av * f2 + rv * f1) / 256;
      } else {
        dp[x] = av;
      }
    }
    rp += W;
    ap += W;
    dp += W;
  }

  W /= 2;
  H /= 2;
  dp = dst[V];
  ap = avg[V];
  rp = ref[V];
  for (uv = 2; uv > 0; uv--) {
    for (y = 0; y < H; y++) {
      /* vertical neighbours may only be read on interior rows */
      interior = (y > 1 && y < (H-1));
      for (x = 0; x < W; x++) {
        rv = rp[x];
        av = ap[x];
        d = rv - av;
        q = (d<0) ? -d:d;

        if (q > threshold2) {
          if (interior)
            dp[x] = (rp[x] + rp[x-W] + rp[x+W]) / 3;
          else
            dp[x] = rv;
        } else if (q > threshold) {
          f1 = (256*(q-threshold))/threshold;
          f1 = (f1>256) ? 256:f1;
          f1 = (f1<0)   ?   0:f1;
          f2 = 256-f1;
          if (interior) {
            dp[x] = ((rp[x] + rp[x-W] + rp[x+W]) * f1 / 3 +
                     (ap[x] + ap[x-W] + ap[x+W]) * f2 / 3) / 256;
          } else
            dp[x] = (av * f2 + rv * f1) / 256;
        } else {
          dp[x] = av;
        }
      }
      rp += W;
      ap += W;
      dp += W;
    }
    /* second iteration works on the U plane */
    dp = dst[U];
    ap = avg[U];
    rp = ref[U];
  }
}

/*
 * Variant of the frame correction that treats all three planes alike
 * and uses a precomputed reciprocal of the threshold.
 *
 * With q = |ref - avg| - Threshold for a pixel:
 *   q >= Threshold : take the reference
 *   q >  0         : cross-fade with f1 = (q * (WEIGHTONE/Threshold))
 *                    >> (WEIGHTBITS-8) as the weight of ref out of 256
 *   otherwise      : take the average
 *
 * dst, avg, ref: plane pointer arrays indexed by Y, V, U.
 * width, height: luma dimensions; chroma planes use half of each.
 */
static void
Correct_frame2_c(uint8_t *dst[], uint8_t *avg[], uint8_t *ref[], int width, int height)
{
  const int threshold = Threshold;
  const int per_thr = WEIGHTONE/threshold;
  int pass;

  for (pass = 0; pass < 3; pass++) {
    const int idx = (pass == 0) ? Y : ((pass == 1) ? V : U);
    const int w = (pass == 0) ? width : width / 2;
    const int h = (pass == 0) ? height : height / 2;
    uint8_t *out = dst[idx];
    uint8_t *mean = avg[idx];
    uint8_t *cur = ref[idx];
    int row, col;

    for (row = 0; row < h; row++) {
      for (col = 0; col < w; col++) {
        const int rv = cur[col];
        const int av = mean[col];
        int excess = rv - av;

        if (excess < 0)
          excess = -excess;
        excess -= threshold;

        if (excess >= threshold) {
          out[col] = rv;
        } else if (excess <= 0) {
          out[col] = av;
        } else {
          int wr = excess * per_thr;   /* weight of the reference */
          int wa;

          wr >>= (WEIGHTBITS-8);
          wa = 256 - wr;
          out[col] = (av * wa + rv * wr) / 256;
        }
      }
      cur += w;
      mean += w;
      out += w;
    }
  }
}

/*
 * Second denoising pass: mix src into dst, then pull the result back
 * towards src depending on how far the two are apart.
 *
 * Per pixel:
 *   t  = (dst*2 + src) / 3
 *   d  = |t - src|
 *   f1 = clamp(255*d / Threshold_pp, 0, 255)                 (luma)
 *   f1 = clamp(255*(d - Threshold_pp) / Threshold_pp, 0, 255) (chroma)
 *   dst = (src*f1 + t*(255 - f1)) / 255
 *
 * Planes are processed as flat arrays of Width*Height (luma) and
 * (Width/2)*(Height/2) (V, then U) pixels.
 *
 * dst, src: plane pointer arrays indexed by Y, V, U; dst is updated
 *           in place.
 */
static void
Denoise_frame_pass2_c(uint8_t *dst[], uint8_t *src[])
{
  const int pp_threshold = Threshold_pp;
  int w = Width;
  int h = Height;
  int i, pass;
  int mixed, cur, dist, wsrc, wdst;
  uint8_t *out, *in;

  /* Y */
  out = dst[Y];
  in = src[Y];
  for (i = 0; i < (w*h); i++) {
    cur = in[i];
    mixed = (out[i] * 2 + cur) / 3;

    dist = mixed - cur;
    if (dist < 0)
      dist = -dist;

    wsrc = (255*dist)/pp_threshold;
    if (wsrc > 255)
      wsrc = 255;
    if (wsrc < 0)
      wsrc = 0;
    wdst = 255 - wsrc;

    out[i] = (cur*wsrc + mixed*wdst)/255;
  }

  w >>= 1;
  h >>= 1;

  /* Cr and Cb */
  for (pass = 0; pass < 2; pass++) {
    out = pass ? dst[U] : dst[V];
    in = pass ? src[U] : src[V];
    for (i = 0; i < (w*h); i++) {
      cur = in[i];
      mixed = (out[i] * 2 + cur) / 3;

      dist = mixed - cur;
      if (dist < 0)
        dist = -dist;

      /* chroma only reacts to the part of the distance above the threshold */
      wsrc = (255*(dist-pp_threshold))/pp_threshold;
      if (wsrc > 255)
        wsrc = 255;
      if (wsrc < 0)
        wsrc = 0;
      wdst = 255 - wsrc;

      out[i] = (cur*wsrc + mixed*wdst)/255;
    }
  }
}

/*
 * Copy a planar 4:2:0 frame row by row.
 *
 * The copied area is the smaller of the two sizes in each direction;
 * each frame is stepped with its own width as row stride, and the V
 * and U planes use half of every dimension.
 *
 * dst, src: plane pointer arrays indexed by Y, V, U.
 */
static void
Copy_frame(uint8_t *dst[], uint8_t *src[], int dst_width, int dst_height, int src_width, int src_height)
{
  int rows = (dst_height < src_height) ? dst_height : src_height;
  int cols = (dst_width  < src_width ) ? dst_width  : src_width;
  int pass, row;
  uint8_t *out, *in;

  for (pass = 0; pass < 3; pass++) {
    if (pass == 0) {
      out = dst[Y];
      in = src[Y];
    } else if (pass == 1) {
      /* chroma geometry, shared by V and U */
      src_width /= 2;
      dst_width /= 2;
      rows /= 2;
      cols /= 2;
      out = dst[V];
      in = src[V];
    } else {
      out = dst[U];
      in = src[U];
    }

    for (row = 0; row < rows; row++) {
      memcpy(out, in, cols);
      out += dst_width;
      in += src_width;
    }
  }
}

/*
 * Run the complete denoising pipeline for one frame.
 *
 * Input is the frame addressed by refer[], output is written to the
 * frame addressed by dfrm[]; both are set up by the caller.  All other
 * working frames (ref, avg, avg2, tmp, tfrm, the subsampled frames,
 * the chroma de-flicker state) are file-level state.
 *
 * Steps:
 *   1. build 1/2 and 1/4 resolution copies of the reference and of the
 *      running average;
 *   2. per 8x8 block: either mark it as low contrast (zero vector,
 *      SAD 0) or run the coarse searches (Mb_search_44, Mb_search_22);
 *   3. optionally (Denoise_uv_flag) de-flicker and blur the chroma and
 *      use the result as reference chroma;
 *   4. per block that was not marked low contrast: run Mb_search_11 and
 *      Mb_search_00 on the full-size frames;
 *   5. motion-compensate the average, blend it with the reference and
 *      correct it into avg;
 *   6. post-process (pass 2 + sharpen) and copy to the output, or copy
 *      an intermediate frame when a test mode is active.
 */
static void
Denoise_frame(void)
{
  int W, H;
  int x, y;
//  int block_num = Width/8 * Height/8;
  /* count / low_count are only statistics; their sole consumer is the
   * disabled (#if 0) reset code below */
  int count = 0;
  int low_count = 0;
	short *tp;
  VECTOR *vec = vect_map;

  W = Width;
  H = Height;

  /* test mode 1 outputs the untouched input: skip all processing */
  if (test_mode == 1) {
    goto test1skip;
  }

  ref[Y] = refer[Y];
  ref[V] = refer[V];
  ref[U] = refer[U];
#if 0
  ref[Y] = refer[Y];
  if (Denoise_uv_flag) {
    Vert_blur(uv_tfrm[V], refer[V], W/2, H/2, 3);
    Vert_blur(uv_tfrm[U], refer[U], W/2, H/2, 3);
    Horz_blur(uv_frm[V], uv_tfrm[V], W/2, H/2, 3);
    Horz_blur(uv_frm[U], uv_tfrm[U], W/2, H/2, 3);
    ref[V] = uv_frm[V];
    ref[U] = uv_frm[U];
  } else {
    ref[V] = refer[V];
    ref[U] = refer[U];
  }
#endif

  /* half and quarter resolution versions of reference and average */
  Subsample_frame(sub2ref, ref, W, H);
  Subsample_frame(sub4ref, sub2ref, W/2, H/2);
  Subsample_frame(sub2avg, avg, W, H);
  Subsample_frame(sub4avg, sub2avg, W/2, H/2);

  /* coarse pass: one vector per 8x8 luma block, in raster order */
  count = 0;
  vec = vect_map;
  for (y=0; y < H; y+= 8) {
    for (x=0; x < W; x+= 8, vec++) {
      vec->x = vec->y = 0;
      vec->x2 = vec->y2 = 0;
      vec->SAD = UINT_MAX;

      if (Low_contrast_block(avg, ref, x, y)) {
        /* block barely differs: zero vector with SAD 0, which the
         * fine pass below uses as its "skip" marker */
        vec->x = vec->y = 0;
        vec->x2 = vec->y2 = 0;
        vec->SAD = 0;
        low_count++;
      } else {
   	Mb_search_44(sub4avg, sub4ref, x, y, vec);
    	Mb_search_22(sub2avg, sub2ref, x, y, vec);
      }
      if (vec->SAD > (64*255/4/2))
        count++;
    }
  }
#if 0
  if (Denoise_uv_flag && (count > (block_num/3))) {
    UV_deflick_reset(W,H);
  }
#endif

  if (Denoise_uv_flag) {
    /* de-flicker the chroma block by block into uv_tfrm, blur it into
     * uv_frm and use that as the reference chroma from here on */
    vec = vect_map;
    for (y=0; y < H; y += 8) {
      for (x=0; x < W; x+= 8, vec++) {
        UV_deflick_block(uv_tfrm, avg, refer, x, y, W, vec);
      }
    }
#if 0
    Vert_blur(uv_tfrm[V], uv_frm[V], W/2, H/2, 1);
    Vert_blur(uv_tfrm[U], uv_frm[U], W/2, H/2, 1);
    Horz_blur(uv_frm[V], uv_tfrm[V], W/2, H/2, 1);
    Horz_blur(uv_frm[U], uv_tfrm[U], W/2, H/2, 1);
#endif
    Blur_frame(uv_frm[V], uv_tfrm[V], W/2, H/2);
    Blur_frame(uv_frm[U], uv_tfrm[U], W/2, H/2);
    ref[V] = uv_frm[V];
    ref[U] = uv_frm[U];
  } else {
    ref[V] = refer[V];
    ref[U] = refer[U];
  }

  /* work on a private copy of the input luma */
  memcpy(tfrm[Y], refer[Y], size_y);
  ref[Y] = tfrm[Y];

  /* fine pass on the full-size frames; low-contrast blocks are skipped */
  vec = vect_map;
  for (y=0; y < H; y+= 8) {
    for (x=0; x < W; x+= 8, vec++) {
      if (vec->SAD == 0 && vec->x == 0 && vec->y == 0)
	continue;
      Mb_search_11(avg, ref, x, y, vec);
      Mb_search_00(avg, ref, x, y, vec);
    }
  }

  /* tmp = motion-compensated average, then blended with the reference
   * in place; the corrected result becomes the new avg */
  Mv_blocks(tmp, avg, W, H, 8, vect_map);
  Avg_frame(tmp, tmp, ref, W, H);
  if (Denoise_uv_flag)
    Correct_frame2(avg, tmp, ref, W, H);
  else
    Correct_frame(avg, tmp, ref, W, H);

test1skip:
  if (test_mode == 0) {
    /* normal output path */
    Denoise_frame_pass2(avg2, avg);
    Sharpen_frame(avg2, avg2);
    Copy_frame(dfrm, avg2, W, H, W, H);
  } else {
    /* test modes: output one of the intermediate frames instead */
    switch (test_mode) {
      case 1:
        Copy_frame(dfrm, refer, W, H, W, H);
        break;
      case 2:
        Copy_frame(dfrm, ref, W, H, W, H);
        break;
      case 3:
        Copy_frame(dfrm, avg, W, H, W, H);
        break;
      case 5:
        Copy_frame(dfrm, tmp, W, H, W, H);
        break;
      default:
      case 4:
        Denoise_frame_pass2(avg2, avg);
        Sharpen_frame(avg2, avg2);
        Copy_frame(dfrm, avg2, W, H, W, H);
        break;
    }
    /* masked planes are overwritten with a constant (16 for Y,
     * 128 for U/V) */
    if (ymask)
      memset(dfrm[Y], 16, size_y);
    if (umask)
      memset(dfrm[U], 128, size_uv);
    if (vmask)
      memset(dfrm[V], 128, size_uv);
  }

//  Copy_frame(dfrm, sub2ref, W, H, W/2, H/2);
//  Copy_frame(dfrm, sub4ref, W, H, W/4, H/4);
  if (Denoise_uv_flag) {
    /* remember this frame's input chroma and swap the current/previous
     * de-flicker state buffers for the next frame */
    memcpy(pre_v, refer[V], size_uv);
    memcpy(pre_u, refer[U], size_uv);
    tp = dif_v[0]; dif_v[0] = dif_v[1]; dif_v[1] = tp;
    tp = dif_u[0]; dif_u[0] = dif_u[1]; dif_u[1] = tp;
    tp = flg_v[0]; flg_v[0] = flg_v[1]; flg_v[1] = tp;
    tp = flg_u[0]; flg_u[0] = flg_u[1]; flg_u[1] = tp;
  }

  emms();
}

/*
 * Allocate and lay out all working buffers for a width x height frame,
 * seed the history frames from the first input frame and select the
 * implementation (C or MMXEXT) of every processing routine.
 *
 * src:    first input frame; read as a Y plane followed by two
 *         quarter-size planes at offs_v and offs_u.
 * width,
 * height: luma dimensions of the frame.
 *
 * All buffers are obtained through mem_realloc() on file-level
 * pointers.  NOTE(review): mem_realloc() is declared elsewhere
 * (presumably util.h); its failure behaviour and the meaning of its
 * third argument are not visible here.
 *
 * Full-size frames are allocated with `border` extra lines above and
 * below each plane, so the plane pointers stored in tfrm/avg/avg2/tmp
 * (and the subsampled frames) point past a leading border area.
 */
static void
Initialize(uint8_t *src, int width, int height)
{
  /* bytes for one 4:2:0 frame (12 bits per pixel) including the
   * top and bottom border lines */
  int size = width * (height + border*2) * 12 / 8;
  int y0_offset;
  int uv0_offset;
  int sub2size;
  int sub4size;
  int vect_num;
  int W, H, W2, H2;
  int buf_size;
  int c;
#ifdef ARCH_X86
  uint32_t cpu_accel;
  int x;
#endif /* ARCH_X86 */

  /* one motion vector per 8x8 luma block */
  vect_col = width / 8;
  vect_lin = height / 8;
  vect_num = vect_col * vect_lin;
  vect_map =(VECTOR*)mem_realloc(vect_map,sizeof(VECTOR)*vect_num, NULL);

  Width = width;
  Height = height;
  Width2 = width / 2;
  Height2 = height / 2;
  Width4 = width / 4;
  Height4 = height / 4;

  W = Width;
  H = Height;
  W2 = Width/2;
  H2 = Height/2;

  size_y = width * height;
  size_uv = size_y / 4;
  offs_v = width * height;
  offs_u = offs_v + (offs_v / 4);

  /* chroma-sized buffers: one plane plus `border` extra lines; each
   * allocation holds two such planes back to back, and the working
   * pointers start W2*border/2 elements into their plane */
  buf_size = W2 * (border + H2);
  prev_uv = mem_realloc(prev_uv, buf_size*2, NULL);
  pre_v = prev_uv + W2 * border / 2;
  pre_u = prev_uv + buf_size + W2 * border / 2;
  flag_v = (short*) mem_realloc(flag_v, (buf_size*2) * sizeof(short), NULL);
  flag_u = (short*) mem_realloc(flag_u, (buf_size*2) * sizeof(short), NULL);
  diff_v = (short*) mem_realloc(diff_v, (buf_size*2) * sizeof(short), NULL);
  diff_u = (short*) mem_realloc(diff_u, (buf_size*2) * sizeof(short), NULL);
  /* [0] and [1] are the two halves of each state buffer; they are
   * swapped after every frame in Denoise_frame() */
  flg_v[0] = flg_v[1] = flag_v;
  flg_v[0] += W2*border/2;
  flg_v[1] += buf_size + W2*border/2;
  flg_u[0] = flg_u[1] = flag_u;
  flg_u[0] += W2*border/2;
  flg_u[1] += buf_size + W2*border/2;

  dif_v[0] = dif_v[1] = diff_v;
  dif_v[0] += W2*border/2;
  dif_v[1] += buf_size + W2*border/2;
  dif_u[0] = dif_u[1] = diff_u;
  dif_u[0] += W2*border/2;
  dif_u[1] += buf_size + W2*border/2;

  /* chroma work frames used by the de-flicker / blur stage */
  bufp[8] = mem_realloc(bufp[8], buf_size*2, NULL);
  uv_frm[V] = bufp[8] + W2*border/2;
  uv_frm[U] = bufp[8] + buf_size + W2*border/2;

  bufp[9] = mem_realloc(bufp[9], buf_size*2, NULL);
  uv_tfrm[V] = bufp[9] + W2*border/2;
  uv_tfrm[U] = bufp[9] + buf_size + W2*border/2;

  /* clear the whole de-flicker state */
  for (c = 0; c < (buf_size*2); c++) {
    flag_v[c] = 0;
    flag_u[c] = 0;
    diff_v[c] = 0;
    diff_u[c] = 0;
  }

  /* bufp[0..3]: full-size frames; [4..5]: half size; [6..7]: quarter size */
  bufp[0] = mem_realloc(bufp[0], size, NULL);
  bufp[1] = mem_realloc(bufp[1], size, NULL);
  bufp[2] = mem_realloc(bufp[2], size, NULL);
  bufp[3] = mem_realloc(bufp[3], size, NULL);
  bufp[4] = mem_realloc(bufp[4], size/4, NULL);
  bufp[5] = mem_realloc(bufp[5], size/4, NULL);
  bufp[6] = mem_realloc(bufp[6], size/16, NULL);
  bufp[7] = mem_realloc(bufp[7], size/16, NULL);

  /* offsets that skip the leading border lines of each plane */
  y0_offset = width * border;
  uv0_offset = (width/2) * (border/2);

  /* within a frame buffer: Y occupies the first 4/6, V starts at 4/6
   * and U at 5/6 of the buffer size */
  tfrm[Y] = bufp[0] + y0_offset;
  tfrm[V] = bufp[0] + (size / 6 * 4) + uv0_offset;
  tfrm[U] = bufp[0] + (size / 6 * 5) + uv0_offset;

  avg[Y] = bufp[1] + y0_offset;
  avg[V] = bufp[1] + (size / 6 * 4) + uv0_offset;
  avg[U] = bufp[1] + (size / 6 * 5) + uv0_offset;

  avg2[Y] = bufp[2] + y0_offset;
  avg2[V] = bufp[2] + (size / 6 * 4) + uv0_offset;
  avg2[U] = bufp[2] + (size / 6 * 5) + uv0_offset;

  tmp[Y] = bufp[3] + y0_offset;
  tmp[V] = bufp[3] + (size / 6 * 4) + uv0_offset;
  tmp[U] = bufp[3] + (size / 6 * 5) + uv0_offset;

  /* same layout at half resolution */
  sub2size = size / 4;
  y0_offset = (width/2) * (border/2);
  uv0_offset = (width/4) * (border/4);

  sub2ref[Y] = bufp[4] + y0_offset;
  sub2ref[V] = bufp[4] + (sub2size / 6 * 4) + uv0_offset;
  sub2ref[U] = bufp[4] + (sub2size / 6 * 5) + uv0_offset;

  sub2avg[Y] = bufp[5] + y0_offset;
  sub2avg[V] = bufp[5] + (sub2size / 6 * 4) + uv0_offset;
  sub2avg[U] = bufp[5] + (sub2size / 6 * 5) + uv0_offset;

  /* same layout at quarter resolution */
  sub4size = size / 4 / 4;
  y0_offset = (width/4) * (border/4);
  uv0_offset = (width/8) * (border/8);

  sub4ref[Y] = bufp[6] + y0_offset;
  sub4ref[V] = bufp[6] + (sub4size / 6 * 4) + uv0_offset;
  sub4ref[U] = bufp[6] + (sub4size / 6 * 5) + uv0_offset;

  sub4avg[Y] = bufp[7] + y0_offset;
  sub4avg[V] = bufp[7] + (sub4size / 6 * 4) + uv0_offset;
  sub4avg[U] = bufp[7] + (sub4size / 6 * 5) + uv0_offset;

  refer[Y] = src;
  refer[V] = src + offs_v;
  refer[U] = src + offs_u;

  /* seed every history frame with the first input frame */
  Copy_frame(avg, refer, W, H, W, H);
  Copy_frame(avg2, refer, W, H, W, H);
  Copy_frame(tmp, refer, W, H, W, H);

  memcpy(pre_v, src+offs_v, size_uv);
  memcpy(pre_u, src+offs_u, size_uv);

#ifdef ARCH_X86
  /* pick the MMXEXT routines when the CPU reports support for them */
  cpu_accel = mm_accel();
  if (cpu_accel & MM_ACCEL_X86_MMXEXT) {
    Calc_SAD = Calc_SAD_mmxext;
    Calc_SAD_uv = Calc_SAD_uv_mmxext;
    Calc_SAD_half = Calc_SAD_half_mmxext;
    Low_contrast_block = Low_contrast_block_mmxext;
    Subsample_frame = Subsample_frame_mmxext;
    Sharpen_frame = Sharpen_frame_mmxext;
    UV_deflick_block = UV_deflick_block_mmxext;
    Blur_frame = Blur_frame_mmxext;
    Mv_blocks = Mv_blocks_mmxext;
    Avg_frame = Avg_frame_mmxext;
    Correct_frame2 = Correct_frame2_mmxext;
    Denoise_frame_pass2 = Denoise_frame_pass2_mmxext;
  } else {
    Calc_SAD = Calc_SAD_c;
    Calc_SAD_uv = Calc_SAD_uv_c;
    Calc_SAD_half = Calc_SAD_half_c;
    Low_contrast_block = Low_contrast_block_c;
    Subsample_frame = Subsample_frame_c;
    Sharpen_frame = Sharpen_frame_c;
    UV_deflick_block = UV_deflick_block_c;
    Blur_frame = Blur_frame_c;
    Mv_blocks = Mv_blocks_c;
    Avg_frame = Avg_frame_c;
    Correct_frame2 = Correct_frame2_c;
    Denoise_frame_pass2 = Denoise_frame_pass2_c;
  }

  /* replicate the scalar parameters into all four 16-bit lanes of the
   * packed constants (presumably consumed by the MMX routines, which
   * are outside this part of the file) */
  for (x = 0; x < 4; x++) {
    low_threshold.w[x] = 2*Threshold/3;
  }
  for (x = 0; x < 4; x++) {
    sharp.w[x] = Sharpen;
  }
  for (x = 0; x < 4; x++) {
    thre_uv.w[x] = Threshold_uv;
  }
  for (x = 0; x < 4; x++) {
    mdelay.w[x] = (Delay << 8);
  }
  for (x = 0; x < 4; x++) {
    thre.w[x] = Threshold;
  }
#if 0
  for (x = 0; x < 4; x++) {
    per_thre.uw[x] = WEIGHTONE/Threshold;
  }
#endif
  for (x = 0; x < 4; x++) {
    per_thre.w[x] = 256/Threshold;
  }
  for (x = 0; x < 4; x++) {
    thre_pp.w[x] = Threshold_pp;
  }
  for (x = 0; x < 4; x++) {
    per_thre_pp.w[x] = 256/Threshold_pp;
  }
#else
  /* no x86 acceleration compiled in: always use the C routines */
  Calc_SAD = Calc_SAD_c;
  Calc_SAD_uv = Calc_SAD_uv_c;
  Calc_SAD_half = Calc_SAD_half_c;
  Low_contrast_block = Low_contrast_block_c;
  Subsample_frame = Subsample_frame_c;
  Sharpen_frame = Sharpen_frame_c;
  UV_deflick_block = UV_deflick_block_c;
  Blur_frame = Blur_frame_c;
  Mv_blocks = Mv_blocks_c;
  Avg_frame = Avg_frame_c;
  Correct_frame2 = Correct_frame2_c;
  Denoise_frame_pass2 = Denoise_frame_pass2_c;
#endif /* ARCH_X86 */
}

/*
 * Denoise one planar 4:2:0 frame.
 *
 * On the first call after denoise_init() the working buffers are set
 * up from this frame.  The input and output plane pointers are then
 * derived from the two flat buffers (Y at 0, then the planes at offs_v
 * and offs_u) and the frame pipeline is run.
 *
 * dest: output frame buffer.
 * src:  input frame buffer.
 */
static void
Denoise(uint8_t *dest, uint8_t *src)
{
  if (Uninitialized) {
    const int w = Width;
    const int h = Height;

    Uninitialized = 0;
    Initialize(src, w, h);
  }

  /* input planes */
  refer[Y] = src;
  refer[V] = src + offs_v;
  refer[U] = src + offs_u;

  /* output planes */
  dfrm[Y] = dest;
  dfrm[V] = dest + offs_v;
  dfrm[U] = dest + offs_u;

  Denoise_frame();
}

/*
 * Public entry point: denoise one frame from src into dest.
 *
 * Only the planar 4:2:0 colour spaces (YV12, YUV420P, I420) are
 * processed.  For any other colour space a message is printed to
 * stderr and the frame is copied unchanged, using 24 bits per pixel
 * for RGB24 and 12 bits per pixel otherwise.
 *
 * Returns 0 when the frame was denoised, -1 when it was only copied.
 */
int
denoise(uint8_t *dest, uint8_t *src)
{
  int bits_per_pixel;

  if (csp == CSP_YV12 || csp == CSP_YUV420P || csp == CSP_I420) {
    Denoise(dest, src);
    return 0;
  }

  bits_per_pixel = (csp == CSP_RGB24) ? 24 : 12;
  fprintf(stderr, "denoise: not implement\n");
  memcpy(dest, src, Width * Height * bits_per_pixel / 8);
  return -1;
}

/* Clamp a user-supplied parameter into the range [1, hi]. */
static int
Denoise_clamp_param(int value, int hi)
{
  if (value <= 0)
    return 1;
  if (value > hi)
    return hi;
  return value;
}

/*
 * Set the denoiser parameters.
 *
 * Every numeric parameter is clamped to [1, max] before it is stored:
 *   radius       -> Radius        (max 16)
 *   threshold    -> Threshold     (max 255)
 *   pp_threshold -> Threshold_pp  (max 255)
 *   delay        -> Delay         (max 300)
 *   sharpen      -> Sharpen       (max 255)
 *   uv_threshold -> Threshold_uv  (max 255)
 * denoise_uv is treated as a boolean and stored in Denoise_uv_flag.
 *
 * The effective (clamped) values are printed to stdout.  Previously the
 * sharpen line printed the raw argument instead of the stored value.
 *
 * Always returns 0.
 */
int
denoise_set_param(int radius, int threshold, int pp_threshold, int delay, int sharpen, int denoise_uv, int uv_threshold)
{
  Radius          = Denoise_clamp_param(radius, 16);
  Threshold       = Denoise_clamp_param(threshold, 255);
  Threshold_pp    = Denoise_clamp_param(pp_threshold, 255);
  Delay           = Denoise_clamp_param(delay, 300);
  Sharpen         = Denoise_clamp_param(sharpen, 255);
  Denoise_uv_flag = denoise_uv ? 1 : 0;
  Threshold_uv    = Denoise_clamp_param(uv_threshold, 255);

  printf("denoise radius: %d\n", Radius);
  printf("denoise threshold: %d\n", Threshold);
  printf("denoise ppthreshold: %d\n", Threshold_pp);
  printf("denoise delay: %d\n", Delay);
  printf("denoise sharpen: %d\n", Sharpen);
  if (Denoise_uv_flag) {
    printf("denoise uv: on\n");
    printf("denoise uvthreshold: %d\n", Threshold_uv);
  } else {
    printf("denoise uv: off\n");
  }
  return 0;
}

/*
 * Configure the debugging/test output from an option string.
 *
 * NULL switches test mode off.  Otherwise each character is examined:
 *   '1'..'5'  select the test mode (the last digit wins);
 *   y/Y, u/U, v/V  mark that plane as visible.
 * Planes that were not named are masked in the output.  If no plane
 * was named at all, no plane is masked.  Other characters are ignored,
 * and a string without a digit leaves the current test mode untouched.
 *
 * Always returns 0.
 */
int
denoise_set_test(const char *str)
{
  const char *p;

  if (str == NULL) {
    test_mode = 0;
    return 0;
  }

  /* start with everything masked; named planes get unmasked */
  ymask = 1;
  umask = 1;
  vmask = 1;

  for (p = str; *p != '\0'; p++) {
    switch (*p) {
      case '1': test_mode = 1; break;
      case '2': test_mode = 2; break;
      case '3': test_mode = 3; break;
      case '4': test_mode = 4; break;
      case '5': test_mode = 5; break;
      case 'y': case 'Y': ymask = 0; break;
      case 'u': case 'U': umask = 0; break;
      case 'v': case 'V': vmask = 0; break;
      default: break;
    }
  }

  /* no plane named: show all of them */
  if (ymask && umask && vmask) {
    ymask = 0;
    umask = 0;
    vmask = 0;
  }
  return 0;
}

/*
 * Prepare the denoiser for frames of the given size and colour space.
 *
 * The dimensions are stored first, then validated: both must be
 * positive and the colour space must be one of YV12, YUV420P or I420.
 * On success the colour space is recorded and the denoiser is flagged
 * as uninitialized, so the buffers are (re)built on the next frame.
 *
 * Returns 0 on success, -1 (after a message on stderr) on a bad size
 * or unsupported colour space.
 */
int
denoise_init(int width, int height, int color_space)
{
  const int supported = (color_space == CSP_YV12 ||
                         color_space == CSP_YUV420P ||
                         color_space == CSP_I420);

  Width = width;
  Height = height;

  if (width <= 0 || height <= 0) {
    fprintf(stderr, "denoise_init: width height invalid, width %d, height %d\n",
            width, height);
    return -1;
  }

  if (!supported) {
    fprintf(stderr, "denoise_init: request unsupported color space, support color space is YV12 only.\n");
    return -1;
  }

  csp = color_space;
  Uninitialized = 1;
  return 0;
}

