/* denoise_deinterlace.c (C) nejik 2003 */

/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 * References:
 * mjpegtools    (http://mjpeg.sourceforge.net/)
 * mjpegtools-1.6.1/yuvdenoise/deinterlace.c
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <inttypes.h>

#include "attributes.h"
#include "mmx.h"
#include "denoise_deinterlace.h"

void 
denoise_deinterlace_noaccel(uint8_t *dest, uint8_t *src, int width, int height)
{
  unsigned int d=0;
  unsigned int min;
  register int x;
  register int y;
  register int xx;
  register int i;
  int xpos;
  int l1;
  int l2;
  int lumadiff = 0;
  uint8_t line_buf[width]; 
  uint8_t *line = line_buf;
  int H = height;
  int W = width;

//  printf("denoise deinterlace noaccel\n");

  /* Go through the frame by every two lines */
  for (y = 0, H -= 2; y < H ; y += 2)
    {

      memcpy(dest + y * W, src + y * W, W);

      /* Go through each line by a "block" length of 32 pixels */
      for (x = 0; x < W ; x += 8)
        {
          /* search best matching block of pixels in other field line */
          min = 65535;
          xpos = 0;
          /* search in the range of +/- 8 pixels offset in the line */

          for (xx = -8; xx < 8; xx++)
            {
              d = 0;
              /* Calculate SAD */
              for (i = -8; i < 16; i++)
                {
                  /* to avoid blocking in ramps we analyse the best match on */
                  /* two lines ... */
                  d += (int) abs (*(src + (x + i) + y * W) -
                                  *(src + (x + xx + i) + (y + 1) * W));
                  d += (int) abs (*(src + (x + i) + (y + 2) * W) -
                                  *(src + (x + xx + i) + (y + 1) * W));
                }


              /* if SAD reaches a minimum store the position */
              if (min > d)
                {
                  min = d;
                  xpos = xx;
                  
                  l1 = l2 = 0;
                  for (i = 0; i < 8; i++)
                    {
                      l1 += *(src + (x + i) + y * W);
                      l2 += *(src + (x + i + xpos) + (y + 1) * W);
                    }
                  l1 /= 8;
                  l2 /= 8;
                  lumadiff = abs (l1 - l2);
                  lumadiff = (lumadiff < 8) ? 0 : 1;
                }
            }

          /* copy pixel-block into the line-buffer */

          /* if lumadiff is small take the fields block, if not */
          /* take the other fields block */

          if (lumadiff || min > (12 * 24))
            for (i = 0; i < 8; i++) /* average pixels :( */
              {
                *(line + x + i) =
                  (*(src + (x + i) + ((y) * W)) >>1) +
                  (*(src + (x + i) + ((y + 2) * W))>>1) + 1;
              }
          else
            for (i = 0; i < 8; i++) /* move block :) */
              {
                *(line + x + i) =
                  (*(src + (x + i + xpos) + ((y + 1) * W)) >>1) +
                  (*(src + (x + i) + ((y + 0) * W)) >> 1) + 1;
              }
        }

      /* copy the line-buffer into the source-line */
      for (i = 0; i < W; i++)
        *(dest + i + (y + 1) * W) = *(line + i);
    }
    memcpy(dest + y * W, src + y * W, W);
    y++;
    memcpy(dest + y * W, src + y * W, W);
}

#ifdef ARCH_X86
void 
denoise_deinterlace_mmx(uint8_t *dest, uint8_t *src, int width, int height)
{
  uint32_t d=0;
//  uint16_t a[4]={0,0,0,0};
  unsigned int min;
  register int x;
  register int y;
  int xx;
  int i;
  int xpos;
  int l1;
  int l2;
  int lumadiff = 0;
  uint8_t line_buf[width]; 
  uint8_t *line = line_buf;
  uint8_t *ref1;
  uint8_t *ref2;
  uint8_t *ref3;
  int W = width;
  int H = height;
  int calc_start;
  int calc_num;
  
//  printf("denoise deinterlace mmx\n");

  /* Go through the frame by every two lines */
  for (y = 0, H -= 2; y < H ; y += 2)
    {

      if (y == 0) {
        calc_start = 0;
        calc_num = 1;
      } else {
        calc_start = -8;
        calc_num = 3;
      }

      memcpy(dest + y * W, src + y * W, W);

      /* Go through each line by a "block" length of 32 pixels */
      for (x = 0; x < W ; x += 8)
        {
          /* search best matching block of pixels in other field line */
          min = 65535;
          xpos = 0;
          /* search in the range of +/- 8 pixels offset in the line */

          for (xx = -8; xx < 8; xx++)
            {
              
              ref1=src+x+y*W + calc_start;      /* not displaced */
              ref2=src+x+y*W+W*2+calc_start; /* not displaced two lines below */
              ref3=src+x+y*W+W+xx + calc_start; /* displaced one line below */
              
	      pxor_r2r(mm0, mm0);
	      pxor_r2r(mm7, mm7);

	      for (i = 0; i < calc_num; i++) {
	      movq_m2r(*ref1, mm1);
	      movq_m2r(*ref3, mm2);
	      movq_r2r(mm2, mm4);
	      movq_r2r(mm2, mm3);
	      psubusb_r2r(mm1, mm3);
	      psubusb_r2r(mm2, mm1);
	      paddusb_r2r(mm3, mm1);
	      movq_r2r(mm1, mm2);
	      punpcklbw_r2r(mm7, mm2);
	      punpckhbw_r2r(mm7, mm1);
	      paddusw_r2r(mm1, mm0);
	      paddusw_r2r(mm2, mm0);
	      movq_m2r(*ref2, mm1);
	      //movq_m2r(*ref3, mm2);
	      movq_r2r(mm4, mm2);
	      movq_r2r(mm2, mm3);
	      psubusb_r2r(mm1, mm3);
	      psubusb_r2r(mm2, mm1);
	      paddusb_r2r(mm3, mm1);
	      movq_r2r(mm1, mm2);
	      punpcklbw_r2r(mm7, mm2);
	      punpckhbw_r2r(mm7, mm1);
	      paddusw_r2r(mm1, mm0);
	      paddusw_r2r(mm2, mm0);

	      ref1 += 8;
	      ref2 += 8;
	      ref3 += 8;
	      }

	      movq_r2r(mm0, mm1);
	      punpcklwd_r2r(mm7, mm0);
	      punpcklwd_r2r(mm7, mm1);
	      paddd_r2r(mm1, mm0);
	      movq_r2r(mm0, mm1);
	      psrlq_i2r(32, mm1);
	      paddd_r2r(mm1, mm0);
	      movd_r2m(mm0, d);

#if 0
              asm volatile (
                  " pxor        %%mm0 , %%mm0;           /* clear mm0                                          */\n"
                  " pxor        %%mm7 , %%mm7;           /* clear mm7                                          */\n"
                  "                                      /*                                                    */\n"
                  " movl         %1    , %%eax;          /* load frameadress into eax                          */\n"
                  " movl         %2    , %%ebx;          /* load frameadress into ebx                          */\n"
                  " movl         %3    , %%ecx;          /* load frameadress into ecx                          */\n"
                  "                                      /*                                                    */\n"
                  " .rept 3                              /* repeat 3 times                                     */\n"
                  " movq        (%%eax), %%mm1;          /* 8 Pixels from line                                 */\n"
                  " movq        (%%ecx), %%mm2;          /* 8 Pixels from displaced                            */\n"
                  " movq         %%mm2 , %%mm3;          /* hold a copy of mm2 in mm3                          */\n"
                  " psubusb      %%mm1 , %%mm3;          /* positive differences between mm2 and mm1           */\n"
                  " psubusb      %%mm2 , %%mm1;          /* positive differences between mm1 and mm3           */\n"
                  " paddusb      %%mm3 , %%mm1;          /* mm1 now contains abs(mm1-mm2)                      */\n"
                  " movq         %%mm1 , %%mm2;          /* copy mm1 to mm2                                    */\n"
                  " punpcklbw    %%mm7 , %%mm1;          /* unpack mm1 into mm1 and mm2                        */\n"
                  " punpckhbw    %%mm7 , %%mm2;          /*                                                    */\n"
                  " paddusw      %%mm1 , %%mm0;          /* add mm1 (stored in mm1 and mm2...)                 */\n"
                  " paddusw      %%mm2 , %%mm0;          /* to mm0                                             */\n"
                  "                                      /*                                                    */\n"
                  " movq        (%%ebx), %%mm1;          /* 8 Pixels from line                                 */\n"
                  " movq        (%%ecx), %%mm2;          /* 8 Pixels from displaced line                       */\n"
                  " movq         %%mm2 , %%mm3;          /* hold a copy of mm2 in mm3                          */\n"
                  " psubusb      %%mm1 , %%mm3;          /* positive differences between mm2 and mm1           */\n"
                  " psubusb      %%mm2 , %%mm1;          /* positive differences between mm1 and mm3           */\n"
                  " paddusb      %%mm3 , %%mm1;          /* mm1 now contains abs(mm1-mm2)                      */\n"
                  " movq         %%mm1 , %%mm2;          /* copy mm1 to mm2                                    */\n"
                  " punpcklbw    %%mm7 , %%mm1;          /* unpack mm1 into mm1 and mm2                        */\n"
                  " punpckhbw    %%mm7 , %%mm2;          /*                                                    */\n"
                  " paddusw      %%mm1 , %%mm0;          /* add mm1 (stored in mm1 and mm2...)                 */\n"
                  " paddusw      %%mm2 , %%mm0;          /* to mm0                                             */\n"
                  " addl         $8    , %%eax;          /* add 8 to frameaddress                              */\n"
                  " addl         $8    , %%ebx;          /* add 8 to frameaddress                              */\n"
                  " addl         $8    , %%ecx;          /* add 8 to frameaddress                              */\n"
                  " .endr                                /* end loop                                           */\n"
                  "                                      /*                                                    */\n"
                  " movq         %%mm0 , %0   ;          /* make mm0 available to gcc ...                      */\n"
                  " emms                      ;          /* clear mmx state                                    */\n"
                  :"=m" (a)
                  :"m" (ref1), "m" (ref2), "m" (ref3)
                  :"%eax", "%ebx", "%ecx"
                );
              
              d=a[0]+a[1]+a[2]+a[3];
#endif
              
              /* if SAD reaches a minimum store the position */
              if (min > d)
                {
                  min = d;
                  xpos = xx;
                  
                  l1 = l2 = 0;
                  for (i = 0; i < 8; i++)
                    {
                      l1 += *(src + (x + i) + y * W);
                      l2 += *(src + (x + i + xpos) + (y + 1) * W);
                    }
                  l1 /= 8;
                  l2 /= 8;
                  lumadiff = abs (l1 - l2);
                  lumadiff = (lumadiff < 8) ? 0 : 1;
                }
            }

          /* copy pixel-block into the line-buffer */

          /* if lumadiff is small take the fields block, if not */
          /* take the other fields block */

          if (lumadiff || min > (12 * 24))
            for (i = 0; i < 8; i++) /* average pixels :( */
              {
                *(line + x + i) =
                  (*(src + (x + i) + ((y) * W)) >>1) +
                  (*(src + (x + i) + ((y + 2) * W))>>1) + 1;
              }
          else
            for (i = 0; i < 8; i++) /* move block :) */
              {
                *(line + x + i) =
                  (*(src + (x + i + xpos) + ((y + 1) * W)) >>1) +
                  (*(src + (x + i) + ((y + 0) * W)) >> 1) + 1;
              }
        }

      /* copy the line-buffer into the source-line */
      for (i = 0; i < W; i++)
        *(dest + i + (y + 1) * W) = *(line + i);

    }
    emms();
    memcpy(dest + y * W, src + y * W, W);
    y++;
    for (i = 0; i < W; i++)
      *(dest + i + y * W) = (*(src + i + ((y - 1) * W)) >>1) +
                            (*(src + i +  (y * W)) >> 1);
}

void 
denoise_deinterlace_mmxext(uint8_t *dest, uint8_t *src, int width, int height)
{
  uint32_t d=0;
  unsigned int min;
  register int x;
  register int y;
  int xx;
  int i;
  int xpos;
  int l1;
  int l2;
  int lumadiff = 0;
  uint8_t line_buf[width]; 
  uint8_t *line = line_buf;
  uint8_t *ref1;
  uint8_t *ref2;
  uint8_t *ref3;
  int W = width;
  int H = height;
  int calc_start;
  int calc_num;
  
  /* Go through the frame by every two lines */
  for (y = 0, H -= 2; y < H ; y += 2)
    {

      if (y == 0) {
        calc_start = 0;
        calc_num = 1;
      } else {
        calc_start = -8;
        calc_num = 3;
      }

      memcpy(dest + y * W, src + y * W, W);

      /* Go through each line by a "block" length of 32 pixels */
      for (x = 0; x < W ; x += 8)
        {
          /* search best matching block of pixels in other field line */
          min = 65535;
          xpos = 0;
          /* search in the range of +/- 8 pixels offset in the line */

          for (xx = -8; xx < 8; xx++)
            {
              
              ref1=src+x+y*W + calc_start;      /* not displaced */
              ref2=src+x+y*W+W*2+calc_start; /* not displaced two lines below */
              ref3=src+x+y*W+W+xx + calc_start; /* displaced one line below */
              
	      pxor_r2r(mm0, mm0);

	      for (i = 0; i < calc_num; i++) {
	      movq_m2r(*ref3, mm1);
	      movq_r2r(mm1, mm2);
	      psadbw_m2r(*ref1, mm1);
	      psadbw_m2r(*ref2, mm2);
	      paddd_r2r(mm1, mm0);
	      paddd_r2r(mm2, mm0);

	      ref1 += 8;
	      ref2 += 8;
	      ref3 += 8;
	      }

	      movd_r2m(mm0, d);

              /* if SAD reaches a minimum store the position */
              if (min > d)
                {
                  min = d;
                  xpos = xx;
                  
                  l1 = l2 = 0;
                  for (i = 0; i < 8; i++)
                    {
                      l1 += *(src + (x + i) + y * W);
                      l2 += *(src + (x + i + xpos) + (y + 1) * W);
                    }
                  l1 /= 8;
                  l2 /= 8;
                  lumadiff = abs (l1 - l2);
                  lumadiff = (lumadiff < 8) ? 0 : 1;
                }
            }

          /* copy pixel-block into the line-buffer */

          /* if lumadiff is small take the fields block, if not */
          /* take the other fields block */

          if (lumadiff || min > (12 * 24))
            for (i = 0; i < 8; i++) /* average pixels :( */
              {
                *(line + x + i) =
                  (*(src + (x + i) + ((y) * W)) >>1) +
                  (*(src + (x + i) + ((y + 2) * W))>>1) + 1;
              }
          else
            for (i = 0; i < 8; i++) /* move block :) */
              {
                *(line + x + i) =
                  (*(src + (x + i + xpos) + ((y + 1) * W)) >>1) +
                  (*(src + (x + i) + ((y + 0) * W)) >> 1) + 1;
              }
        }

      /* copy the line-buffer into the source-line */
      for (i = 0; i < W; i++)
        *(dest + i + (y + 1) * W) = *(line + i);

    }
    emms();
    memcpy(dest + y * W, src + y * W, W);
    y++;
    for (i = 0; i < W; i++)
      *(dest + i + y * W) = (*(src + i + ((y - 1) * W)) >>1) +
                            (*(src + i +  (y * W)) >> 1);
}

#endif

