/* deinterlace.c (C) nejik 2003 */

/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *
 * Referenced:
 * xaw-deinterlace
 * http://xaw-deinterlace.sourceforge.net/
 *
 * Copyright:
 * Conrad Kreyling <conrad@conrad.nerdland.org>
 * Patrick Barrett <yebyen@nerdland.org>
 * This is licenced under the GNU GPL until someone tells me I'm stealing code
 * and can't do that ;) www.gnu.org for any version of the license.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>

#ifdef ARCH_X86
#include <attributes.h>
#include <mmx.h>
#include <mm_accel.h>
#endif

#include "csp.h"
#include "util.h"
#include "deinterlace.h"
#include "denoise_deinterlace.h"

/*
 * Frame geometry and colour-space layout shared by every deinterlacer in
 * this file.  deinterlace_init() fills these in; the filter routines only
 * read them.
 */

/* Image width in pixels, as passed to deinterlace_init(). */
static int width;
/* Image height in lines, as passed to deinterlace_init(). */
static int height;
/* Bytes of pixel data in one line (width for planar YUV, width * 3 or 4 for RGB). */
static int bytes_per_line;
/* Byte distance between the starts of consecutive source lines. */
static int stride_bytes_per_line;
/* Bits per pixel: 12 for the YUV420P/I420/YV12 cases, 24 or 32 for RGB. */
static int depth;
/* Non-zero when the colour space has separate chroma planes after the luma. */
static int uv = 0;
/* Byte offset of the V plane from the start of the frame. */
static int v_offset = 0;
/* Byte offset of the U plane from the start of the frame. */
static int u_offset = 0;
/* Size in bytes of one chroma plane (width * height / 4). */
static int uv_size = 0;
/* When set, linear_blend copies the chroma planes instead of filtering them. */
static int y_only = 0;
/* When set (and chroma planes exist), linear_blend copies the luma plane unfiltered. */
static int uv_only = 0;

/* Forward declaration so the default below can refer to it. */
static void linear_blend (uint8_t *dest_buf, uint8_t *image);

/* Currently selected deinterlace routine; linear_blend until configured otherwise. */
DeinterlaceFunc deinterlace = linear_blend;

/* Scalar line copy used by the non-MMX routines. */
#define load_line memcpy

/*
 * Enable or disable luma-only filtering.
 *
 * Any non-zero flag turns the mode on.  Turning it on clears the
 * chroma-only mode, since the two are mutually exclusive.
 * Returns the resulting y_only setting (0 or 1).
 */
int
deinterlace_set_y_only(int flag)
{
  y_only = (flag != 0);
  if (y_only == 1)
    uv_only = 0;
  return y_only;
}

/*
 * Enable or disable chroma-only filtering.
 *
 * Any non-zero flag turns the mode on.  Turning it on clears the
 * luma-only mode, since the two are mutually exclusive.
 * Returns the resulting uv_only setting (0 or 1).
 */
int
deinterlace_set_uv_only(int flag)
{
  uv_only = (flag != 0);
  if (uv_only == 1)
    y_only = 0;
  return uv_only;
}

#ifdef ARCH_X86

/*
 * Copy one line through mm0, eight bytes per step.
 *
 * The loop always moves whole 8-byte groups, so a size that is not a
 * multiple of 8 is rounded up.  A non-temporal store (movntq) was tried
 * here at some point and is left disabled in favour of a plain movq.
 */
static inline void
load_line_mmx(uint8_t *dest, uint8_t *src, int size)
{
  int remaining = size;

  while (remaining > 0) {
    movq_m2r (*src, mm0);
    movq_r2m (mm0, *dest);
    src += 8;
    dest += 8;
    remaining -= 8;
  }
}

/*
 * Blend three lines into dest, eight bytes per step.
 *
 * Each group is the average of line1 with the average of line0 and
 * line2, computed with two pavgb instructions.  Whole 8-byte groups are
 * always processed, so a size that is not a multiple of 8 is rounded up.
 */
static inline void
linear_blend_line_mmx(uint8_t *dest, uint8_t *line0, uint8_t *line1, uint8_t *line2, int size)
{
  int remaining;

  for (remaining = size; remaining > 0; remaining -= 8) {
    movq_m2r (*line0, mm0);        /* mm0 = line0 */
    movq_m2r (*line2, mm1);        /* mm1 = line2 */
    pavgb_r2r (mm1, mm0);          /* mm0 = avg(line0, line2) */
    movq_m2r (*line1, mm2);        /* mm2 = line1 */
    pavgb_r2r (mm2, mm0);          /* mm0 = avg(line1, mm0) */
    movq_r2m (mm0, *dest);         /* store the blended group */
    dest += 8;
    line0 += 8;
    line1 += 8;
    line2 += 8;
  }
}

/*
 * Blend one plane with the MMX three-line filter.
 *
 * Reads `rows` lines of `size` bytes from *src_p (consecutive lines are
 * `stride` bytes apart) and writes `rows` blended lines, packed `size`
 * bytes apart, to *dest_p.  The first output line blends (row0, row0,
 * row1) and the last blends (row n-2, row n-1, row n-1).  Both pointers
 * are advanced past the plane.  Empty planes are skipped untouched.
 */
static void
linear_blend_plane_mmx(uint8_t **dest_p, uint8_t **src_p, int size, int stride, int rows)
{
  if (size <= 0 || rows <= 0)
    return;

  /* load_line_mmx() stores whole 8-byte groups, so pad each scratch row. */
  int padded = (size + 7) & ~7;
  uint8_t buf[3][padded];
  uint8_t *line[3] = { buf[0], buf[1], buf[2] };
  uint8_t *line_tmp;
  uint8_t *src = *src_p;
  uint8_t *dest = *dest_p;
  int i;

  /* Prime the window with the first row in both trailing slots. */
  load_line_mmx(line[1], src, size);
  load_line_mmx(line[2], src, size);
  src += stride;

  for (i = 1; i < rows; i++) {
    /* Rotate the window and pull in the next source row. */
    line_tmp = line[0];
    line[0] = line[1];
    line[1] = line[2];
    line[2] = line_tmp;
    load_line_mmx(line[2], src, size);
    src += stride;
    linear_blend_line_mmx(dest, line[0], line[1], line[2], size);
    dest += size;
  }
  /* Bottom edge: the last row stands in for the missing row below it. */
  linear_blend_line_mmx(dest, line[1], line[2], line[2], size);
  dest += size;

  *src_p = src;
  *dest_p = dest;
}

/*
 * MMX linear-blend deinterlacer.
 *
 * dest_buf receives the filtered frame, image is the source frame.
 * For packed formats the whole image is filtered as one plane.  For
 * planar YUV the luma plane is filtered unless uv_only is set (then it
 * is copied), and the two half-size chroma planes are filtered unless
 * y_only is set (then they are copied).
 *
 * NOTE(review): the MMX line helpers touch memory in 8-byte groups, so
 * line sizes that are not a multiple of 8 still read/write a few bytes
 * past each line in image/dest_buf - confirm callers only pass such
 * widths.
 */
static void
linear_blend_mmx(uint8_t *dest_buf, uint8_t *image)
{
  int size = bytes_per_line;
  int item_num = height;
  int stride = stride_bytes_per_line;
  int uv_count;
  int chroma_bytes;
  uint8_t *src = image;
  uint8_t *dest = dest_buf;

  if (!uv || !uv_only) {
    linear_blend_plane_mmx(&dest, &src, size, stride, item_num);
  } else {
    memcpy(dest, src, size * item_num);
    src += stride * item_num;
    dest += size * item_num;
  }

  if (uv) {
    if (!y_only) {
      for (uv_count = 2; uv_count > 0; uv_count--)
        linear_blend_plane_mmx(&dest, &src, size >> 1, stride >> 1, item_num >> 1);
    } else {
      /*
       * The plane order differs between colour spaces (U first for I420,
       * V first for YV12), so take the distance between the two offsets
       * rather than u_offset - v_offset, which goes negative for I420 and
       * would turn into a huge memcpy length.
       */
      chroma_bytes = u_offset > v_offset ? u_offset - v_offset : v_offset - u_offset;
      memcpy(dest_buf + v_offset, image + v_offset, chroma_bytes);
      memcpy(dest_buf + u_offset, image + u_offset, chroma_bytes);
    }
  }

  emms();
}

/*
 * Average two lines into dest, eight bytes per step, using pavgb.
 *
 * Whole 8-byte groups are always processed, so a size that is not a
 * multiple of 8 is rounded up.
 */
static inline void
bilinear_line_mmx(uint8_t *dest, uint8_t *line0, uint8_t *line1, int size)
{
  int remaining;

  for (remaining = size; remaining > 0; remaining -= 8) {
    movq_m2r (*line0, mm0);        /* mm0 = upper line */
    movq_m2r (*line1, mm1);        /* mm1 = lower line */
    pavgb_r2r (mm1, mm0);          /* mm0 = avg(upper, lower) */
    movq_r2m (mm0, *dest);         /* store the averaged group */
    dest += 8;
    line0 += 8;
    line1 += 8;
  }
}

/*
 * MMX bilinear deinterlacer.
 *
 * Keeps every second source line (rows 0, 2, 4, ...) and replaces each
 * row in between with the average of the kept rows above and below it.
 * The row below the last kept line has no lower neighbour, so it is
 * filled with a copy of that line.  Chroma planes, when present, are
 * copied unchanged.
 *
 * dest_buf receives the result, image is the source frame.
 */
static void
bilinear_mmx(uint8_t *dest_buf, uint8_t *image)
{
  int i;
  int size = bytes_per_line;
  int field_lines = height / 2;
  int stride = stride_bytes_per_line * 2;   /* step between kept rows */
  int chroma_bytes;
  uint8_t *src0 = image;
  uint8_t *src1 = image + stride;
  uint8_t *dest = dest_buf;

  for (i = 0; i < field_lines - 1; i++) {
    load_line_mmx(dest, src0, size);
    dest += size;
    bilinear_line_mmx(dest, src0, src1, size);
    src0 += stride;
    src1 += stride;
    dest += size;
  }
  load_line_mmx(dest, src0, size);
  if (field_lines > 0) {
    /* The original left this final output row unwritten. */
    dest += size;
    load_line_mmx(dest, src0, size);
  }

  if (uv) {
    /*
     * Use the distance between the plane offsets: u_offset - v_offset is
     * negative when the U plane comes first (I420) and would become a
     * huge memcpy length.
     */
    chroma_bytes = u_offset > v_offset ? u_offset - v_offset : v_offset - u_offset;
    memcpy(dest_buf + v_offset, image + v_offset, chroma_bytes);
    memcpy(dest_buf + u_offset, image + u_offset, chroma_bytes);
  }

  emms();
}

/*
 * MMX denoising deinterlacer: hand the frame to denoise_deinterlace_mmx()
 * and then copy the V and U planes across unchanged.
 */
static void
denoise_deint_mmx(uint8_t *dest, uint8_t *image)
{
  int plane;

  denoise_deinterlace_mmx(dest, image, width, height);

  const int chroma_off[2] = { v_offset, u_offset };
  for (plane = 0; plane < 2; plane++)
    memcpy(dest + chroma_off[plane], image + chroma_off[plane], uv_size);
}

/*
 * MMXEXT denoising deinterlacer: hand the frame to
 * denoise_deinterlace_mmxext() and then copy the V and U planes across
 * unchanged.
 */
static void
denoise_deint_mmxext(uint8_t *dest, uint8_t *image)
{
  int plane;

  denoise_deinterlace_mmxext(dest, image, width, height);

  const int chroma_off[2] = { v_offset, u_offset };
  for (plane = 0; plane < 2; plane++)
    memcpy(dest + chroma_off[plane], image + chroma_off[plane], uv_size);
}

#endif /* ARCH_X86 */

/*
 * Blend three lines into dest with weights 1:2:1.
 *
 * For each of the first `size` bytes:
 *   dest = (line0 + 2 * line1 + line2) / 4   (truncating)
 */
static inline void
linear_blend_line(uint8_t *dest, uint8_t *line0, uint8_t *line1, uint8_t *line2, int size)
{
  int x = 0;

  while (x < size) {
    const int outer = line0[x] + line2[x];
    const int centre = line1[x] * 2;

    dest[x] = (outer + centre) >> 2;
    x++;
  }
}

/*
 * Average two lines into dest.
 *
 * For each of the first `size` bytes:
 *   dest = (line0 + line1) / 2   (truncating)
 */
static inline void
bilinear_line (uint8_t *dest, uint8_t *line0, uint8_t *line1, int size)
{
  int x = 0;

  while (x < size) {
    const int sum = line0[x] + line1[x];

    dest[x] = sum >> 1;
    x++;
  }
}


/*
 * Blend one plane with the 1:2:1 three-line filter.
 *
 * Reads `rows` lines of `size` bytes from *src_p (consecutive lines are
 * `stride` bytes apart) and writes `rows` blended lines, packed `size`
 * bytes apart, to *dest_p.  The first output line blends (row0, row0,
 * row1) and the last blends (row n-2, row n-1, row n-1).  Both pointers
 * are advanced past the plane.  Empty planes are skipped untouched.
 */
static void
linear_blend_plane (uint8_t **dest_p, uint8_t **src_p, int size, int stride, int rows)
{
  if (size <= 0 || rows <= 0)
    return;

  uint8_t buf[3][size];
  uint8_t *line[3] = { buf[0], buf[1], buf[2] };
  uint8_t *line_tmp;
  uint8_t *src = *src_p;
  uint8_t *dest = *dest_p;
  int i;

  /* Prime the window with the first row in both trailing slots. */
  load_line(line[1], src, size);
  load_line(line[2], src, size);
  src += stride;

  for (i = 1; i < rows; i++) {
    /* Rotate the window and pull in the next source row. */
    line_tmp = line[0];
    line[0] = line[1];
    line[1] = line[2];
    line[2] = line_tmp;
    load_line(line[2], src, size);
    src += stride;
    linear_blend_line(dest, line[0], line[1], line[2], size);
    dest += size;
  }
  /* Bottom edge: the last row stands in for the missing row below it. */
  linear_blend_line(dest, line[1], line[2], line[2], size);
  dest += size;

  *src_p = src;
  *dest_p = dest;
}

/*
 * Linear-blend deinterlacer (scalar version).
 *
 * dest_buf receives the filtered frame, image is the source frame.
 * For packed formats the whole image is filtered as one plane.  For
 * planar YUV the luma plane is filtered unless uv_only is set (then it
 * is copied), and the two half-size chroma planes are filtered unless
 * y_only is set (then they are copied).
 */
static void
linear_blend (uint8_t *dest_buf, uint8_t *image)
{
  int size = bytes_per_line;
  int item_num = height;
  int stride = stride_bytes_per_line;
  int uv_count;
  int chroma_bytes;
  uint8_t *src = image;
  uint8_t *dest = dest_buf;

  if (!uv || !uv_only) {
    linear_blend_plane(&dest, &src, size, stride, item_num);
  } else {
    memcpy(dest, src, size * item_num);
    src += stride * item_num;
    dest += size * item_num;
  }

  if (uv) {
    if (!y_only) {
      for (uv_count = 2; uv_count > 0; uv_count--)
        linear_blend_plane(&dest, &src, size >> 1, stride >> 1, item_num >> 1);
    } else {
      /*
       * The plane order differs between colour spaces (U first for I420,
       * V first for YV12), so take the distance between the two offsets
       * rather than u_offset - v_offset, which goes negative for I420 and
       * would turn into a huge memcpy length.
       */
      chroma_bytes = u_offset > v_offset ? u_offset - v_offset : v_offset - u_offset;
      memcpy(dest_buf + v_offset, image + v_offset, chroma_bytes);
      memcpy(dest_buf + u_offset, image + u_offset, chroma_bytes);
    }
  }
}

/*
 * Bilinear deinterlacer (scalar version).
 *
 * Keeps every second source line (rows 0, 2, 4, ...) and replaces each
 * row in between with the average of the kept rows above and below it.
 * The row below the last kept line has no lower neighbour, so it is
 * filled with a copy of that line.  Chroma planes, when present, are
 * copied unchanged.
 *
 * dest_buf receives the result, image is the source frame.
 */
static void
bilinear (uint8_t *dest_buf, uint8_t *image)
{
  int i;
  int size = bytes_per_line;
  int field_lines = height / 2;
  int stride = stride_bytes_per_line * 2;   /* step between kept rows */
  int chroma_bytes;
  uint8_t *src0 = image;
  uint8_t *src1 = image + stride;
  uint8_t *dest = dest_buf;

  for (i = 0; i < field_lines - 1; i++) {
    load_line(dest, src0, size);
    dest += size;
    bilinear_line(dest, src0, src1, size);
    src0 += stride;
    src1 += stride;
    dest += size;
  }
  load_line(dest, src0, size);
  if (field_lines > 0) {
    /* The original left this final output row unwritten. */
    dest += size;
    load_line(dest, src0, size);
  }

  if (uv) {
    /*
     * Use the distance between the plane offsets: u_offset - v_offset is
     * negative when the U plane comes first (I420) and would become a
     * huge memcpy length.
     */
    chroma_bytes = u_offset > v_offset ? u_offset - v_offset : v_offset - u_offset;
    memcpy(dest_buf + v_offset, image + v_offset, chroma_bytes);
    memcpy(dest_buf + u_offset, image + u_offset, chroma_bytes);
  }
}

/*
 * Line-doubling deinterlacer.
 *
 * Keeps every second source line (rows 0, 2, 4, ...) and writes each one
 * twice, so height / 2 kept lines produce 2 * (height / 2) output rows.
 * Chroma planes, when present, are copied unchanged.
 *
 * dest_buf receives the result, image is the source frame.
 */
static void
linedoubling (uint8_t *dest_buf, uint8_t *image)
{
  int i;
  int size = bytes_per_line;
  int item_num = height / 2;
  int src_stride = stride_bytes_per_line * 2;
  int dest_stride = size;
  int chroma_bytes;
  uint8_t *src = image;
  uint8_t *dest = dest_buf;

  for (i = 0; i < item_num; i++) {
    load_line(dest, src, size);
    dest += dest_stride;
    load_line(dest, src, size);
    dest += dest_stride;
    src += src_stride;
  }

  if (uv) {
    /*
     * Use the distance between the plane offsets: u_offset - v_offset is
     * negative when the U plane comes first (I420) and would become a
     * huge memcpy length.
     */
    chroma_bytes = u_offset > v_offset ? u_offset - v_offset : v_offset - u_offset;
    memcpy(dest_buf + v_offset, image + v_offset, chroma_bytes);
    memcpy(dest_buf + u_offset, image + u_offset, chroma_bytes);
  }
}

/*
 * Denoising deinterlacer (scalar version): hand the frame to
 * denoise_deinterlace_noaccel() and then copy the V and U planes across
 * unchanged.
 */
static void
denoise_deint(uint8_t *dest, uint8_t *image)
{
  int plane;

  denoise_deinterlace_noaccel(dest, image, width, height);

  const int chroma_off[2] = { v_offset, u_offset };
  for (plane = 0; plane < 2; plane++)
    memcpy(dest + chroma_off[plane], image + chroma_off[plane], uv_size);
}

/* ------------------------------------------------------------------ */

/* One selectable deinterlace algorithm: its name, numeric id and scalar routine. */
typedef struct {
  const char *name;
  int id;
  DeinterlaceFunc func;
} DEINTERLACE_TYPE;

/*
 * Table of available algorithms, searched by name or by id.
 * A "cubic" entry was sketched at some point but is not provided.
 */
static DEINTERLACE_TYPE types[] = {
  { .name = "bilinear",     .id = DEINTERLACE_BILINEAR,     .func = bilinear },
  { .name = "linear_blend", .id = DEINTERLACE_LINEAR_BLEND, .func = linear_blend },
  { .name = "linedoubling", .id = DEINTERLACE_LINEDOUBLING, .func = linedoubling },
  { .name = "yuvdenoise",   .id = DEINTERLACE_DENOISE,      .func = denoise_deint },
};

/* Number of entries in types[]. */
int num_types = sizeof types / sizeof types[0];

#ifdef ARCH_X86
/*
 * Pick the accelerated routine for type_id, if the CPU supports one.
 *
 * Returns the accelerated function, or the currently selected
 * `deinterlace` routine when no suitable acceleration is available.
 *
 * bilinear_mmx and linear_blend_mmx are built on pavgb, which belongs to
 * the MMX extensions rather than base MMX, so they are only handed out
 * when MM_ACCEL_X86_MMXEXT is reported.  Returning them on an MMX-only
 * CPU would execute an unsupported instruction.
 */
DeinterlaceFunc
deinterlace_accel(int type_id)
{
  uint32_t cpu_accel = mm_accel();

  switch (type_id) {
    case DEINTERLACE_BILINEAR:
      if (cpu_accel & MM_ACCEL_X86_MMXEXT)
        return bilinear_mmx;
      break;
    case DEINTERLACE_LINEAR_BLEND:
      if (cpu_accel & MM_ACCEL_X86_MMXEXT)
        return linear_blend_mmx;
      break;
    case DEINTERLACE_DENOISE:
      if (cpu_accel & MM_ACCEL_X86_MMXEXT)
        return denoise_deint_mmxext;
      if (cpu_accel & MM_ACCEL_X86_MMX)
        return denoise_deint_mmx;
      break;
    default:
      /* No accelerated variant for this type. */
      break;
  }
  return deinterlace;
}
#endif

const char *
deinterlace_get_type(void)
{
  int i;
  const char *type = "NOT FOUND";
  DeinterlaceFunc func = deinterlace;

#ifdef ARCH_X86
  if (func == bilinear_mmx)
    func = bilinear;
  else if (func == linear_blend_mmx)
    func = linear_blend_mmx;
  else if (func == denoise_deint_mmx || func == denoise_deint_mmxext)
    func = denoise_deint;
#endif

  for (i=0; i<num_types; i++) {
    if (func == types[i].func) {
      return types[i].name;
    }
  }
  return type;
}

/*
 * Look up a deinterlace algorithm and make it the current one.
 *
 * type     algorithm name, or NULL to search by *type_id instead.
 * type_id  in/out: read when type is NULL; on a successful name lookup it
 *          receives the matching id.  May be NULL, in which case lookup by
 *          id is impossible and no id is reported back.
 *
 * Returns the routine found, or NULL.  The global `deinterlace` is set to
 * the same value, so it is NULL after a failed lookup and the caller must
 * install a fallback.  On failure the known type names are listed on
 * stderr.
 */
DeinterlaceFunc
deinterlace_find_type (const char *type, int *type_id)
{
  int i;

  deinterlace = NULL;
  for (i = 0; i < num_types; i++) {
    if (type) {
      if (strcmp (types[i].name, type) == 0) {
        deinterlace = types[i].func;
        if (type_id)
          *type_id = types[i].id;
        break;
      }
    } else if (type_id && *type_id == types[i].id) {
      deinterlace = types[i].func;
      break;
    }
  }

  if (deinterlace == NULL) {
    /* Passing NULL to %s is undefined, so substitute a printable marker. */
    fprintf (stderr, "deinterlace_find_type: can't find type %s\n",
             type ? type : "(null)");
    fprintf (stderr, "deinterlace types are ");
    for (i = 0; i < num_types; i++)
      fprintf (stderr, "%s'%s'", i ? "," : "", types[i].name);
    fprintf (stderr, "\n");
  }
  return deinterlace;
}

/*
 * Configure the deinterlacer for a frame size, colour space and algorithm.
 *
 * image_width, image_height  frame size in pixels / lines.
 * type                       algorithm name, or NULL to select by type_id.
 * type_id                    algorithm id, used when type is NULL.
 * in_csp                     colour space (CSP_YUV420P, CSP_I420, CSP_YV12,
 *                            CSP_RGB24 or CSP_RGB32).
 *
 * Returns 0 on success, -1 for an unknown colour space or when the
 * denoise algorithm is requested for a colour space without chroma
 * planes.  An unknown algorithm is not an error: linear_blend is used.
 */
int
deinterlace_init (int image_width, int image_height, const char *type, int type_id, int in_csp)
{
  int src_stride = image_width;

  width = image_width;
  height = image_height;

  /* Start from a clean layout so RGB never sees stale chroma settings. */
  uv = 0;
  u_offset = 0;
  v_offset = 0;
  uv_size = 0;

  switch (in_csp) {
    case CSP_YUV420P:
    case CSP_I420:
      /* Y plane, then U, then V. */
      depth = 12;
      bytes_per_line = width;
      stride_bytes_per_line = src_stride;
      uv = 1;
      u_offset = width * height;
      v_offset = u_offset + (u_offset / 4);
      uv_size = width * height / 4;
      break;
    case CSP_YV12:
      /* Y plane, then V, then U. */
      depth = 12;
      bytes_per_line = width;
      stride_bytes_per_line = src_stride;
      uv = 1;
      v_offset = width * height;
      u_offset = v_offset + (v_offset / 4);
      uv_size = width * height / 4;
      break;
    case CSP_RGB24:
      depth = 24;
      bytes_per_line = width * ((depth+7)/8);
      stride_bytes_per_line = src_stride * 3;
      break;
    case CSP_RGB32:
      depth = 32;
      bytes_per_line = width * ((depth+7)/8);
      stride_bytes_per_line = src_stride * 4;
      break;
    default:
      fprintf (stderr, "deinterlace_init: unknown color space\n");
      return -1;
  }

  deinterlace = deinterlace_find_type (type, &type_id);

  /*
   * The denoise routines are given only width and height and then copy
   * separate chroma planes, so they cannot handle any packed RGB format
   * (the original only rejected RGB24 and let RGB32 through).
   */
  if (deinterlace == denoise_deint && !uv) {
    fprintf(stderr, "deinterlace_init: deinterlace type %s cannot be performed with RGB color spaces.\n", deinterlace_get_type());
    return -1;
  }

  if (deinterlace == NULL) {
    fprintf(stderr, "deinterlace_init: deinterlace type %s not found use linear_blend\n",
            type ? type : "(null)");
    /*
     * Look the fallback up by name so type_id is updated as well;
     * otherwise deinterlace_accel() below would act on the stale id and
     * could select a different algorithm.
     */
    deinterlace = deinterlace_find_type ("linear_blend", &type_id);
    if (deinterlace == NULL)
      deinterlace = linear_blend;
  }
#ifdef ARCH_X86
  deinterlace = deinterlace_accel(type_id);
#endif
  return 0;
}

