///////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	DiacriticalNormalize.c
  Purpose:	A library function which converts a full PG "8-bit ASCII"
  		word into the likely PG "7-bit ASCII" form.
  Mods:		11/17/01 RSB	Began.
*/

///////////////////////////////////////////////////////////////////////////

/*
  This code is used to "normalize" an 8-bit ASCII string
  (representing HTML 4.0 characters) by removing diacritical
  marks and replacing ligatures.  This is needed in order
  to perform inexact searches of words from 7-bit ASCII
  PG etexts against the exact 8-bit forms found in the 
  wordlists. 
*/
#include <ctype.h>
#include "libGutenSpell.h"

//------------------------------------------------------------
// Plain-ASCII stand-ins for the character codes 192 through
// 255, indexed by (code - 192).  These are the substitutions
// most likely to have been made in the PG etexts, so that
// wordlist entries can be matched both in their exact form
// and in their replacement form.
//
// An entry of 0 means the table supplies no replacement for
// that code.  That is the case for the sz ligature (223) and
// the AE/ae ligatures (198/230), which the normalization
// code expands itself, and also for codes 215 and 247.
static const char ReplacementChars[] = {
  /* 192-199 */ 'A', 'A', 'A', 'A', 'A', 'A', 0, 'C',
  /* 200-207 */ 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
  /* 208-215 */ 'D', 'N', 'O', 'O', 'O', 'O', 'O', 0,
  /* 216-223 */ 'O', 'U', 'U', 'U', 'U', 'Y', 'P', 0,
  /* 224-231 */ 'a', 'a', 'a', 'a', 'a', 'a', 0, 'c',
  /* 232-239 */ 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
  /* 240-247 */ 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0,
  /* 248-255 */ 'o', 'u', 'u', 'u', 'u', 'y', 'p', 'y'
};

//------------------------------------------------------------
// Lower-case conversion for a single character.  Handles the
// ASCII letters A-Z and the upper-case codes 192 through 222,
// with the exception of code 215, which is left alone.  Any
// other character is returned unchanged.

char
DiacriticalTolower (unsigned char c)
{
  int ascii_upper = (c >= 'A' && c <= 'Z');
  int high_upper = (c >= 192 && c <= 222 && c != 215);

  if (!ascii_upper && !high_upper)
    return (c);

  // In both ranges the lower-case form differs only by bit 0x20.
  return (c | 0x20);
}

//------------------------------------------------------------
// Upper-case conversion for a single character.  Handles the
// ASCII letters a-z and the lower-case codes 224 through 254,
// with the exception of code 247, which is left alone.  Any
// other character (including code 255) is returned unchanged.

char
DiacriticalToupper (unsigned char c)
{
  int ascii_lower = (c >= 'a' && c <= 'z');
  int high_lower = (c >= 224 && c <= 254 && c != 247);

  // In both ranges the upper-case form is obtained by clearing
  // bit 0x20.
  if (ascii_lower || high_lower)
    return (c & ~0x20);

  return (c);
}

//----------------------------------------------------------
// Reports whether a string is already in upper case, with
// accented characters taken into account.  Returns 1 when
// DiacriticalToupper leaves every character of s unchanged
// (so characters it does not convert never disqualify a
// string, and an empty string yields 1), and 0 as soon as a
// character is found that it would alter.

int
IsStrupr (const char *s)
{
  const char *p;

  for (p = s; *p != '\0'; p++)
    {
      char upper = DiacriticalToupper (*p);

      if (upper != *p)
        return (0);
    }
  return (1);
}

//----------------------------------------------------------
// In-place lower-casing of a nul-terminated string.  Each
// character is passed through DiacriticalTolower.

void
DiacriticalStrlwr (char *s)
{
  char *p = s;

  while (*p != '\0')
    {
      *p = DiacriticalTolower (*p);
      p++;
    }
}

//----------------------------------------------------------
// In-place upper-casing of a nul-terminated string.  Each
// character is passed through DiacriticalToupper.

void
DiacriticalStrupr (char *s)
{
  char *p = s;

  while (*p != '\0')
    {
      *p = DiacriticalToupper (*p);
      p++;
    }
}

//------------------------------------------------------------
// Helper for DiacriticalNormalize:  stores ch at *cursor and
// advances the cursor, provided the cursor has not yet
// reached limit.  Returns 1 on success, or 0 (writing
// nothing) if the output buffer is already full.

static int
NormalizePut (char **cursor, const char *limit, char ch)
{
  if (*cursor >= limit)
    return (0);
  *(*cursor)++ = ch;
  return (1);
}

//------------------------------------------------------------
// Here's the normalization function itself.  It copies sin
// to sout, dropping soft hyphens, expanding the sz, ae and AE
// ligatures to two letters, and replacing the remaining
// characters in the range 192-255 by their ReplacementChars
// entries.  ASCII letters, apostrophes and hyphens are copied
// unchanged.
//
// The outlen parameter gives the amount of storage that has
// been allocated for the output string, including room for
// the terminating nul.
//
// Returns the length of the normalized string, or 0 on
// error.  Errors are:  a null sin or sout, a non-positive
// outlen, an output buffer too small for the result plus its
// terminator, or an input character that is not handled (any
// other character below 192, or a character whose table entry
// is 0).  On error the output is NOT nul-terminated and may
// hold a partial result.  Note that 0 is also returned,
// with an empty output string, when the input is empty or
// consists only of soft hyphens.

int
DiacriticalNormalize (const char *sin, char *sout, int outlen)
{
  char *send, *start;
  unsigned char c;

  // Reject unusable arguments before doing any pointer
  // arithmetic; sout + outlen is only meaningful for a real
  // buffer of positive size.
  if (!sin || !sout || outlen <= 0)
    return (0);

  start = sout;
  send = sout + outlen;
  for (; *sin; sin++)
    {
      c = (unsigned char) *sin;

      // Remove soft hyphens.
      if (c == 173)
        continue;

      // Take care of sz ligature.
      else if (c == 223)
        {
          if (!NormalizePut (&sout, send, 's')
              || !NormalizePut (&sout, send, 's'))
            return (0);
        }

      // Take care of ae ligature.
      else if (c == 230)
        {
          if (!NormalizePut (&sout, send, 'a')
              || !NormalizePut (&sout, send, 'e'))
            return (0);
        }

      // Take care of AE ligature.  It makes equal sense to
      // convert this as AE or as Ae.  I believe the latter
      // is what more people would do.
      else if (c == 198)
        {
          if (!NormalizePut (&sout, send, 'A')
              || !NormalizePut (&sout, send, 'e'))
            return (0);
        }

      // Take care of regular characters.
      else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
               || c == '\'' || c == '-')
        {
          if (!NormalizePut (&sout, send, c))
            return (0);
        }

      // Take care of diacriticals.  A table entry of 0 means
      // there is no replacement, which is an error here.
      else if (c >= 192)
        {
          c = ReplacementChars[c - 192];
          if (!c || !NormalizePut (&sout, send, c))
            return (0);
        }

      // Unknown character!
      else
        return (0);
    }

  // All done.  Terminate and return length, provided there
  // is still room for the terminator.
  if (sout >= send)
    return (0);
  *sout = 0;
  return (int) (sout - start);
}

//-----------------------------------------------------------------
// A test main program, built only when TESTMAIN_NORMALIZE is
// defined.  It reads whitespace-delimited words from standard
// input and prints each one together with the return value of
// DiacriticalNormalize and, when that value is non-zero, the
// normalized form.

#ifdef TESTMAIN_NORMALIZE
#include <stdio.h>
char s[1000], ss[1000];
int i;
int
main (void)
{
  // The field width keeps scanf from writing past the end of
  // s:  at most 999 characters plus the terminating nul.
  while (1 == scanf ("%999s", s))
    {
      i = DiacriticalNormalize (s, ss, sizeof (ss));
      if (i)
	printf ("%d \"%s\" -> \"%s\"\n", i, s, ss);
      else
	printf ("%d \"%s\"\n", i, s);
    }
  return (0);
}
#endif
