///////////////////////////////////////////////////////////////////////////// 
/* 
  Copyright 2001-2 Ronald S. Burkey 
 
  This file is part of GutenMark. 
 
  GutenMark is free software; you can redistribute it and/or modify 
  it under the terms of the GNU General Public License as published by 
  the Free Software Foundation; either version 2 of the License, or 
  (at your option) any later version. 
 
  GutenMark is distributed in the hope that it will be useful, 
  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  GNU General Public License for more details. 
 
  You should have received a copy of the GNU General Public License 
  along with GutenMark; if not, write to the Free Software 
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 
  Filename:	MarkByChar.c 
  Purpose:	This analyzes and marks up a line by character and word. 
  Mods:		12/15/01 RSB	Split this off from the MarkBody function,
  				which had become incredibly complex and
				hard-to-read. 
		12/16/01 RSB	Now remove all soft spaces from the beginnings
				of lines.  Began adding all of the new
				italicizing styles I found on the PG
				newsgroup.
		08/03/02 RSB	Added Rev. and Messrs. and Gen. to the list of 
				honorifics.
		08/09/02 RSB	Attempted to add the --caps-ok switch.  Added
				'[' as a legitimate delimiter italics-starts.
				Also, ALL-CAPS is automatically turned off
				after enough explicit emphasizing is found.
		08/26/02 RSB	For elimination of ALL-CAPS, increased the
				count from 5 to 10.  Also, eliminated things
				like "_I_" from the count.
		11/20/02 RSB	Fixed smart-quotes for cases like this:
					... and then I says"--he 
					paused--"you don't know ...
				Fixed smart-singlequotes so that known cases
				like 'em (for "them"), 'til (for "until"), 
				and so on are treated properly as left-quotes.	
		11/22/02 RSB	Partially fixed the number of spaces in
				constructs like "I am here!" John said, and 
				"Am I here?" John asked.
		11/24/02 RSB	Added Hon. to the honorifics.			
*/

// Styles of italicization are indicated by the Status.Italicizing variable,  
// as follows: 
//      0       none.
//      1       All caps. 
//      2       HTML -- like <I>hello</I>. 
//      3       Like <hello>
//      ... and others, not necessarily in this order:
//              Like _hello_ 
//              Like /hello/ 
//              Like ~~hello~~ 
//              Like *hello*            12/16/01
//              Like ~hello~            12/16/01
//              Like _/hello/_          12/16/01
//              Like _*hello*_          12/16/01
//              Like */hello/*          12/16/01
//              Like _*/hello/*_        12/16/01 
//              Like /:hello:/          12/16/01
//              Like |:hello:|          12/16/01
//      
// As might be imagined, there's some difficulty distinguishing between  
// the styles <I>hello</I> and <hello>.  To this end, a variable called  
// BracketItalicsCount is kept; initially, the program guesses which style  
// is which, but after enough of one style or the other is encountered,  
// as recorded by BracketItalicsCount, the style is locked in. 

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "AutoMark.h"

// Abbreviations that may precede a capitalized name ("Mr. Smith"), and
// which therefore must not be mistaken for the end of a sentence.
// Each entry is stored BACKWARD, because the matching code walks
// leftward from the character just before the period.  Single-letter
// abbreviations are not listed; initials are handled separately by the
// caller.  Besides true honorifics, the list holds abbreviated given
// names found in older texts (Jas., Chas., Ed., ...).
static char *Honorifics[] = {
  "rM", "RM",			// Mr
  "srM", "SRM",			// Mrs
  "rD", "RD",			// Dr
  "srD", "SRD",			// Drs
  "tS", "TS",			// St
  "etS", "ETS",			// Ste
  "arF", "ARF",			// Fra
  "rF", "RF",			// Fr
  "emM", "EMM",			// Mme
  "sM", "SM",			// Ms
  "emdM", "EMDM",		// Mdme
  "ellM", "ELLM",		// Mlle
  "snoM", "SNOM",		// Mons
  "sahC", "SAHC",		// Chas
  "saJ", "SAJ",			// Jas
  "dE", "DE",			// Ed
  "sirhC", "SIRHC",		// Chris
  "oeG", "OEG",			// Geo
  "derF", "DERF",		// Fred
  "mW", "MW",			// Wm
  "veR",			// Rev
  "srsseM",			// Messrs
  "neG",			// Gen
  "noH"				// Hon
};

// Strings to auto-italicize.  Note that adding "PS." to the list has the 
// happy side effect of not adding spaces to it in end-of-sentence detection. 
//
// Every entry MUST be followed by a comma: adjacent string literals are
// silently concatenated by the compiler, which would merge two entries
// into one string that never matches and shrink NUM_AUTO_ITALICIZED.
static char *Italicized[] = {
  "Etc.", "etc.", "viz.", "ie.", "i.e.", "Ie.", "I.e.", "eg.",
  "e.g.", "Eg.", "E.g.", "et al.", "et cetera", "n.b.", "N.b.",
  "nota bene", "Nota bene", /* "M.", */ "Ste.", "Mme.", "Mdme.",
  "Mlle.", "Mons.", "PS.", "PPS."
};
// Number of entries in Italicized[].
#define NUM_AUTO_ITALICIZED (sizeof(Italicized)/sizeof(Italicized[0]))
// strlen() of each Italicized[] entry, filled in lazily on first use.
static int LengthItalicized[NUM_AUTO_ITALICIZED];
// Nonzero once the cached length tables have been filled in.
static int LengthsInitialized = 0;

// Contractions that are customarily written with a LEADING apostrophe.
// Each entry is the text that follows the apostrophe.
static char *SingleQuoteWords[] = {
  "em",				// 'em  (for "them")
  "til"				// 'til (for "until")
};

// Number of entries in SingleQuoteWords[].
#define NUM_SINGLE_QUOTE_WORDS \
  (sizeof (SingleQuoteWords) / sizeof (SingleQuoteWords[0]))

// strlen() of each SingleQuoteWords[] entry, filled in on first use.
static int LengthSingleQuoteWords[NUM_SINGLE_QUOTE_WORDS];

// Italicizing delimiters.
//
// One ItalicizingStyle describes a pair of delimiters that bracket
// emphasized text.  Start and End always have the same length, which
// is cached in Length.  Styles that are not delimiter-driven keep
// NULL strings and a zero length.
typedef struct
{
  int Length;			// Length of Start (and of End).
  char *Start;			// Opening delimiter, or NULL.
  char *End;			// Closing delimiter, or NULL.
}
ItalicizingStyle;

// Count of explicit (non-ALL-CAPS) italics after which ALL-CAPS
// conversion is switched off.
#define NO_ALLCAPS_TRIGGER 10

// Values of Status->Italicizing; each is also an index into
// ItalicizingStyles[].  Styles at ITALICS_UNNAMED and beyond have no
// symbolic name and are located by scanning the table.
#define ITALICS_NONE 0
#define ITALICS_ALLCAPS 1
#define ITALICS_HTML 2
#define ITALICS_BRACKET 3
#define ITALICS_UNNAMED 4

// The order of the rows is significant: the table is scanned from
// ITALICS_UNNAMED upward and the first matching Start wins.
ItalicizingStyle ItalicizingStyles[] = {
  {0, NULL, NULL},		// ITALICS_NONE
  {0, NULL, NULL},		// ITALICS_ALLCAPS
  {0, NULL, NULL},		// ITALICS_HTML        <I>hello</I>
  {1, "<", ">"},		// ITALICS_BRACKET     <hello>
  {1, "_", "_"},		// ITALICS_UNNAMED     _hello_
  {1, "/", "/"},		//                     /hello/
  {2, "~~", "~~"},		//                     ~~hello~~
  {1, "*", "*"},		//                     *hello*
  {1, "~", "~"},		//                     ~hello~
  {2, "_/", "/_"},		//                     _/hello/_
  {2, "_*", "*_"},		//                     _*hello*_
  {2, "*/", "/*"},		//                     */hello/*
  {3, "_*/", "/*_"},		//                     _*/hello/*_
  {2, "/:", ":/"},		//                     /:hello:/
  {2, "|:", ":|"}		//                     |:hello:|
};

// Number of rows in ItalicizingStyles[].
#define NUM_ITALICIZING_STYLES \
  (sizeof (ItalicizingStyles) / sizeof (ItalicizingStyles[0]))

//---------------------------------------------------------------------
// Detects a legitimate character to appear after an italics-start
// delimiter.
//
// Returns 0 if c is whitespace, or is punctuation other than one of
// [ ( " ' ; returns 1 for every other character (including NUL).

static int
AfterItalicsDelimiter (char c)
{
  // The <ctype.h> classifiers are only defined for EOF and for values
  // representable as unsigned char.  Plain char may be signed, so an
  // 8-bit character would otherwise be passed as a negative number.
  unsigned char uc = (unsigned char) c;

  if (isspace (uc))
    return (0);
  if (ispunct (uc) && c != '[' && c != '(' && c != '\"' && c != '\'')
    return (0);
  return (1);
}

//---------------------------------------------------------------------
// A utility function to determine if a string  
// (like an italicization delimiter) is at the front of a word.
// Returns zero if not, the length of the string if so.
// AbsoluteStart is the pointer to the array in which the word is
// embedded.  DelimStart is a pointer into this string, indicating
// the possible delimiter we're trying to test.  Target is the 
// delimiter we're trying to test it against.
//
// "At the front of a word" means all of the following:
//   - the text at DelimStart begins with Target;
//   - the character after the delimiter is acceptable to
//     AfterItalicsDelimiter;
//   - the delimiter is at the very start of the buffer, or is preceded
//     by whitespace or one of ' " ( - [ .

static int
AtWordStart (char *AbsoluteStart, char *DelimStart, char *Target)
{
  char *Delim;

  // Literal comparison against the delimiter.  A NUL in the text ends
  // the comparison with a mismatch, so we never read past the string.
  for (Delim = DelimStart; *Target; Target++, Delim++)
    if (*Target != *Delim)
      return (0);

  if (!AfterItalicsDelimiter (*Delim))
    return (0);

  // Only look at the preceding character when there is one; forming a
  // pointer in front of the buffer is not valid C.
  if (DelimStart != AbsoluteStart)
    {
      char Prior = DelimStart[-1];

      // Cast for <ctype.h>: plain char may be negative for 8-bit text.
      if (!isspace ((unsigned char) Prior)
	  && Prior != '\'' && Prior != '\"'
	  && Prior != '(' && Prior != '-' && Prior != '[')
	return (0);
    }

  return ((int) (Delim - DelimStart));
}

//---------------------------------------------------------------------
// Analyzes one line of text character by character and word by word,
// and records the resulting markup through AddMarkup.  In order, each
// character position is examined for:
//
//      - soft spaces at the beginning of the line (removed);
//      - words (ALL-CAPS emphasis, foreign words, 7-bit to 8-bit
//        restoration of diacriticals);
//      - "smart" double and single quotes;
//      - runs of dashes (converted to mdash/ndash);
//      - explicit italics delimiters (<I>..</I>, <..>, _.._, etc.);
//      - the fixed list of auto-italicized strings;
//      - a hanging '-' at the end of the line;
//      - places where an extra space is wanted (after a colon or at
//        the end of a sentence).
//
// Dataset      Supplies the wordlist (Dataset->Words) and the option
//              flags read here; it is also handed to AddMarkup.
//              Dataset->CapsOk may be SET here once enough explicit
//              italics have been seen.
// Status       State carried from call to call (current italicizing
//              style, in-word flag, foreignosity, backtracks, etc.).
//              Status->LineInfo[0] describes this line and
//              Status->LineInfo[1] the line after it.
// s            The NUL-terminated text of the line.
//
// Returns 0 on success, or 5 as soon as any AddMarkup call reports
// failure.
//
// NOTE(review): raw char values are passed to the <ctype.h> functions
// throughout.  That is only defined for non-negative values; confirm
// whether 8-bit input can reach this function on platforms where char
// is signed.

int
MarkByChar (AnalysisDataset * Dataset, MarkStatus * Status, char *s)
{
  unsigned long LastLanguages = 0;
  int AtLineBeginning = 1;
  char *ss, *sss, *siv, *sv;
  char Normalized[MAXWORDLENGTH], Full[MAXWORDLENGTH],
    TestNormalized[MAXWORDLENGTH], TestFull[MAXWORDLENGTH];
  int j = 0, k, n;

  // On the first call only, cache the lengths of the fixed strings
  // that are searched for below.
  if (!LengthsInitialized)
    {
      LengthsInitialized = 1;
      for (j = 0; j < NUM_AUTO_ITALICIZED; j++)
        LengthItalicized[j] = strlen (Italicized[j]);
      for (j = 0; j < NUM_SINGLE_QUOTE_WORDS; j++)
        LengthSingleQuoteWords[j] = strlen (SingleQuoteWords[j]);
    }

  if (Status->Italicizing == ITALICS_NONE)
    Status->WordItalicizing = 0;

  // ss is the current position.  Several of the cases below advance
  // ss themselves and then "continue", so that a multi-character
  // construct is consumed in a single pass of the loop.
  for (ss = s; *ss; ss++)
    {
      // We remove all soft spaces we find at the beginnings of lines.
      if (AtLineBeginning && isspace (*ss) && *ss != '\n')
        {
          if (AddMarkup
              (Dataset, Status->LineInfo[0].Offset + ss - s,
               MarkRemoveChar, 0))
            goto DiskError;
        }
      else
        AtLineBeginning = 0;

      // Remember the offset of the first space following non-space.
      if (isspace (*ss) && ss != s && !isspace (ss[-1]))
        Status->LastFirstSpace = Status->LineInfo[0].Offset + ss - s;

      // Status->SentenceStart is a helpful flag for determining if  
      // a word may be at the start of a sentence.  It's used 
      // ONLY for the all-caps italicization style. 
      Status->SentenceStart |= *ss == '.' || *ss == '?' || *ss == '!'
        || *ss == ':' || *ss == '\"';
      if (Status->SentenceStart)
        Status->NumBacktracks = Status->Foreignosity = 0;

      // Pick off words. 
      if (Status->InWord)
        {
          if (IsWordChar (*ss))
            {
              // Inside a word being italicized word-by-word: every
              // character except an apostrophe is marked for lower
              // case.
              if (Status->WordItalicizing && *ss != '\'')
                {
                  if (AddMarkup
                      (Dataset, Status->LineInfo[0].Offset + ss - s,
                       MarkTolower, 0))
                    goto DiskError;
                }
            }
          else
            {
              // The word has ended; close any word-level italics.
              Status->InWord = 0;
              if (Status->WordItalicizing
                  && Status->Italicizing != ITALICS_NONE)
                {
                  Status->WordItalicizing = Status->Italicizing =
                    ITALICS_NONE;
                  if (AddMarkup
                      (Dataset, Status->LineInfo[0].Offset + ss - s,
                       MarkEndItalics, 0))
                    goto DiskError;
                }
            }
        }

      // We don't allow any WORD-based processing in the Prefatory area, 
      // because it's pointless.   
      if (!Status->InWord && IsWordChar (*ss) && *ss != '\''
          && !Status->InPreface)
        {
          char *sss;
          int i;
          int Matched;

          // Locate the end of the word.  An apostrophe stays inside
          // the word only if the character after it is a word
          // character that is not flagged WORD_PUNCT.
          for (sss = ss + 1;; sss++)
            {
              if (!*sss)
                break;
              if (!IsWordChar (*sss))
                break;
              if (*sss == '\'')
                {
                  i = IsWordChar (sss[1]);
                  if (i == WORD_NOT || 0 != (i & WORD_PUNCT))
                    break;
                }
            }

          // Process the word.  One-character words just reset the
          // foreign-phrase tracking; words too long for the buffers
          // are skipped.
          if (sss == ss + 1)
            Status->NumBacktracks = Status->Foreignosity = 0;
          if (sss > ss + 1 && sss < ss + MAXWORDLENGTH)
            {
              Status->InWord = 1;
              strncpy (Full, ss, sss - ss);
              Full[sss - ss] = 0;
              if (DiacriticalNormalize (Full, Normalized, MAXWORDLENGTH))
                {

                  // Because various of the markups which the analyses 
                  // below want to trigger conflict with each other, 
                  // we can't just start the markups when the analyses 
                  // identify the need for them.  Instead, we have to 
                  // set some state variables instead, and then after 
                  // all of the analyses are done resolve the conflicts. 
                  // Only then can we apply markup. 
                  int WantToItalicize = 0;
                  int WantFirstCharLower = 0;
                  int WantFirstCharUpper = 0;
                  int WantRestOfWordLower = 0;
                  int Want8Bit = 0;

                  // Process ALL-CAPS words against the wordlist created 
                  // from the etext, for the purpose of converting to  
                  // lower case and italicizing.  This is only interesting 
                  // if we're not already italicizing.
                  if (Dataset->CapsOk || !IsStrupr (Full))
                    Status->FirstWordArea = 0;
                  else if (Status->Italicizing == ITALICS_NONE
                           && !Status->InHeader1 && !Status->InSubtitle
                           && !(Dataset->
                                FirstCapital && Status->FirstWordArea))
                    {
                      // k records whether the first character should
                      // remain capitalized.  First try the word lower-
                      // cased (keeping the initial capital if at a
                      // sentence start); failing that, try it as a
                      // capitalized word.
                      k = Status->SentenceStart;
                      strcpy (TestFull, Full);
                      strcpy (TestNormalized, Normalized);
                      DiacriticalStrlwr (&TestFull[Status->SentenceStart]);
                      DiacriticalStrlwr (&TestNormalized
                                         [Status->SentenceStart]);
                      SearchWordlist (Dataset->Words, TestNormalized,
                                      TestFull, &Matched);
                      if (!Matched && !Status->SentenceStart)
                        {
                          TestFull[0] = toupper (TestFull[0]);
                          TestNormalized[0] = toupper (TestNormalized[0]);
                          k = 1;
                          SearchWordlist (Dataset->Words, TestNormalized,
                                          TestFull, &Matched);
                        }       // !Matched && ... 
                      if (Matched)
                        {
                          WantToItalicize = Dataset->FirstItalics ||
                            !Status->FirstWordArea;
                          WantRestOfWordLower = 1;
                          if (!k)
                            {
                              WantFirstCharLower = 1;
                            }   // !k 
                        }       // Matched 
                    }           // !Status->Italicizing && ... 

                  // Here we check the word against the global 
                  // wordlists/namelists.  This helps us to improve 
                  // ALL-CAPS italicizing, for words not found in the 
                  // etext-derived wordlist above.  Also, we can determine 
                  // if the word is foreign, or if it's a 7-bit form that 
                  // needs to be replaced with an 8-bit form.  In theory, 
                  // we could also remove soft-hyphens, but I'll leave 
                  // this for the future (if ever). 
                  i =
                    SearchWordlist (Dataset->Words, Normalized, Full,
                                    &Matched);
                  if (Matched)  // This can't fail, but still ... 
                    {
                      // j holds the word's status flags; a word marked
                      // Frequent is not treated as non-native.
                      j = Dataset->Words->Words[i].WordlistStatus;
                      if (Dataset->Words->Words[i].Frequent)
                        j &= ~SPELL_NONNATIVE;

                      // If we're not already italicizing, we begin  
                      // italicizing for appropriate ALL-CAPS situations.
                      if (Dataset->CapsOk || !IsStrupr (Full))
                        Status->FirstWordArea = 0;
                      else if (Status->Italicizing == ITALICS_NONE
                               && !Status->InHeader1 && !Status->InSubtitle
                               && !(Dataset->FirstCapital
                                    && Status->FirstWordArea))
                        {
                          if (0 != (j & SPELL_LOWERCASE)
                              || 0 != (j & SPELL_CAPITALIZED))
                            {
                              k =
                                Status->
                                SentenceStart | (j & SPELL_CAPITALIZED);
                              // 12/09/01 RSB:  was = 1 */
                              WantToItalicize = Dataset->FirstItalics ||
                                !Status->FirstWordArea;
                              WantRestOfWordLower = 1;
                              if (!k)
                                {
                                  WantFirstCharLower = 1;
                                }       // !k 
                            }   // 0 != ... 
                        }       // !Status->Italicizing && ... 

                      // Next, do foreign words.  This is seemingly a lot  
                      // easier, because we are not so worried about  
                      // capitalization, but it's actually pretty tricky. 
                      // The problem is that many foreign words are also 
                      // recognizable as names or as words in the native 
                      // language of the etext, and hence we can easily 
                      // get into a situation where a foreign phrase shows 
                      // up as a mixture of italicized and non-italicized 
                      // words (which is very annoying).  Therefore, a  
                      // more complex recognition formula is needed.  What 
                      // we do is to compute an index of "foreignosity", 
                      // only partly related to specific words, and  
                      // italicize on the basis of this index.  We have 
                      // two data at our disposal, for individual words: 
                      // the SPELL_FOREIGN flag (which is set if the word 
                      // has ONLY been recognized as foreign) and the  
                      // SPELL_NONNATIVE flag (which is set if the word is 
                      // found in a foreign dictionary even if it was  
                      // previously found in a native dictionary). 
                      // Another difficulty, is that there's a tendency for 
                      // the beginnings of foreign phrases to be left off, 
                      // if the words correspond to native words.  So once 
                      // we reach an appropriate foreignosity threshold, 
                      // we have to work backward to account for them.  
                      Status->LastForeignosity = Status->Foreignosity;
                      if (Status->SentenceStart)
                        Status->Foreignosity = 0;
                      if (Dataset->Words->Words[i].LikelyName)
                        Status->Foreignosity = 0;
                      else if (0 != (j & SPELL_FOREIGN))
                        Status->Foreignosity = 256;
                      else if (0 != (j & SPELL_NONNATIVE))
                        {
                          // A merely non-native word keeps the current
                          // foreignosity only if it shares a language
                          // with the previous word.
                          if (0 ==
                              (LastLanguages & Dataset->Words->Words[i].
                               Languages))
                            Status->Foreignosity = 0;
                        }
                      else
                        Status->Foreignosity = 0;
                      if (Status->Italicizing != ITALICS_NONE)
                        Status->Foreignosity = 0;
                      LastLanguages = Dataset->Words->Words[i].Languages;
                      if (Status->Italicizing == ITALICS_NONE
                          && !Dataset->NoForeign)
                        {
                          if (Status->Foreignosity >= 256)
                            {
                              // Threshold newly crossed: italicize the
                              // remembered preceding words as well.
                              if (Status->LastForeignosity < 256)
                                {
                                  for (k = 0; k < Status->NumBacktracks; k++)
                                    {

                                      // In doing the backtracks, we want to
                                      // be consistent across languages.  We
                                      // do this by making sure the language
                                      // masks overlap.
                                      if (0 ==
                                          (Status->ForeignBacktracks[k].
                                           Languages & Dataset->Words->
                                           Words[i].Languages))
                                        break;

                                      // This markup stuff is okay, 
                                      // because it applies not to  
                                      // the current word, but to  
                                      // prior words where conflicts 
                                      // have already been resolved. 
                                      if (AddMarkup
                                          (Dataset,
                                           Status->ForeignBacktracks[k].Start,
                                           MarkBeginItalics, 0))
                                        goto DiskError;
                                      if (AddMarkup
                                          (Dataset,
                                           Status->ForeignBacktracks[k].End,
                                           MarkEndItalics, 0))
                                        goto DiskError;
                                    }
                                }
                              Status->NumBacktracks = 0;

                              WantToItalicize = 1;
                            }   // Status->Foreignosity ... 
                          else if (0 != (j & SPELL_NONNATIVE)
                                   && (Dataset->CapsOk || !IsStrupr (Full)))
                            {
                              // Not (yet) foreign enough: remember this
                              // word so that it can be italicized later
                              // if a foreign word follows.
                              if (Status->NumBacktracks < MAX_BACKTRACKS)
                                {
                                  Status->ForeignBacktracks[Status->
                                                            NumBacktracks].
                                    Languages =
                                    Dataset->Words->Words[i].Languages;
                                  Status->ForeignBacktracks[Status->
                                                            NumBacktracks].
                                    Start =
                                    Status->LineInfo[0].Offset + ss - s;
                                  Status->ForeignBacktracks[Status->
                                                            NumBacktracks].
                                    End =
                                    Status->ForeignBacktracks[Status->
                                                              NumBacktracks].
                                    Start + strlen (Full);
                                  Status->NumBacktracks++;
                                }
                            }   // 0 != ... 
                          else
                            Status->NumBacktracks = 0;
                        }       // !Status->Italicizing. 
                      else
                        Status->NumBacktracks = 0;
                    }           // Matched                             

                  // Now, handle restoration of diacriticals, by replacing 
                  // 7-bit ASCII with 8-bit ASCII.  The test is guarded
                  // by Matched, since j and Words[i] are only meaningful
                  // for this word when the search above succeeded.
                  if (Matched
                      && 0 != (j & SPELL_NORMALIZED)
                      && Dataset->Words->Words[i].Match != NULL
                      && !Dataset->NoDiacritical
                      && !Dataset->Words->Words[i].LikelyName)
                    Want8Bit = 1;

                  // Now, apply all of the markups implied by the flags
                  // we just set.  k is the offset of the word's first
                  // character.
                  k = Status->LineInfo[0].Offset + ss - s;
                  if (WantToItalicize)
                    {
                      if (AddMarkup (Dataset, k, MarkBeginItalics, 0))
                        goto DiskError;
                    }
                  WantFirstCharUpper = 0;
                  if (Want8Bit)
                    {
                      // The replacement text is the 8-bit form.
                      sss = Dataset->Words->Words[i].Match;
                      if (isupper (*Dataset->Words->Words[i].Normalized)
                          && (Status->SentenceStart
                              || Dataset->Words->Words[i].NotAtBeginning))
                        WantFirstCharUpper = 1;
                      WantFirstCharLower = 0;
                    }
                  else
                    sss = Dataset->Words->Words[i].Full;
                  if (WantFirstCharLower || WantRestOfWordLower || Want8Bit)
                    {
                      // Insert the replacement characters at the word's
                      // starting offset ...
                      for (n = 0; *sss; sss++, n++)
                        {
                          char c;
                          c = *sss;
                          if (!n)
                            {
                              if (WantFirstCharLower)
                                c = DiacriticalTolower (c);
                              else if (WantFirstCharUpper)
                                c = DiacriticalToupper (c);
                            }
                          else
                            {
                              if (WantRestOfWordLower)
                                c = DiacriticalTolower (c);
                            }
                          if (c == '\'')
                            {
                              if (AddMarkup (Dataset, k, MarkRsquo, c))
                                goto DiskError;
                            }
                          else
                            {
                              if (AddMarkup (Dataset, k, MarkInsertChar, c))
                                goto DiskError;
                            }
                        }
                      // ... and remove the original characters.  k ends
                      // up just past the word; single quotes before
                      // that offset are not marked up a second time by
                      // the single-quote code below.
                      for (sss = Dataset->Words->Words[i].Full; *sss;
                           sss++, k++)
                        if (AddMarkup (Dataset, k, MarkRemoveChar, 0))
                          goto DiskError;
                      Status->TripSquote = k;
                    }
                  else
                    k += strlen (Full);
                  if (WantToItalicize)
                    {
                      if (AddMarkup (Dataset, k, MarkEndItalics, 0))
                        goto DiskError;
                    }
                }               // DiacriticalNormalize ... 
            }                   // sss > ss + 1 && ... 
          Status->SentenceStart = 0;
        }

      // "Smart" double-quotes.   
      // Opening quote:  at line start or after whitespace, and followed
      // by something other than end-space.
      if (*ss == '\"' && !IsEndSpace (ss[1]) && (ss == s || isspace (ss[-1])))
        {
          if (AddMarkup
              (Dataset, Status->LineInfo[0].Offset + ss - s,
               MarkBeginSmartQuote, 0))
            goto DiskError;
          continue;
        }
      // Opening quote directly after "--", as in:  paused--"you ...
      if (*ss == '\"' && ss >= &s[2] && ss[-1] == '-' && ss[-2] == '-' && 
          ss[1] != 0 && !isspace (ss[1]))
        {
          if (AddMarkup
              (Dataset, Status->LineInfo[0].Offset + ss - s,
               MarkBeginSmartQuote, 0))
            goto DiskError;
          continue;
        }
      // Closing quote:  follows non-space, and is followed by end-space
      // or by end-punctuation and then end-space.
      if (*ss == '\"' && ss > s && !isspace (ss[-1]) &&
          (IsEndSpace (ss[1]) || (IsEndPunct (ss[1]) && IsEndSpace (ss[2]))))
        {
          if (AddMarkup
              (Dataset, Status->LineInfo[0].Offset + ss - s,
               MarkEndSmartQuote, 0))
            goto DiskError;
          continue;
        }
      // Closing quote directly before "--", as in:  I says"--he
      if (*ss == '\"' && ss[1] == '-' && ss[2] == '-' && ss != s && 
          !isspace (ss[-1]))
        {
          if (AddMarkup
              (Dataset, Status->LineInfo[0].Offset + ss - s,
               MarkEndSmartQuote, 0))
            goto DiskError;
          continue;
        }

      // "Smart" single-quotes.  This differs a little from  
      // double-quotes, because we interpret ALL single-quotes as 
      // right-quotes UNLESS they fulfill reasonable criteria for 
      // being a left-hand quote AND an ending single quote appears in 
      // the not-too distant future (defined as the end of the line). 
      if (*ss == '\'')
        {
          // Right:  1 = right quote unless shown otherwise below,
          //         2 = right quote, and skip the left-quote search,
          //         0 = left quote.
          int Right = 1;
          // This looks for 'em, 'til, etc.
          if (ss != s && !isalpha (ss[-1]))
            {
              int i;
              for (i = 0; i < NUM_SINGLE_QUOTE_WORDS; i++)
                if (!strncasecmp (ss + 1, SingleQuoteWords[i], LengthSingleQuoteWords[i]))
                  if (!isalpha (ss[1 + LengthSingleQuoteWords[i]]))
                    {
                      Right = 2;
                      break;
                    }
            }
          // NOTE(review): the corresponding double-quote case above
          // emits a BEGIN quote after "--", whereas this forces a
          // right quote.  Confirm that this is intended.
          if (ss >= &s[2] && ss[-1] == '-' && ss[-2] == '-' && 
              ss[1] != 0 && !isspace (ss[1]))
            {
              Right = 2;
            }
          if (Right == 1 && !IsEndSpace (ss[1])
              && (ss == s || isspace (ss[-1]) || ss[-1] == '\"'))
            {
              char *sss;

              // Reasonable requirements for an opening quote are 
              // fulfilled.  Now let's look to see if we can find a 
              // trailing quote prior to the next candidate for opening 
              // quote. 
              for (sss = ss + 1; *sss; sss++)
                if (*sss == '\'')
                  {
                    // Another opening-quote candidate:  give up.
                    if (!IsEndSpace (sss[1])
                        && (isspace (sss[-1]) || sss[-1] == '\"'))
                      break;
                    // A closing quote:  follows non-space, and is
                    // followed by end-space or '"', either directly or
                    // after one end-punctuation character.  (The last
                    // test looks at sss[2]; it formerly re-tested
                    // sss[1], which can never be '"' at that point in
                    // any way not already covered.)
                    if (!isspace (sss[-1]) &&
                        (IsEndSpace (sss[1]) || sss[1] == '\"'
                         || (IsEndPunct (sss[1])
                             && (IsEndSpace (sss[2]) || sss[2] == '\"'))))
                      {
                        Right = 0;
                        break;
                      }
                  }
            }
          if (Right)
            {
              k = Status->LineInfo[0].Offset + ss - s;

              // It may be that this quote was already added as part 
              // of a WORD, and so adding new markup for it would 
              // result in the appearance of two quotes. 
              if (k >= Status->TripSquote)
                {
                  if (AddMarkup (Dataset, k, MarkRsquo, 0))
                    goto DiskError;
                }
            }
          else
            {
              if (AddMarkup
                  (Dataset, Status->LineInfo[0].Offset + ss - s, MarkLsquo,
                   0))
                goto DiskError;
            }
          continue;
        }

      // Find multiple dashes.  Replace even-length strings of 
      // dashes with half as many mdashes, and odd-length strings 
      // (>1) with mdashes plus a trailing ndash.  We also convert 
      // " - " to " &mdash; ". 
      if (*ss == '-' && !Dataset->NoMdash
          && !Status->LineInfo[0].WeirdSequences)
        {
          if (ss > s && isspace (ss[-1]) && isspace (ss[1]))
            {
              // " - ":  replace the dash and step onto the space that
              // follows it.
              if (AddMarkup
                  (Dataset, Status->LineInfo[0].Offset + ss - s,
                   MarkInsertMdash, 0))
                goto DiskError;
              if (AddMarkup
                  (Dataset, Status->LineInfo[0].Offset + ss - s,
                   MarkRemoveChar, 0))
                goto DiskError;
              ss++;
              Status->LastFirstSpace = Status->LineInfo[0].Offset + ss - s;
            }
          else if (ss[1] == '-')
            {
              // Consume the whole run, two dashes at a time, with a
              // lone final dash becoming an ndash.
              while (*ss == '-')
                {
                  if (ss[1] != '-')
                    {
                      if (AddMarkup
                          (Dataset, Status->LineInfo[0].Offset + ss - s,
                           MarkInsertNdash, 0))
                        goto DiskError;
                      if (AddMarkup
                          (Dataset, Status->LineInfo[0].Offset + ss - s,
                           MarkRemoveChar, 0))
                        goto DiskError;
                      ss++;
                    }
                  else
                    {
                      if (AddMarkup
                          (Dataset, Status->LineInfo[0].Offset + ss - s,
                           MarkInsertMdash, 0))
                        goto DiskError;
                      if (AddMarkup
                          (Dataset, Status->LineInfo[0].Offset + ss - s,
                           MarkRemoveChar, 0))
                        goto DiskError;
                      if (AddMarkup
                          (Dataset, Status->LineInfo[0].Offset + ss - s + 1,
                           MarkRemoveChar, 0))
                        goto DiskError;
                      ss += 2;
                    }
                }
              // A soft hyphen is marked after the run unless sentence-
              // ending punctuation follows immediately.
              if (*ss != '.' && *ss != '?' && *ss != '!')
                {
                  if (AddMarkup
                      (Dataset, Status->LineInfo[0].Offset + ss - s,
                       MarkSoftHyphen, 0))
                    goto DiskError;
                }
              // Step back so that the loop's ss++ lands on the first
              // character after the run.
              ss--;
              continue;
            }
        }

      // *** Italicizing ***

      // End italics on </I>. 
      if (Status->Italicizing == ITALICS_HTML && !strncasecmp (ss, "</i>", 4))
        {
          j = 4;
        EndItalics:             // Set j before coming here. 
          // j is the length of the closing delimiter.  Mark the end of
          // italics, then remove the delimiter, leaving ss on its last
          // character.
          if (AddMarkup
              (Dataset, Status->LineInfo[0].Offset + ss - s, MarkEndItalics,
               0))
            goto DiskError;
          for (; j > 0; j--)
            {
              if (AddMarkup
                  (Dataset, Status->LineInfo[0].Offset + ss - s,
                   MarkRemoveChar, 0))
                goto DiskError;
              if (j > 1)
                ss++;
            }
          Status->Italicizing = ITALICS_NONE;
          continue;
        }

      // ... and other italicizing styles.
      if (Status->Italicizing >= ITALICS_BRACKET)
        if (!strncmp
            (ss, ItalicizingStyles[Status->Italicizing].End,
             ItalicizingStyles[Status->Italicizing].Length))
          {
            j = ItalicizingStyles[Status->Italicizing].Length;
            goto EndItalics;
          }

      // Begin italicizing when <I> encountered. 
      if (Status->Italicizing == ITALICS_NONE
          && Status->BracketItalicsCount < 5 && !strncasecmp (ss, "<i>", 3))
        {
          j = 3;
          Status->Italicizing = ITALICS_HTML;
        StartItalics:           // Set j and Status->Italicizing before coming.   
          // j is the length of the opening delimiter.
          if (AddMarkup
              (Dataset, Status->LineInfo[0].Offset + ss - s, MarkBeginItalics,
               0))
            goto DiskError;
          // If we hit an arbitrary ceiling for explicit italicizing -- 
          // such as _emphasized_ -- we begin to disallow ALL-CAPS conversion
          // to italics.  
          //
          // We don't want to include things like "_I_" in the count,
          // because you'll often have constructs like this in the
          // ALL-CAPS style, and this may result in inappropriate
          // cancellation of ALL-CAPS conversion.  We don't have any
          // good way to detect this condition, so we just look to 
          // see if the character after the delimiter is 'I'.
          if (Status->Italicizing != ITALICS_ALLCAPS && 'I' != ss[j])
            {
              Status->NonCapsItalicsCount++;
              if (Status->NonCapsItalicsCount > NO_ALLCAPS_TRIGGER)
                Dataset->CapsOk = 1;
            }
          // Remove the delimiter, leaving ss on its last character.
          for (; j > 0; j--)
            {
              if (AddMarkup
                  (Dataset, Status->LineInfo[0].Offset + ss - s,
                   MarkRemoveChar, 0))
                goto DiskError;
              if (j > 1)
                ss++;
            }
          continue;
        }

      // Treat the <emphasis> style differently, since we have to
      // do various odd things to distinguish it from this
      // <i>emphasis</i> style. 
      if (Status->Italicizing == ITALICS_NONE && *ss == '<'
          && Status->BracketItalicsCount > 5)
        goto BracketEntry;
      if (AtWordStart (s, ss, "<"))
        {
        BracketEntry:j = 1;
          // "</" shows that HTML-style tags are in use, so the bracket
          // count is driven far negative.  While the style is not yet
          // locked in, a "<x>" with a single character between the
          // brackets is left alone.
          if (ss[1] == '/')
            Status->BracketItalicsCount = -30000;
          else if (Status->BracketItalicsCount <= 5 && ss[1] != 0
                   && ss[2] == '>')
            ;
          else
            {
              Status->Italicizing = ITALICS_BRACKET;
              Status->BracketItalicsCount++;
              goto StartItalics;
            }
        }

      // ... and the other styles.  The first table entry whose opening
      // delimiter is found at the start of a word wins.
      if (Status->Italicizing == ITALICS_NONE)
        for (j = ITALICS_UNNAMED; j < NUM_ITALICIZING_STYLES; j++)
          if (AtWordStart (s, ss, ItalicizingStyles[j].Start))
            {
              Status->Italicizing = j;
              j = ItalicizingStyles[j].Length;
              goto StartItalics;
            }

      // Words to auto-italicize. This shouldn't be necessary any
      // more, since wordlists have supplanted it.  But, it appears
      // to work, so ...
      if (Status->Italicizing == ITALICS_NONE)
        {
          for (j = 0; j < NUM_AUTO_ITALICIZED; j++)
            if (0 == strncmp (ss, Italicized[j], LengthItalicized[j]))
              {
                // Only at the start of the line or after whitespace.
                // (This "continue" moves on to the next list entry.)
                if (ss > s && !isspace (ss[-1]))
                  continue;
                if (AddMarkup
                    (Dataset, Status->LineInfo[0].Offset + ss - s,
                     MarkBeginItalics, 0))
                  goto DiskError;
                // Move to the last character of the matched string.
                ss += LengthItalicized[j] - 1;
                if (AddMarkup
                    (Dataset, Status->LineInfo[0].Offset + ss - s,
                     MarkEndItalics, 0))
                  goto DiskError;
                break;
              }
          // j is short of the end only if an entry was italicized.
          if (j < NUM_AUTO_ITALICIZED)
            continue;
        }

      // Detect a hanging single '-' at the end  
      // of a line.  If the hanging dash is not preceded by whitespace,  
      // like " - " or " -- " then it is undoubtedly part of  
      // something like "to-day" and needs to have the trailing space  
      // removed, or else in html it will render as "to- day".  Of course, 
      // there's also the larger question of whether it should be reduced  
      // simply to "today",  We'll worry about that one later ... 
      if (isspace (*ss) && ss > &s[1] && ss[-1] == '-'
          && ss[-2] != '-' && !Status->LineInfo[1].Empty
          && !Status->LineInfo[1].BeginsWhite)
        {

          // Search backward for the first non-dash encountered. 
          for (sss = ss - 2; sss >= s && *sss == '-'; sss--);
          if (sss >= s && isalpha (*sss))
            {
              for (sss = ss + 1; isspace (*sss); sss++);
              if (*sss == '\0') // Entire remainder of line blank. 
                {
                  // Remove all of the trailing whitespace, and leave
                  // ss on the last character of the line.
                  for (sss = ss; *sss; sss++)
                    {
                      if (AddMarkup
                          (Dataset, Status->LineInfo[0].Offset + sss - s,
                           MarkRemoveChar, 0))
                        goto DiskError;
                    }
                  ss = sss - 1;
                  continue;
                }
            }
        }

      // Detect position where two spaces are needed (namely, after colon 
      // or end-of-sentence). 
      if (isspace (*ss))
        {
          // sss is moved to the character that decides the matter:
          // normally the one before the space, or the one before a
          // closing quote.
          Status->CouldBeName = 1;
          sss = ss - 1;
          if (sss > s && (*sss == '\"' || *sss == '\''))
            {
              // 11/22/02 RSB.  There's a special case here we have to deal 
              // with in that things like
              //        ... ?" Something
              //        ... !" Something
              // COULD be ends of sentences, but statistically are more likely
              // to something like the following:
              //        "I am home!" John said.
              // Therefore, it is MORE LIKELY that a single space is needed
              // rather than a double space.  We have no good way of knowing
              // for sure.  In my view, a single space where there is supposed
              // to be a double-space is less-annoying looking than the reverse
              // anyhow.  The bottom line, is that we detect this case specially.
              if (sss[-1] != '!' && sss[-1] != '?')
                {
                  Status->CouldBeName = 0;
                  sss--;
                }
            }
          if (sss > s && *sss == ':' && !Dataset->SingleSpace)
            {
              if (AddMarkup
                  (Dataset, Status->LineInfo[0].Offset + ss - s, MarkNbsp, 0))
                goto DiskError;
            }
          else if (sss > s && (*sss == '.' || *sss == '!' || *sss == '?'))
            {
              // Only a period can follow an abbreviated name.
              if (*sss != '.')
                Status->CouldBeName = 0;

              // We want to check whether the next character is  
              // capitalized. 
              for (siv = ss + 1; isspace (*siv); siv++);
              if (*siv == '\0') // Next character is on next line. 
                {
                  if (!Status->LineInfo[1].CapFirstChar)
                    Status->CouldBeName = 0;
                  if (Status->LineInfo[1].VerseCap)
                    goto CapitalizedAfterPeriod;
                }
              else              // Next character is on the same line. 
                {
                  // Look past an opening quote.
                  if (*siv == '\"' || *siv == '\'')
                    {
                      Status->CouldBeName = 0;
                      siv++;
                    }
                  if (*siv >= 'A' && *siv <= 'Z')
                    {
                    CapitalizedAfterPeriod:

                      // Okay, we know that the next char is capitalized,  
                      // but it could still be part of a construction like 
                      // "Mr. Smith".  So we need to detect various 
                      // honorifics and eliminate them.  Also, we detect 
                      // single-letter capitalized abbreviations, as these 
                      // could be initials. 
                      if (Status->CouldBeName)
                        {
                          if (sss > s && sss[-1] >= 'A'
                              && sss[-1] <= 'Z' && (sss == s + 1
                                                    || isspace (sss[-2])))
                            {

                              // Is something like an initial!  No action  
                              // needed. 
                            }
                          else
                            {
                              // Compare the text before the period,
                              // read right to left, with each of the
                              // reversed honorifics.
                              for (j = 0;
                                   j <
                                   sizeof (Honorifics) /
                                   sizeof (Honorifics[0]); j++)
                                {
                                  for (siv = sss - 1, sv = Honorifics[j];
                                       siv >= s && *sv; siv--, sv++)
                                    if (*siv != *sv)
                                      break;
                                  if (*sv == '\0')
                                    break;
                                }
                              // No honorific matched, so this is the
                              // end of a sentence after all.
                              if (j >=
                                  sizeof (Honorifics) /
                                  sizeof (Honorifics[0]))
                                Status->CouldBeName = 0;
                            }
                        }

                      // Finally, we believe that we do need to add the 
                      // extra space. 
                      if (!Status->CouldBeName && !Dataset->SingleSpace)
                        {
                          if (AddMarkup
                              (Dataset, Status->LineInfo[0].Offset + ss - s,
                               MarkNbsp, 0))
                            goto DiskError;
                        }
                    }
                }
            }
        }
    }

  // A line that ends in non-space is treated as ending in a first
  // space at the end-of-line offset.
  if (ss != s && !isspace (ss[-1]))
    Status->LastFirstSpace = Status->LineInfo[0].Offset + ss - s;

  return (0);
DiskError:
  return (5);
}
