///////////////////////////////////////////////////////////////////////////// 
/* 
  Copyright 2008 Ronald S. Burkey <info@sandroid.org>
 
  This file is part of GutenMark. 
 
  GutenMark is free software; you can redistribute it and/or modify 
  it under the terms of the GNU General Public License as published by 
  the Free Software Foundation; either version 2 of the License, or 
  (at your option) any later version. 
 
  GutenMark is distributed in the hope that it will be useful, 
  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  GNU General Public License for more details. 
 
  You should have received a copy of the GNU General Public License 
  along with GutenMark; if not, write to the Free Software 
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 
  Filename:	StanfordTest.c 
  Purpose:	A GutenMark pass using the "Stanford Parser". 
		http://nlp.stanford.edu/software/lex-parser.shtml. 
  Mods:		03/01/2008 RSB	Began.

  I only discovered the Stanford natural-language parser yesterday, but it 
  looks great, at least for English-language texts.  My idea right now is
  that it can be run as a separate pass of GutenMark in order to improve
  certain aspects of GutenMark such as smart quotes and location of headings.
  With luck, it may help with a lot of other problematic aspects as well,
  such as detection of verse.  But it's still early going....
  
  The principal characteristics of the parser that interest me are:
  
  a.	It can distinguish a sentence from a non-sentence.
  b.	It can distinguish left-quotes from right-quotes.
  c.	It can recognize strings of punctuation, such as "--" or "...".
  
  The basic idea is that starting from a PG text, I can do this:
  
  1.  	Strip away the PG file-header and file-footer, if any, leaving just
  	the pure text.
  2.	Add markup to this plain text (to handle such basic GutenMark stuff
  	as italics) using some kind of strings that the parser will ignore
	without messing up the parsing, but will pass in a recognizable form
	to the parsed output.  A lot of stuff seems like it will work for 
	this, but in the absence of any guidance so far from the Stanford
	Parser project (I've sent them an email, inquiring), stuff like
	"{r%d}" seems to work fine.  I'll simply choose a different integer
	to indicate each kind of GutenMark markup I want to add.
  3.	Parse the complete thing using the Stanford Parser.  The parser
  	takes a *long* time to run, so I may decide later to run it on just
	problematic portions of the text such as heading candidates.  For 
	example, it takes about 30 minutes on a 3GHz machine to parse a
	256K file.  An "average" PG text will probably take about an hour
	to parse.  This is obviously too much for the casual user, but is
	well worth it for "professional" work.  Besides, I'll add a 
	command-line switch to activate the parser anyhow.  The parser can 
	provide several kinds of output, of which the ones that interest
	me are text marked up with "Penn Treebank" annotations and a set
	of tree structures.  From the former the original text can be
	recreated but in a better way (for example, straight quotes can 
	be replaced by left quotes and right-quotes), while from the latter
	a text string can be tested to see if it is a complete sentence or not.
  4.	Reform the text into an improved original, as well as using deductions
  	from the parsed output to improve the inferences made by the other 
	GutenMark passes.  The principal inference improvement I anticipate
	is that since headings are typically not complete sentences ...
	although they *can* be ... recognition of headings will be 
	improved.
*/

#include "AutoMark.h"
#include <stdlib.h>

// Forward declaration; the function is defined further down in this file.
// It writes the temporary file that is fed to the parser.
static int OutputStanfordInput (const char *Filename, AnalysisDataset *Dataset);
// Buffer for the command line that StanfordPass builds and hands to
// system() in order to launch the parser.
static char ShellCommand[1024];

//--------------------------------------------------------------------------
// Runs the Stanford Parser as a GutenMark pass:  writes the parser's input
// file (InputToParser.tmp, in the current directory), then shells out to
// Java to parse it.  Dataset->PathToJava and Dataset->PathToParser are
// spliced into the command line as strings; PathToParser is used as a
// prefix for both the jar file and the grammar file, so it presumably has
// to end with a path separator (TODO confirm against the caller).
// Returns 0 on success, non-zero on error (input file could not be written,
// command line too long for ShellCommand, or the parser command failed).

int
StanfordPass (AnalysisDataset * Dataset)
{
  int ReturnValue = 1, Length, ExitStatus;

  printf ("Running the Stanford NLP pass ...\n");
  
  // Steps 1,2:  Create a file for the parser to parse.  We call this file
  // InputToParser.tmp, and put it in the current directory.  What we do is 
  // to put the contents of both the original text file and the markup file
  // that has been created into this single file.  At this point, we don't 
  // need the text and the markup file any longer.  After the parser has 
  // run, we'll create new text and markup files for the output passes.
  // This is a big departure from the other GutenMark passes, where the 
  // original text is never touched, but we do it for two reasons:  Firstly,
  // we need to, since there's no way to match the positions of the markup
  // to the output of the parser otherwise.  Secondly, the reason we avoid
  // touching the original in the other GutenMark passes is to save time; but
  // the parser path takes so long that creating a new text file for it to 
  // work on is infinitesimal in comparison.
  if (OutputStanfordInput ("InputToParser.tmp", Dataset))
    goto Error;
  
  // Step 3:  Run the parser.  This is the easy part.  It takes a long time,
  // but all we have to do is to sit back and watch!
  // The command is formatted with a bounded write, since the paths come 
  // from outside this function and could otherwise overrun ShellCommand.
  // NOTE(review): ':' is the Java classpath separator on Unix-like systems
  // only (Windows uses ';') -- confirm which platforms must be supported.
  Length = snprintf (ShellCommand, sizeof (ShellCommand),
		     "%s -server -mx150m -cp \"%sstanford-parser.jar:\" "
		     "edu.stanford.nlp.parser.lexparser.LexicalizedParser "
		     "-outputFormat \"wordsAndTags,penn\" "
		     "%senglishPCFG.ser.gz InputToParser.tmp",
		     Dataset->PathToJava, Dataset->PathToParser,
		     Dataset->PathToParser);
  if (Length < 0 || (size_t) Length >= sizeof (ShellCommand))
    {
      // A truncated command must not be executed.
      fprintf (stderr, "The Stanford Parser command line is too long.\n");
      goto Error;
    }
  printf ("%s\n", ShellCommand);
  ExitStatus = system (ShellCommand);	// Shell out to run it!	   
  if (ExitStatus != 0)
    {
      // Either the shell could not be started or the command reported
      // failure; in both cases there is no parsed output to work with.
      fprintf (stderr, "The Stanford Parser command failed (status %d).\n",
	       ExitStatus);
      goto Error;
    }
  //unlink ("InputToParser.tmp");	// Get rid of the temporary file.	   
  
  // Step 4:  Reform the parsed output into a file that the output passes
  // can recognize.  In other words, create a pure-text file and a markup
  // file.  (Nothing is implemented for this step yet.)
  
  
  ReturnValue = 0;
Error:
  return (ReturnValue);
}

//--------------------------------------------------------------------------
// Creates the temporary file used as input to the parser by reading the
// original text and the markup file.  It is based on OutputHtml.c, greatly
// simplified, but uses the strings "{r%d}" in place of any HTML, as the 
// parser seems to pass them through to the output without affecting the
// parsing.  (I found this empirically, so there's no guarantee, I suppose.)  
//
// Text before Dataset->TextStart is skipped, and nothing is written until
// the first MarkHeader1 mark is reached (the "prefatory" area).  
// Dataset->OffsetOfActualText is set to the offset where output begins, or
// to Dataset->TextStart if no MarkHeader1 mark is ever seen.
//
// Returns 0 on success, non-zero on error (the output file cannot be 
// opened, a seek fails, or writing/closing the output file fails).

static int
OutputStanfordInput (const char *Filename, AnalysisDataset * Dataset)
{
  FILE *OutputFile;
  int ReturnValue = 1, c, InPrefatoryArea = 1;
  MarkupRecord Mark;
  unsigned long Offset;
  
  OutputFile = fopen (Filename, "w");
  if (OutputFile == NULL)
    return (1);

  if (fseek (Dataset->MarkupFile, 0, SEEK_SET) != 0)
    goto Done;
  
  // In all cases, try to skip past the PG header.
  do
    {
      if (fread (&Mark, sizeof (Mark), 1, Dataset->MarkupFile) != 1)
	{
	  // No (more) marks.  Only Mark.Type is meaningful from here on.
	  Mark.Type = MarkNoMoreMarks;
	  break;
	}
    }
  while (Mark.Offset < Dataset->TextStart);
  if (fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET) != 0)
    goto Done;
  Offset = Dataset->TextStart;
  Dataset->OffsetOfActualText = Offset;
    
  // Now loop through the remaining text and markers.  Ignore everything
  // in the "prefatory" area.
  for (; (c = fgetc (Dataset->InputFile)) != EOF; Offset++)
    {
      // Mark.Type is tested first because Mark.Offset is not valid once
      // the marks have run out.
      while (Mark.Type != MarkNoMoreMarks && Offset == Mark.Offset)
	{
	  if (InPrefatoryArea && Mark.Type == MarkHeader1)
	    { 
	      Dataset->OffsetOfActualText = Offset;
	      InPrefatoryArea = 0;
	    }
	  if (!InPrefatoryArea)
	    {
	      // Various marks are ignored (and thus omitted here) when
	      // the Stanford Parser is used, because the parser does it
	      // better than the standard GutenMark algorithm.
	      if (Mark.Type != MarkRemoveChar &&
	          Mark.Type != MarkNbsp &&
		  Mark.Type != MarkInsertMdash &&
		  Mark.Type != MarkInsertNdash &&
		  Mark.Type != MarkBeginSmartQuote &&
		  Mark.Type != MarkEndSmartQuote &&
		  Mark.Type != MarkLsquo &&
		  Mark.Type != MarkRsquo &&
		  Mark.Type != MarkSoftHyphen)
		{
		  if (fprintf (OutputFile, "{r%d}", Mark.Type) < 0)
		    goto Done;
		}
	    }
	  if (fread (&Mark, sizeof (Mark), 1, Dataset->MarkupFile) != 1)
	    Mark.Type = MarkNoMoreMarks;
	}
      if (!InPrefatoryArea && putc (c, OutputFile) == EOF)
	goto Done;
    }
  
  if (ferror (OutputFile))
    goto Done;
  ReturnValue = 0;

Done:
  // fclose flushes buffered output, so a failure here means the file may 
  // be incomplete and has to be reported as well.
  if (fclose (OutputFile) != 0)
    ReturnValue = 1;
  return (ReturnValue);
}

