/////////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	CheckGutenbergHeader.c
  Purpose:	This analyzes the input text file to see if it has a 
  		Project Gutenberg header.
  Mods:		08/31/01 RSB	Began.
  		11/02/01 RSB	Added GPL disclaimer and reformatted 
				somewhat for first web release.
		12/27/01 RSB	Now add MarkEndOfGutenbergHeader when the 
				header is not found (allowing non-PG
				files to be marked up).
  
  There's no certain way to detect the Project Gutenberg header.  This 
  function applies the following heuristic:  
  
  1.	The phrase "Project Gutenberg" and the word "etext" must appear 
  	in some line within the first 50 lines of the file; and

  2.	The string "*end*" must appear within the first 500 lines of the 
  	file.
  
  If so, the header is assumed to begin with the first non-blank line of 
  the file.  The first actual line of text is assumed to be the first 
  non-blank line after the line containing "*end*".      
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "AutoMark.h"
#define START_MAX 50
#define END_MAX 500

//------------------------------------------------------------------------
// Convert a NUL-terminated string to upper case, in place.
//
// Each character is passed to toupper() as an unsigned char value:  
// handing toupper() a negative value other than EOF is undefined, and
// plain char may be signed, so any byte >= 0x80 in the input text would
// otherwise be a problem.

static void
StrUpr (char *s)
{
  for (; *s != '\0'; s++)
    *s = (char) toupper ((unsigned char) *s);
}

//------------------------------------------------------------------------

// Look for a Project Gutenberg header in the input file and record the
// result in the markup file.
//
// Reading starts at Dataset->TextStart.  A header is deemed present when
//   1. one of the first START_MAX lines read contains both 
//      "PROJECT GUTENBERG" and "ETEXT" (compared case-insensitively), and
//   2. a later line, before END_MAX lines have been counted, contains 
//      "*END*".
// The text proper is then taken to start at the first subsequent line 
// holding any non-whitespace character.
//
// Lines are read in chunks of at most sizeof (s) - 2 characters, so an 
// over-long physical line is counted as several lines.
//
// Markup written:
//   header found:      MarkJumpPastGutenbergHeader at offset 0, followed by
//                      MarkEndOfGutenbergHeader at the start of the text.
//   header not found:  MarkEndOfGutenbergHeader at offset 0 only.
//
// Dataset->TextStart is set to the offset of the start of the text when a
// header is found, and to 0 otherwise.
//
// Returns 1 if a header was found, 0 if not.
//
// NOTE(review): the results of the fwrite calls are not checked; the 
// return value only reports whether a header was detected.

int
CheckGutenbergHeader (AnalysisDataset * Dataset)
{
  int Count;
  long Position;
  char s[256], ss[256];
  unsigned long Offset = 0;
  MarkupRecord Mark;

  // Zero the entire record, so that any part of it not explicitly set
  // below is written to the markup file with a defined value.
  memset (&Mark, 0, sizeof (Mark));
  Mark.Offset = 0;
  s[sizeof (s) - 1] = 0;
  if (0 != fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET))
    goto Bypass;

  // Search for the start of the header.
  for (Count = 0; Count < START_MAX; Count++)
    {
      if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile))
        goto Bypass;
      StrUpr (s);
      if (NULL != strstr (s, "PROJECT GUTENBERG")
          && NULL != strstr (s, "ETEXT"))
        break;
    }
  if (Count >= START_MAX)
    goto Bypass;

  // Search for the end of the header.  Count carries on from the line
  // on which the header start was recognized.
  for (; Count < END_MAX; Count++)
    {
      if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile))
        goto Bypass;
      StrUpr (s);
      if (NULL != strstr (s, "*END*"))
        break;
    }
  if (Count >= END_MAX)
    goto Bypass;

  // Okay, find the first non-blank line.  The candidate position is kept
  // in a temporary and only committed to Offset once a line with content
  // has actually been found; if the file ends first (or ftell fails),
  // Offset stays 0 and the no-header path below remains self-consistent.
  for (;;)
    {
      Position = ftell (Dataset->InputFile);
      if (Position < 0)
        goto Bypass;
      if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile))
        goto Bypass;

      // See if the line has any content.  The field width keeps the
      // conversion within ss regardless of the size of s.
      if (1 == sscanf (s, "%255s", ss))
        break;
    }
  Offset = (unsigned long) Position;

  // Well, it has been found.  Let's add the markups for the beginning and
  // the end of the header.
  Mark.Offset = 0;
  Mark.Type = MarkJumpPastGutenbergHeader;
  fwrite (&Mark, sizeof (Mark), 1, Dataset->MarkupFile);
  Mark.Offset = Offset;
Bypass:			// Come here if no header found.
  // The end-of-header mark is written in either case; when no header was
  // found Mark.Offset is still 0.
  Mark.Type = MarkEndOfGutenbergHeader;
  fwrite (&Mark, sizeof (Mark), 1, Dataset->MarkupFile);
  Dataset->TextStart = Offset;
  return (Mark.Offset != 0);
}
