#include "udm_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>
#include <regex.h>
#include <signal.h>

#if (WIN32|WINNT)
#include <time.h>
#endif

#ifdef HAVE_SYS_TIME_H
#include <sys/time.h>
#endif

#ifdef HAVE_LIBUTIL_H
#include <libutil.h>
#endif

#include "udm_utils.h"
#include "udm_common.h"
#include "udm_log.h"
#include "udm_conf.h"
#include "udm_indexer.h"
#include "udm_robots.h"
#include "udm_db.h"
#include "udm_parseurl.h"
#include "udm_charset.h"
#include "udm_parser.h"
#include "udm_proto.h"
#include "udm_hrefs.h"
#include "udm_mutex.h"
#include "udm_crc32.h"
#include "udm_xmalloc.h"
#include "udm_http.h"
#include "udm_id3.h"
#include "udm_host.h"
#include "udm_server.h"
#include "udm_filter.h"
#include "udm_alias.h"
#include "udm_word.h"
#include "udm_crossword.h"
#include "udm_parsehtml.h"
#include "udm_cache.h"
#include "udm_spell.h"
#include "udm_execget.h"
#include "udm_agent.h"
#include "udm_mimetype.h"

#ifdef NEWS_EXT
#include "udm_parsedate.h"
#endif

/* Call the optional ThreadInfo callback as ThreadInfo(h,s,m); do nothing   */
/* when the callback is NULL. The expansion is one void expression, so the  */
/* macro can be used as the body of an unbraced if/else: a bare "if(...)"   */
/* expansion would silently capture an "else" that follows the macro call.  */
/* Note: the ThreadInfo argument is evaluated twice.                        */
#define UDM_THREADINFO(ThreadInfo,h,s,m)	((ThreadInfo)?(void)(ThreadInfo)(h,s,m):(void)0)

/****************************************************************/
static int cmplang(const void *s1,const void *s2){
	return(((const UDM_LANG*)s2)->count-((const UDM_LANG*)s1)->count);
}


/* Release a UDM_DOCUMENT together with the string members it owns.        */
/* A NULL argument is accepted and ignored.                                */
static void FreeDoc(UDM_DOCUMENT *Result){
	if(Result){
		UDM_FREE(Result->url);
		UDM_FREE(Result->content_type);
		UDM_FREE(Result->title);
		UDM_FREE(Result->text);
		UDM_FREE(Result->keywords);
		UDM_FREE(Result->description);
		UDM_FREE(Result->category);
		/* FixMe: Result->content is deliberately NOT released here.     */
		/* UdmIndexNextURL() points it into Indexer->buf, which this     */
		/* function does not own; whether it ever needs freeing is open. */
		free(Result);
	}
}


/*********************** 'UrlFile' stuff (for -f option) *******************/

/*
 * UdmURLFile - read the URL list file and apply 'action' to every entry.
 *
 * The file name comes from Indexer->Conf->url_file_name; the name "-"
 * selects standard input.  Trailing CR/LF characters are stripped from
 * each line, empty lines and lines starting with '#' are skipped, and a
 * line ending in a backslash is joined with the line(s) that follow it.
 *
 * action is one of:
 *   UDM_URL_FILE_REINDEX  add the entry as URL limit, UdmMarkForReindex()
 *   UDM_URL_FILE_CLEAR    add the entry as URL limit, UdmClearDB()
 *   UDM_URL_FILE_INSERT   hand the entry to UdmAddHref()
 *   UDM_URL_FILE_PARSE    only verify that the entry parses as an URL
 *                         and has a schema
 *
 * Returns IND_OK when the whole file was processed.  Returns the failing
 * result of UdmMarkForReindex() in REINDEX mode, and IND_ERROR when the
 * file cannot be opened, a joined line does not fit into the line buffer,
 * UdmClearDB() fails, or an URL is rejected in PARSE mode.
 * The file is closed on every return path (stdin is never closed).
 */
__INDLIB__ int UdmURLFile(UDM_AGENT *Indexer, int action){
	FILE *url_file;
	char str[UDMSTRSIZ]="";		/* logical line, continuations joined */
	char str1[UDMSTRSIZ]="";	/* physical line as read by fgets()   */
	int result,res;
	int rc=IND_OK;
	UDM_URL myurl;

	if(!Indexer->Conf->url_file_name){
		UdmLog(Indexer,UDM_LOG_ERROR,"URL file name is not set");
		return(IND_ERROR);
	}

	if(!strcmp(Indexer->Conf->url_file_name,"-")){
		url_file=stdin;
	}else{
		url_file=fopen(Indexer->Conf->url_file_name,"r");
		if(!url_file){
			/* Do not rely on the caller having tested readability */
			UdmLog(Indexer,UDM_LOG_ERROR,"Can't open URL file '%s': %s",
				Indexer->Conf->url_file_name,strerror(errno));
			return(IND_ERROR);
		}
	}

	/* Read lines and clear/insert/check URLs */
	while(fgets(str1,sizeof(str1),url_file)){
		char *end;
		int continued;

		if(!str1[0])continue;
		/* Strip trailing CR/LF; 'end' is left on the last kept char */
		end=str1+strlen(str1)-1;
		while((end>=str1)&&(*end=='\r'||*end=='\n')){
			*end=0;if(end>str1)end--;
		}
		if(!str1[0])continue;
		if(str1[0]=='#')continue;

		/* A trailing backslash means "continued on the next line" */
		continued=(*end=='\\');
		if(continued)*end=0;

		/* Joined continuation lines must still fit into str[] */
		if(strlen(str)+strlen(str1)>=sizeof(str)){
			UdmLog(Indexer,UDM_LOG_ERROR,"Line too long in URL file '%s'",
				Indexer->Conf->url_file_name);
			rc=IND_ERROR;
			break;
		}
		strcat(str,str1);
		if(continued)continue;
		str1[0]=0;

		switch(action){
		case UDM_URL_FILE_REINDEX:
			UdmAddURLLimit(Indexer->Conf,str);
			if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
			Indexer->Conf->have_targets=1;
			if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
			result=UdmMarkForReindex(Indexer);
			if(result!=IND_OK){
				rc=result;
				goto done;
			}
			UdmClearURLLimit(Indexer->Conf);
			break;
		case UDM_URL_FILE_CLEAR:
			UdmAddURLLimit(Indexer->Conf,str);
			result=UdmClearDB(Indexer);
			if(result!=IND_OK){
				rc=IND_ERROR;
				goto done;
			}
			UdmClearURLLimit(Indexer->Conf);
			break;
		case UDM_URL_FILE_INSERT:
			if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
			if(UdmAddHref(Indexer->Conf,str,0,0,0,NULL,NULL))
				Indexer->Conf->have_targets=1;
			if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
			break;
		case UDM_URL_FILE_PARSE:
			res=UdmParseURL(&myurl,str);
			/* An URL without a schema is treated as malformed */
			if((!res)&&(!myurl.schema[0]))
				res=UDM_PARSEURL_BAD;
			if(res){
				switch(res){
				case UDM_PARSEURL_LONG:
					UdmLog(Indexer,UDM_LOG_ERROR,"URL too long: '%s'",str);
					break;
				case UDM_PARSEURL_BAD:
				default:
					UdmLog(Indexer,UDM_LOG_ERROR,"Error in URL: '%s'",str);
				}
				rc=IND_ERROR;
				goto done;
			}
			break;
		default:
			break;
		}
		str[0]=0;
	}

done:
	/* Single exit point so the file is never leaked on error */
	if(url_file!=stdin)
		fclose(url_file);
	return(rc);
}


/******* Main indexer functions StartUp & IndexNextURL ********/

/* Init section for indexing */

/*
 * StartUp - initialisation performed when UdmIndexNextURL() is called
 * with UDM_FLAG_INIT.
 *
 * In order, it:
 *   - opens the cache when Conf->DBMode is UDM_DBMODE_CACHE;
 *   - with UDM_FLAG_MARK, processes the URL file in REINDEX mode (when a
 *     file name is configured) and calls UdmMarkForReindex();
 *   - sets Conf->DBUseLock from UDM_FLAG_SKIP_LOCKING;
 *   - with UDM_FLAG_INSERT and a configured URL file, inserts its URLs;
 *   - loads every stopword table named in Conf->stop_tables;
 *   - loads robots information;
 *   - in ispell DB mode, imports affixes and the dictionary and sorts
 *     them when Conf->nspell is non-zero;
 *   - loads every server table named in Conf->srv_tables.
 *
 * Table lists are whitespace separated.  Returns 0 on success and 1 as
 * soon as any step fails.
 */
static int StartUp(UDM_AGENT * Indexer, int index_flags){
	char *tok,*lt;
	char tables[UDMSTRSIZ];	/* scratch copy; UdmGetToken() gets a writable buffer */

	if(Indexer->Conf->DBMode==UDM_DBMODE_CACHE){
		if(IND_OK!=UdmOpenCache(Indexer->Conf,UdmDBErrorMsg(Indexer->db))){
			return(1);
		}
	}
	Indexer->Conf->have_targets=1;
	if(index_flags&UDM_FLAG_MARK){
		if(Indexer->Conf->url_file_name){
			/* Do not ignore a failure while processing the URL file */
			if(UdmURLFile(Indexer,UDM_URL_FILE_REINDEX)!=IND_OK)
				return(1);
		}

		UdmMarkForReindex(Indexer);
		if(UdmDBErrorCode(Indexer->db)){
			return(1);
		}
	}
	if(index_flags&UDM_FLAG_SKIP_LOCKING)
		Indexer->Conf->DBUseLock=0;
	else
		Indexer->Conf->DBUseLock=1;

	if((Indexer->Conf->url_file_name) && (index_flags&UDM_FLAG_INSERT)) {
		if(UdmURLFile(Indexer,UDM_URL_FILE_INSERT)!=IND_OK)
			return(1);
		if(UdmDBErrorCode(Indexer->db)){
			return(1);
		}
	}

	/* Now load all stopword tables from database            */
	/* Their names are in Indexer->Conf->stop_tables string  */

	/* Refuse a list that would overflow the scratch buffer */
	if(strlen(Indexer->Conf->stop_tables)>=sizeof(tables)){
		UdmLog(Indexer,UDM_LOG_ERROR,"Stopword table list is too long");
		return(1);
	}
	strcpy(tables,Indexer->Conf->stop_tables);
	tok=UdmGetToken(tables," \r\n\t",&lt);
	while(tok){
		UdmLog(Indexer,UDM_LOG_DEBUG,"Load stopword table '%s'",tok);
		if(UdmLoadStopList(Indexer,tok)!=IND_OK)
			return(1);
		tok=UdmGetToken(NULL," \r\n\t",&lt);
	}

	/* Load robot.txt information */
	if(UdmLoadRobots(Indexer)!=IND_OK)
		return(1);

	/* load ispell data in IspellMode db */
	if (Indexer->Conf->ispell_mode & UDM_ISPELL_MODE_DB) {
		if (UdmDBImportAffixes(Indexer,Indexer->charset)) return(1);
		if (UdmImportDictionaryFromDB(Indexer)) return(1);
		if(Indexer->Conf->nspell) {
		  UdmSortDictionary(Indexer->Conf);
		  UdmSortAffixes(Indexer->Conf);
		}
	}

	/* Now load all server tables from database            */
	/* Their names are in Indexer->Conf->srv_tables string */

	/* Same overflow guard as for the stopword table list */
	if(strlen(Indexer->Conf->srv_tables)>=sizeof(tables)){
		UdmLog(Indexer,UDM_LOG_ERROR,"Server table list is too long");
		return(1);
	}
	strcpy(tables,Indexer->Conf->srv_tables);
	tok=UdmGetToken(tables," \r\n\t",&lt);
	while(tok){
		UdmLog(Indexer,UDM_LOG_DEBUG,"Load server table '%s'",tok);
		if(UdmLoadServerTable(Indexer,tok,index_flags)!=IND_OK)
			return(1);
		tok=UdmGetToken(NULL," \r\n\t",&lt);
	}
	return(0);
}


__INDLIB__ int UdmIndexNextURL(UDM_AGENT *Indexer,int index_flags){
char request[UDMSTRSIZ]="";
char reason[UDMSTRSIZ]="";
udmcrc32_t crc32;

char *lt,*tok;
char *header,*content_type,*location,*statusline;
char *hcopy=NULL;

int size,realsize;
int status=0,Method=0;
int origin,follow,do_index,changed;
int reindex;
int found_in_mirror = 0;
int result=IND_UNKNOWN;

UDM_URL		CurURL;
UDM_URL		realURL;
UDM_DOCUMENT	*Doc=NULL;
UDM_SERVER	*CurSrv=0;
UDM_ALIAS	*Alias;


char text_escaped[2*UDM_MAXTEXTSIZE+1]="";
char keywords_escaped[2*UDM_MAXKEYWORDSIZE+1]="";
char descript_escaped[2*UDM_MAXDESCSIZE+1]="";
char title_escaped[UDM_MAXTITLESIZE*2+1]="";

time_t last_mod_time=0;
char buf[UDM_MAXTIMESTRLEN]="";
char subj[UDM_MAXTITLESIZE]="";
char from[UDM_MAXKEYWORDSIZE]="";
char content_type_escaped[256]="";
char aliastr[UDMSTRSIZ]="";
char emptystr[]="";

#ifdef NEWS_EXT
/* I need to retain newsgroup-info, so I introduce new variables */
/* I also stick with these fixed-length strings, although I dislike them.*/
char HeaderDate[UDM_MAXDATESIZE] = "";
char HeaderDateEsc[2*UDM_MAXDATESIZE+1] = "";
char HeaderSubject[UDM_MAXSUBJSIZE] = "";
char HeaderSubjectEsc[2*UDM_MAXSUBJSIZE+1] = "";
char HeaderFrom[UDM_MAXFROMSIZE] = "";
char HeaderFromEsc[2*UDM_MAXFROMSIZE+1] = "";
char HeaderGroup[UDM_MAXGROUPSIZE] = "";
char HeaderGroupEsc[2*UDM_MAXGROUPSIZE+1] = "";
char HeaderRefs[UDM_MAXREFSIZE] = "";
char HeaderRefsEsc[2*UDM_MAXREFSIZE+1] = "";
char MessageIdEsc[2*UDM_MAXFROMSIZE+1]="";
char *parent=NULL, *SQLDate;
#endif

#ifdef USE_PARSER
char *mime;
int mimeno;
#endif

int is_mp3=0, mp3_size=0;

	/* Do init stuff if required */
	if(index_flags&UDM_FLAG_INIT){
		if(StartUp(Indexer, index_flags))
			return(IND_ERROR);
		else
			return(IND_OK);
	}else{
		int j;
		/* Clear language statistics */
		for(j=0;j<UDM_LANGPERDOC;j++){
			Indexer->lang[j].lang[0]=0;
			Indexer->lang[j].count=0;
		}
		Indexer->nlangs = 0;
	}

	if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
	/* Store URLs from cache into database */
	if(UdmStoreHrefs(Indexer))Indexer->Conf->have_targets=1;
	if(UdmDBErrorCode(Indexer->db)){
		result=IND_ERROR;
	}else
	if(!Indexer->Conf->have_targets){
		result=IND_NO_TARGET;
	}else{
		/* Get Next URL to be indexed from the database */
		UDM_THREADINFO(Indexer->Conf->ThreadInfo,Indexer->handle,"Selecting","");
		if(!(Doc=UdmGetDocInfo(Indexer,index_flags))){
			if(UdmDBErrorCode(Indexer->db)){
				result=IND_ERROR;
			}else{
				Indexer->Conf->have_targets=0;
				result=IND_NO_TARGET;
			}
		}	
	}
	if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);

	/* In GUI version we may send "terminate" signal       */
	/* This line checks whether it already has happened    */
	/* It allows terminate indexing faster                 */
	/* We include this line in several places of this func */
	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;
	if(result)return(result);

	/* Alloc buffer for document */
	if(!Indexer->buf){
		Indexer->buf=(char*)UdmXmalloc((size_t)Indexer->Conf->max_doc_size);
	}
	UdmLog(Indexer,UDM_LOG_INFO,"%s",Doc->url);
	UDM_THREADINFO(Indexer->Conf->ThreadInfo,Indexer->handle,"Found",Doc->url);
	reindex=index_flags&UDM_FLAG_REINDEX;

#ifdef HAVE_SETPROCTITLE

	/* To see the URL being indexed   */
	/* in "ps" output on FreeBSD      */
	/* Do it if single thread version */

	if(!(Indexer->handle))
		setproctitle("%s",Doc->url);
#endif

	/* Check that URL has valid syntax */
	if(UdmParseURL(&CurURL,Doc->url)){
		result=UdmDeleteUrl(Indexer,Doc->url_id);
		UdmLog(Indexer,UDM_LOG_WARN,"Invalid URL: %s ... deleting",Doc->url);
		FreeDoc(Doc);
		return(result);
	}

	/* Find correspondent Server record from indexer.conf */
	if(!(CurSrv=UdmFindServer(Indexer->Conf,Doc->url,aliastr))){
		UdmLog(Indexer,UDM_LOG_WARN,"No 'Server' command for url... deleted.");
		if(!strcmp(CurURL.filename,"robots.txt")){
			if(IND_OK==(result=UdmDeleteRobotsFromHost(Indexer,CurURL.hostinfo)))
				result=UdmLoadRobots(Indexer);
		}else{
			result=IND_OK;
		}
		if(result==IND_OK)result=UdmDeleteUrl(Indexer,Doc->url_id);
		FreeDoc(Doc);
		return(result);
	}
	

	if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
	/* Add current URL in memory cache to avoid */
	/* possible double INSERT INTO url          */
	UdmAddHref(Indexer->Conf,Doc->url,0,Doc->hops,1,CurSrv->tag,CurSrv->category);
	Indexer->Conf->have_targets=1;
	if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);


	switch(UDM_SRV_TYPE(CurSrv->match_type)){
		case UDM_SERVER_STRING:
			UdmLog(Indexer,UDM_LOG_DEBUG,"Realm string '%s'",CurSrv->url);
			break;
		case UDM_SERVER_REGEX:
			UdmLog(Indexer,UDM_LOG_DEBUG,"Realm regex '%s'",CurSrv->url);
			break;
		case UDM_SERVER_SUBSTR:
		default:
			UdmLog(Indexer,UDM_LOG_DEBUG,"Server '%s'",CurSrv->url);
			break;
	}
	Indexer->doc_timeout=CurSrv->doc_timeout;
	Indexer->read_timeout=CurSrv->read_timeout;
	Indexer->maxsize=Indexer->Conf->max_doc_size;
	Indexer->wordpos=0;

	/* Check that hops is less than MaxHops */
	if(Doc->hops>CurSrv->maxhops){
		result=UdmDeleteUrl(Indexer,Doc->url_id);
		UdmLog(Indexer,UDM_LOG_WARN,"Too many hops (%d)... deleted.",Doc->hops);
		FreeDoc(Doc);
		return(result);
	}

	/* Check Allow/Disallow/CheckOnly stuff */

	Method=UdmFindFilter(Indexer->Conf,Doc->url,reason);
	UdmLog(Indexer,UDM_LOG_DEBUG,"%s",reason);

	if(Method==UDM_DISALLOW){
		result=UdmDeleteUrl(Indexer,Doc->url_id);
		if((result==IND_OK)&&(!strcmp(CurURL.filename,"robots.txt"))){
			if(IND_OK==(result=UdmDeleteRobotsFromHost(Indexer,CurURL.hostinfo)))
				result=UdmLoadRobots(Indexer);
		}
		FreeDoc(Doc);
		return(result);
	}

	UdmParseURL(&realURL,Doc->url);
	/* Check for too many errors on this server */
	if((UdmGetServerErr(Indexer->Conf,realURL.hostname)>=CurSrv->max_net_errors)&&(CurSrv->max_net_errors)){
		UdmLog(Indexer,UDM_LOG_WARN,"Too many network errors for this server, skipped");
		result=UdmUpdateUrl(Indexer,Doc->url_id,504,CurSrv->net_error_delay_time);
	}else /* Check whether URL is disallowed by robots.txt */
	if(CurSrv->use_robots){
		UDM_ROBOT * R;
		
		if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_ROBOTS);
		R=UdmFindRobots(Indexer->Conf,&realURL);
		if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_ROBOTS);
		if(R){
			UdmLog(Indexer,UDM_LOG_WARN,"Deleting URL: robots.txt: '%s'",R->path);
			result=UdmDeleteUrl(Indexer,Doc->url_id);
		}
	}
	
	if(result){
		FreeDoc(Doc);
		return(result);
	}

	/* Start AliasProg if no alias in "Server" command */
#ifdef UDM_WITH_ALIASPROG
	if((Indexer->Conf->alias_prog)&&(!aliastr[0])){
		FILE * aprog;
		char cmd[UDM_URLSIZE*3];
		char * ares=NULL;
		char * args[1];

		args[0]=Doc->url;
		UdmBuildParamStr(cmd,sizeof(cmd),Indexer->Conf->alias_prog,args,1);
		aprog=popen(cmd,"r");
		UdmLog(Indexer,UDM_LOG_EXTRA,"Starting AliasProg: '%s'",cmd);
		if(aprog){
			ares=fgets(aliastr,sizeof(aliastr),aprog);
			pclose(aprog);
			if(!ares){
				UdmLog(Indexer,UDM_LOG_ERROR,"AliasProg didn't return result: '%s'",cmd);
				return(IND_ERROR);
			}
		}else{
			UdmLog(Indexer,UDM_LOG_ERROR,"Can't start AliasProg: '%s'",cmd);
			return(IND_ERROR);
		}
		if(ares[0]){
			ares+=strlen(ares);
			ares--;
			while((ares>=aliastr)&&strchr(" \r\n\t",*ares)){
				*ares='\0';
				ares--;
			}
		}
		UdmLog(Indexer,UDM_LOG_EXTRA,"AliasProg result: '%s'",aliastr);
	}
#endif

	/* Find alias when aliastr is empty, i.e.     */
	/* when there is no alias in "Server" command */
	/* and no AliasProg                           */
	if((!aliastr[0])&&(Alias=UdmFindAlias(Indexer->Conf,Doc->url))){
		sprintf(aliastr,"%s%s",Alias->replace,Doc->url+strlen(Alias->find));
	}

	/* Parse aliased URL */
	if(aliastr[0]){
		if(UdmParseURL(&realURL,aliastr)){
			UdmLog(Indexer,UDM_LOG_ERROR,"Error in aliased URL: '%s'",aliastr);
			FreeDoc(Doc);
			return(IND_OK);
		}else{
			UdmLog(Indexer,UDM_LOG_EXTRA,"Alias: '%s'",aliastr);
		}
	}

	/* Compose HTTP/1.0 request header */
	sprintf(request,"%s %s%s HTTP/1.0\r\n",
		(Method==UDM_HEAD)?"HEAD":"GET",
		CurSrv->proxy?Doc->url:realURL.path,
		CurSrv->proxy?"":realURL.filename);

	/* Add If-Modified-Since header */
	if((Doc->status)&&(Doc->last_mod_time)&&(!reindex)){
		UdmTime_t2HttpStr(Doc->last_mod_time, buf);
		sprintf(UDM_STREND(request),"If-Modified-Since: %s\r\n",
			buf);
	}

	/* HTTP and FTP specific stuff */
	if((!strcasecmp(realURL.schema,"http"))||(!strcasecmp(realURL.schema,"ftp"))){

		/* User agent */
		sprintf(UDM_STREND(request),"User-Agent: %s\r\n",Indexer->Conf->user_agent);

		/* If LocalCharset specified */
		if( Indexer->Conf->local_charset )
			sprintf(UDM_STREND(request),"Accept-charset: %s\r\n",
				UdmCharsetStr(Indexer->Conf->local_charset));

		/* Host Name for virtual hosts */
		sprintf(UDM_STREND(request),"Host: %s\r\n",realURL.hostname);

		/* Auth if required */
		if(CurSrv->basic_auth)
			sprintf(UDM_STREND(request),"Authorization: Basic %s\r\n",
				CurSrv->basic_auth);
		
		/* ProxyAuth if required */
		if(CurSrv->proxy_basic_auth)
			sprintf(UDM_STREND(request),"Proxy-Authorization: Basic %s\r\n",
				CurSrv->proxy_basic_auth);
	}
	/* Add user defined headers */
	strcat(request,Indexer->Conf->extra_headers);

	/* Empty line is the end of HTTP header */
	strcat(request,"\r\n");
#ifdef DEBUG_REQUEST
	fprintf(stderr,"Request:'%s'\n",request);
#endif
	size=UDM_NET_UNKNOWN;

	UDM_THREADINFO(Indexer->Conf->ThreadInfo,Indexer->handle,"Getting",Doc->url);

	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;

	/* If mirroring is enabled */
	if (CurSrv->use_mirror >= 0) {
		char errstr[UDMSTRSIZ]="";
		/* on u_m==0 it returned by mtime from mirrorget */
		/* but we knew that it should be put in mirror  */

		size = UdmMirrorGET(Indexer,
				CurSrv->mirror_root,CurSrv->mirror_headers,
				realURL.schema, realURL.hostname,
				realURL.path, realURL.filename,
				Indexer->buf,CurSrv->use_mirror,errstr);
		if(size>0){
			UdmLog(Indexer,UDM_LOG_DEBUG,"%s has been taken from mirror",Doc->url);
			found_in_mirror=1;
		}else{
			UdmLog(Indexer,UDM_LOG_DEBUG,"%s",errstr);
		}
	}

	/* Get it from the source  */
	/* if no mirror copy found */

	if(!found_in_mirror){
#ifdef UDM_WITH_EXEC_CGI
		if(!strcasecmp(realURL.schema,"exec")){
			size = UdmExecGet(Indexer,Doc,&realURL);
		}
		if(!strcasecmp(realURL.schema,"cgi")){
			size = UdmExecGet(Indexer,Doc,&realURL);
		}
#endif
#ifdef USE_HTDB
		if(!strcasecmp(realURL.schema,"htdb")){
			size = UdmHTDBGet(Indexer,realURL.path,realURL.filename,CurSrv->htdb_list,CurSrv->htdb_doc);
			if(UdmDBErrorCode(Indexer->db))result=IND_ERROR;
		}
#endif
#ifdef USE_FILE
		if(!strcasecmp(realURL.schema,"file")){
			size = UdmFILEGet(Indexer,request);
		}
#endif
#ifdef USE_NEWS
		if((!strcasecmp(realURL.schema,"news"))){
			size = UdmNEWSGet(Indexer,request,realURL.hostname,realURL.port?realURL.port:realURL.default_port);
		}else
		if((!strcasecmp(realURL.schema,"nntp"))){
			size = UdmNNTPGet(Indexer,request,realURL.hostname,realURL.port?realURL.port:realURL.default_port);
		}
#endif
#ifdef USE_HTTP
		if((!strcasecmp(realURL.schema,"http")) ||
		    ((!strcasecmp(realURL.schema,"ftp"))&&(CurSrv->proxy))){
			int get_doc=1;
#ifdef USE_MP3
			if (CurSrv->check_mp3_tag && CurURL.filename[0] !='\0'){
				int is_del=0, i;
				char range[16];
				UDM_HTTP_HDR *http_hdr=NULL;

				sprintf(range, "0-256");
				for( i=0; i<2; i++){
					char *range_request;
					
					http_hdr_free(http_hdr);
					http_hdr=NULL;

					Indexer->maxsize=10240; /* FIXME Why? */
					
					/* Build header with Range field */
					range_request=UdmXmalloc(strlen(request)+ 17 +strlen(range));
					strcpy(range_request,request);
					/* Clear last \r\n */
					range_request[strlen(range_request)-2]='\0';
					sprintf(UDM_STREND(range_request), "Range: bytes=%s\r\n\r\n", range);
					
					/* Get it ! */
					size = UdmHTTPGet(Indexer,range_request,
					    CurSrv->proxy?CurSrv->proxy:realURL.hostname,
					    CurSrv->proxy?CurSrv->proxy_port:(realURL.port?realURL.port:realURL.default_port));

					UDM_FREE(range_request);
					if(size==-1)break;
					
					http_hdr = http_parse_header(Indexer->buf);
					if (http_hdr && http_hdr->status==206){
					        if ((Doc->content = get_id3_tag(http_hdr->body_ptr,
									size-http_hdr->hdr_size))==NULL){
							if (i==0)
								sprintf(range, "-128");
							else{
							/* It's mp3 file but no ID3 tag here */
								is_mp3=1; get_doc=0; is_del=0;
								break;
							}
						}else if (Doc->content == (char*)-1){
							if (i==1){
							/* It's mp3 file but no ID3 tag here */
								is_mp3=1; get_doc=0; is_del=0;
								break;
							}else{
							/* Not mp3 file */
								is_mp3=0; get_doc=1; is_del=1;
								break;
							}
						}else{
							/* It's mp3 file and id3 tag here */
							is_mp3=1; get_doc=0; is_del=0;
						}
					}else if (http_hdr && http_hdr->status==200){
						/* Fix me: Range don't work why? */
						is_mp3=0; get_doc=1; is_del=0;
						break;
					}else{
						/* Some http error */
						is_mp3=0; get_doc=0; is_del=0;
						break;
					}
				}

				if(is_mp3){
					size_t remain_len;
					mp3_size=http_hdr->content_range;
					
					/* Store original content-type */
					if(http_hdr->content_type){
						UdmDBEscStr(Indexer->Conf->DBType,content_type_escaped,http_hdr->content_type);
					}
					memcpy(http_hdr->body_ptr-2, "Content-Type: text/html\r\n\r\n", 27);
					http_hdr->body_ptr += 25;
					remain_len=Indexer->Conf->max_doc_size-http_hdr->hdr_size;
					if(Doc->content && Doc->content != (char*)-1){
						strncpy(http_hdr->body_ptr, Doc->content,remain_len<400?remain_len:400);
						UDM_FREE(Doc->content);
					}else{
						http_hdr->body_ptr[0]='\0';
					}
					size = strlen(Indexer->buf);
				}
				http_hdr_free(http_hdr);
				http_hdr=NULL;
				/*
				if (CurSrv->check_only_mp3_tag && !is_mp3 && is_del){
					result=UdmDeleteUrl(Indexer,Doc->url_id);
					UdmLog(Indexer,UDM_LOG_WARN,"No ID3 tag detected in %s... deleted.",realURL.specific);
					FreeDoc(Doc);
					return(result);
				}
				*/
			}	
#endif
			if (get_doc){
				size = UdmHTTPGet(Indexer,request,
					CurSrv->proxy?CurSrv->proxy:realURL.hostname,
					CurSrv->proxy?CurSrv->proxy_port:(realURL.port?realURL.port:realURL.default_port));
			}
		}
#endif
#ifdef USE_FTP
		if ((!strcasecmp(CurURL.schema,"ftp"))&&(!CurSrv->proxy)){
		    size = UdmFTPGet(Indexer,realURL.hostname,realURL.port?realURL.port:realURL.default_port,
			CurURL.path, CurURL.filename[0]=='\0'?NULL:CurURL.filename, 
			CurSrv->user,CurSrv->passwd,Doc->last_mod_time,
			(Method==UDM_HEAD?1:0));
		}
#endif
	}

	if(!result) /* Extra check for HTDB possible bad result */
	switch(size){
	case UDM_NET_UNKNOWN:
		UdmLog(Indexer,UDM_LOG_WARN,"Protocol not supported");
		status=UDM_HTTP_STATUS_NOT_SUPPORTED;
		result=IND_OK;
		break;
	case UDM_NET_ERROR:
		UdmLog(Indexer,UDM_LOG_WARN,"Network error");
		status=UDM_HTTP_STATUS_TIMEOUT;
		result=IND_OK;
		break;
	case UDM_NET_TIMEOUT:
		UdmLog(Indexer,UDM_LOG_WARN,"Download timeout");
		status=UDM_HTTP_STATUS_TIMEOUT;
		result=IND_OK;
		break;
	case UDM_NET_CANT_CONNECT:
		UdmLog(Indexer,UDM_LOG_WARN,"Can't connect to host %s:%d",
		CurSrv->proxy?CurSrv->proxy:realURL.hostname,
		CurSrv->proxy?CurSrv->proxy_port:(realURL.port?realURL.port:realURL.default_port));
		status=UDM_HTTP_STATUS_UNAVAIL;
		result=IND_OK;
		break;
	case UDM_NET_CANT_RESOLVE:
		UdmLog(Indexer,UDM_LOG_WARN,"Unknown %shost '%s'",
			CurSrv->proxy?"proxy ":"",
			CurSrv->proxy?CurSrv->proxy:realURL.hostname);
		status=UDM_HTTP_STATUS_UNAVAIL;
		result=IND_OK;
		break;
	default:
		if(size<0){	/* No connection */
			UdmLog(Indexer,UDM_LOG_WARN,"Can't connect to host %s:%d",
				CurSrv->proxy?CurSrv->proxy:realURL.hostname,
				CurSrv->proxy?CurSrv->proxy_port:(realURL.port?realURL.port:realURL.default_port));
			status=UDM_HTTP_STATUS_UNAVAIL;
			result=IND_OK;
		}else{
			/* Document has been successfully loaded */
			/* Cut HTTP response header first        */
			{
				char *content_new, *content_old;
				content_new=strstr(Indexer->buf,"\r\n\r\n");
				content_old=strstr(Indexer->buf,"\n\n");
				if ((content_new) && (content_old)) {
				  if ( (content_old-Indexer->buf) < (content_new-Indexer->buf) ) {
				    Doc->content = content_old;
				    Doc->content[0]='\0';
				    Doc->content += 2;
				  } else {
				    Doc->content= content_new;
				    Doc->content[0]='\0';
				    Doc->content += 4;
				  }
				} else if (content_new) {
				  Doc->content = content_new;
				  Doc->content[0]='\0';
				  Doc->content += 4;
				} else if (content_old) {
				  Doc->content = content_old;
				  Doc->content[0]='\0';
				  Doc->content += 2;
				} else Doc->content=content_new;
			}
			if(!Doc->content){
				UdmLog(Indexer,UDM_LOG_ERROR,"Illegal HTTP headers in response");
				status=UDM_HTTP_STATUS_UNAVAIL;
				result=IND_OK;
			}
		}
	}
	/* Check again whether "terminate" was pressed */
	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;

	/* Exit if there was an error while downloading */
	if(result){
		UdmAddServerErr(Indexer->Conf,realURL.hostname);
		if(result!=IND_ERROR)
			result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->net_error_delay_time);
		FreeDoc(Doc);
		return(result);
	}

	/* Let's start parsing */
	Indexer->buf[size]=0;header=Indexer->buf;
	realsize=size;
	status=changed=1;origin=0;
	content_type=location=statusline=NULL;
	Indexer->charset=UDM_CHARSET_USASCII;
	crc32=0;
	
	subj[0]=0;last_mod_time=0;from[0]=0;
	last_mod_time=Doc->last_mod_time;

	do_index=CurSrv->index;
	follow	=CurSrv->follow;

	size-=(Doc->content-Indexer->buf);     /* Could be modified by Content-Length   */
	realsize-=(Doc->content-Indexer->buf); /* Will be safe for crc32, Parsers etc   */

	UDM_THREADINFO(Indexer->Conf->ThreadInfo,Indexer->handle,"Parsing",Doc->url);
	/* Now lets parse response header lines */
	hcopy = strdup(header);
	tok=UdmGetToken(header,"\r\n",&lt);
	while(tok){
		if(!UDM_STRNCASECMP(tok,"HTTP/")){
			status=atoi(tok+8);
			statusline=tok;
		}else
		if ((Indexer->Conf->force1251) && (!UDM_STRNCASECMP(tok,"Server: "))){
			char * sname;
			sname=UdmTrim(tok+7," ");
			if (!UDM_STRNCASECMP(sname,"Microsoft")||
				!UDM_STRNCASECMP(sname,"IIS"))
					Indexer->charset=UDM_CHARSET_CP1251;
		}else
		if(!UDM_STRNCASECMP(tok,"Content-Type:")){
			if (!Indexer->Conf->use_remote_cont_type) {
				content_type=UdmContentType(Indexer->Conf,Doc->url);
			}
			
			if ((Indexer->Conf->use_remote_cont_type) || (content_type == NULL)) {
				char *p;
				
				/* This is hack against bad servers */
				/* which send Content-Type:xxxx     */
				/* instead of Content-type: xxxx    */
				for(tok+=13;*tok==' ';tok++);
				content_type=tok;
				if((p=strstr(content_type,"charset=")))
					Indexer->charset = UdmGetCharset(p + 8);
			}
			
			if ((content_type!=NULL)&&(!content_type_escaped[0])) {
				/* Store content_type right now  */
				/* It can be modified  after     */
				/* possible external parser call */
				UdmDBEscStr(Indexer->Conf->DBType,content_type_escaped,content_type);
			}
		}else
		if(!UDM_STRNCASECMP(tok,"Location: ")){
			location=tok+10;
		}else
		if(!UDM_STRNCASECMP(tok,"Content-Length: ")){
			size=atoi(tok+16);
		}else
		if(!UDM_STRNCASECMP(tok,"Subject: ")){
			char title[UDM_MAXTITLESIZE];
#ifdef NEWS_EXT
			char hs[UDM_MAXSUBJSIZE];
			UDM_STRNCPY(hs,tok+9);
			udm_rfc1522_decode(HeaderSubject,hs);
#endif
			UDM_STRNCPY(title,tok+9);
			udm_rfc1522_decode(subj,title);
			Doc->title=strdup(subj); /* FIXME possible memory leak*/
		}else
		if(!UDM_STRNCASECMP(tok,"From: ")){
			char fromh[UDM_MAXKEYWORDSIZE];
#ifdef NEWS_EXT
			char hf[UDM_MAXFROMSIZE];
			UDM_STRNCPY(hf,tok+6);
			udm_rfc1522_decode(HeaderFrom,hf);
#endif
			UDM_STRNCPY(fromh,tok+6);
			udm_rfc1522_decode(from,fromh);
			
		}else
		if(!UDM_STRNCASECMP(tok,"Newsgroups: ")){
			char keywords[UDM_MAXKEYWORDSIZE];
			UDM_STRNCPY(keywords,tok+12);
			Doc->keywords=strdup(keywords); /* FIXME possible memory leak*/
#ifdef NEWS_EXT
			strncpy(HeaderGroup,tok+12,UDM_MAXGROUPSIZE);
#endif
		}else
		if(!UDM_STRNCASECMP(tok,"Date: ")){
			last_mod_time=UdmHttpDate2Time_t(tok+6);
#ifdef NEWS_EXT
			strncpy(HeaderDate,tok+6,UDM_MAXDATESIZE);
#endif
		}else
#ifdef NEWS_EXT
		if(!UDM_STRNCASECMP(tok,"References: ")){
			strncpy(HeaderRefs,tok+12,UDM_MAXREFSIZE);
		}else
#endif
		if(!UDM_STRNCASECMP(tok,"Last-Modified: ")){
			last_mod_time=UdmHttpDate2Time_t(tok+15);
		}
		UdmLog(Indexer,UDM_LOG_DEBUG,"%s",tok);
		tok=UdmGetToken(NULL,"\r\n",&lt);
	}
	UdmLog(Indexer,UDM_LOG_EXTRA,"%s %s %d",statusline?statusline:"?",content_type?content_type:"?",size);


#ifdef NEWS_EXT
	/* do correct string termination */
	HeaderDate[UDM_MAXDATESIZE-1] = 0;
	HeaderFrom[UDM_MAXFROMSIZE-1] = 0;
	HeaderSubject[UDM_MAXSUBJSIZE-1] = 0;
	HeaderGroup[UDM_MAXGROUPSIZE-1] = 0;
	HeaderRefs[UDM_MAXREFSIZE-1] = 0;
	if((SQLDate=UdmParseDate(HeaderDate))){
		UdmDBEscStr(Indexer->Conf->DBType,HeaderDateEsc,SQLDate);
		free(SQLDate);
	}
	/* Escape Headers */
	UdmDBEscStr(Indexer->Conf->DBType,HeaderFromEsc,HeaderFrom);
	UdmDBEscStr(Indexer->Conf->DBType,HeaderSubjectEsc,HeaderSubject);
	UdmDBEscStr(Indexer->Conf->DBType,HeaderGroupEsc,HeaderGroup);
	UdmDBEscStr(Indexer->Conf->DBType,HeaderRefsEsc,HeaderRefs);
#endif

	switch(UdmHTTPResponseType(status)){
	case 1: /* No HTTP code */
		UdmAddServerErr(Indexer->Conf,realURL.hostname);
		UdmLog(Indexer,UDM_LOG_ERROR,"No HTTP response status");
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->net_error_delay_time);
		break;

	case UDM_HTTP_STATUS_OK:
	case UDM_HTTP_STATUS_PARTIAL_OK:
		if(!content_type){
			UdmLog(Indexer,UDM_LOG_ERROR,"No Content-type in '%s'!",Doc->url);
	                UdmAddServerErr(Indexer->Conf,realURL.hostname);
			result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->net_error_delay_time);
		}
		break;

	case UDM_HTTP_STATUS_REDIRECT: /* We'll try to use Location: xxx instead */

		if((CurSrv->follow!=UDM_FOLLOW_NO)&&(Doc->hops<CurSrv->maxhops)&&(location)){
			int newMethod;
			newMethod=UdmFindFilter(Indexer->Conf,location,reason);
			UdmLog(Indexer,UDM_LOG_DEBUG,"\"%s\" : '%s'",location,reason);
			if(newMethod!=UDM_DISALLOW){
				UDM_URL	newURL;
				int res;
				if(!(res=UdmParseURL(&newURL,location))){
					if(UdmFindServer(Indexer->Conf,location,NULL)){
						if((!CurSrv->delete_no_server)&&(CurSrv->follow!=UDM_FOLLOW_WORLD)){
							/* compare hostinfo in some cases */
							if(!strcmp(CurURL.hostinfo,newURL.hostinfo)){
								if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
								if(UdmAddHref(Indexer->Conf,location,Doc->url_id,Doc->hops+1,0,CurSrv->tag,CurSrv->category))
									Indexer->Conf->have_targets=1;
								if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
							}
						}else{
							if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
							if(UdmAddHref(Indexer->Conf,location,Doc->url_id,Doc->hops+1,0,CurSrv->tag,CurSrv->category))
								Indexer->Conf->have_targets=1;
							if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
						}
					}else{
						UdmLog(Indexer,UDM_LOG_DEBUG,"\"%s\" : no Server command",location);
					}
				}else{
					switch(res){
					case UDM_PARSEURL_LONG:
						UdmLog(Indexer,UDM_LOG_ERROR,"Redirect URL too long: '%s'",location);
						break;
					case UDM_PARSEURL_BAD:
					default:
						UdmLog(Indexer,UDM_LOG_ERROR,"Error in redirect URL: '%s'",location);
					}
				}
			}
		}
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		break;

	case UDM_HTTP_STATUS_NOT_MODIFIED:  /* Not Modified, nothing to do */
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		break;

	case UDM_HTTP_STATUS_DELETE:
		/* delete it if not robots.txt */
		UdmLog(Indexer,UDM_LOG_EXTRA,"Deleting URL");
		if(!strcmp(CurURL.filename,"robots.txt")){
			result=UdmDeleteRobotsFromHost(Indexer,CurURL.hostinfo);
			if(result==IND_OK)result=UdmLoadRobots(Indexer);
		}
		if(result!=IND_ERROR){
			if(CurSrv->deletebad){
				result=UdmDeleteUrl(Indexer,Doc->url_id);
			}else{
				result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
			}
		}
		break;

	case UDM_HTTP_STATUS_RETRY: /* We'll retry later, maybe host is down */
                UdmAddServerErr(Indexer->Conf,realURL.hostname);
		UdmLog(Indexer,UDM_LOG_EXTRA,"Could not read URL");
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->net_error_delay_time);
		break;

	default: /* Unknown status, retry later */
		UdmLog(Indexer,UDM_LOG_WARN,"HTTP %d We don't yet know how to handle it, skipped",status);
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
	}

	/* Check again whether "terminate" pressed */
	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;

	/* Return if Content parsing is not required */
	if(result){
		FreeDoc(Doc);
		UDM_FREE(hcopy);
		return(result);
	}

	/* Now we have HTTP_OK and know Content-Type */
	/* Lets start parse the body                 */

	/* Put into mirror if required */
	if ((CurSrv->use_mirror>=0)&&(!found_in_mirror)){
		char errstr[UDMSTRSIZ]="";
		/* I tried to get it from mirror, but nothing was there  */
		/* or it was expired. Now we will store it on the mirror */
		if(UdmMirrorPUT(Indexer,
				CurSrv->mirror_root,CurSrv->mirror_headers,
				realURL.schema, realURL.hostname,
				realURL.path, realURL.filename,
				hcopy, Doc->content, size,errstr)){
			UdmLog(Indexer,UDM_LOG_DEBUG,"%s",errstr);
		}
		
	}
	UDM_FREE(hcopy);


#ifdef USE_PARSER
	/* Let's try to start external parser for this Content-Type */
	if((mime=UdmExecParser(Indexer,content_type,&mimeno,Doc->content,(size_t)realsize,(size_t)(Indexer->Conf->max_doc_size-(Doc->content-Indexer->buf)),Doc->url))){
		char * to_charset;
		Doc->content=mime;
		content_type=Indexer->Conf->parsers[mimeno].to_mime;
		UdmLog(Indexer,UDM_LOG_DEBUG,"Parser-Content-Type: %s",
			content_type?content_type:"unknown");

		if((to_charset=strstr(Indexer->Conf->parsers[mimeno].to_mime,"charset="))){
			Indexer->charset=UdmGetCharset(to_charset+8);
			fprintf(stderr,"to_charset='%s'\n",to_charset+8);
		}
#ifdef DEBUG_PARSER
		fprintf(stderr,"content='%s'\n",Doc->content);
#endif
	};
#endif

	/* robots.txt */
	if(!UDM_STRNCASECMP(content_type,"text/plain")&&
	(!strcmp(CurURL.filename,"robots.txt"))){
		if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_ROBOTS);
		result=UdmParseRobots(Indexer,&Indexer->Conf->Robots,Doc->content,CurURL.hostinfo);
		if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_ROBOTS);

		if(result!=IND_ERROR)result=UdmLoadRobots(Indexer);
		if(result==IND_ERROR){
			FreeDoc(Doc);
			return(IND_ERROR);
		}
	}else

	/* plain TEXT or the same */
	if(!UDM_STRNCASECMP(content_type,"text/plain")||
		!UDM_STRNCASECMP(content_type,"text/tab-separated-values")||
		!UDM_STRNCASECMP(content_type,"text/css"))
	{
		if(Indexer->charset!=Indexer->Conf->local_charset){
			/* Set charset from CurSrv configuration */
			/* if it was not detected before         */
			if(!Indexer->charset){
				Indexer->charset=UdmGetCharset(CurSrv->charset);
			}
			UdmRecode(Doc->content,Indexer->charset,Indexer->Conf->local_charset);
		} else {
			int DCindex=0;
#ifdef USE_CHARSET_GUESSER
			DCindex = UdmGuessCharset(Doc->content, UdmGetCharset(CurSrv->charset));
#else
			DCindex = UdmGetCharset(CurSrv->charset);
#endif
			UdmRecode(Doc->content,DCindex,Indexer->Conf->local_charset);
		}
		if(Method!=UDM_HEAD){
			crc32=UdmCRC32(Doc->content, (size_t)realsize);
			changed=!(crc32==Doc->crc32);
			if(CurSrv->use_clones){
				origin=UdmFindOrigin(Indexer, crc32, size);
				origin=((origin==Doc->url_id)?0:origin);
			}
		}
		if(((do_index)&&(!origin)&&(changed))||reindex){
			if(!CurSrv->check_only_mp3_tag){
				char text[UDM_MAXTEXTSIZE];
				strncpy(text,Doc->content,UDM_MAXTEXTSIZE-2);
				text[UDM_MAXTEXTSIZE-1]=0;
				Doc->text=strdup(text);
				UdmParseText(Indexer,CurSrv,Doc->content,CurSrv->bodyweight,1);
			}
		}
	}else

	/* HTML text */
	if(!UDM_STRNCASECMP(content_type,"text/html")){

		if(Method!=UDM_HEAD){
			crc32=UdmCRC32(Doc->content, (size_t)realsize);
			changed=!(crc32==Doc->crc32);
			if(CurSrv->use_clones){
				origin=UdmFindOrigin(Indexer, crc32, size);
				origin=((origin==Doc->url_id)?0:origin);
			}
		}

		if(((do_index||(follow!=UDM_FOLLOW_NO))&&(!origin)&&(changed))||reindex){
		        Indexer->nlangs = 0;
			UdmSelectLang(Indexer, CurSrv->lang);
			UdmParseHtml(Indexer,CurSrv,&CurURL,Doc);
		}
	}else{
		/* Unknown Content-Type */
		if(Method!=UDM_HEAD){
			crc32=UdmCRC32(Doc->content, (size_t)realsize);
			changed=!(crc32==Doc->crc32);
			if(CurSrv->use_clones){
				origin=UdmFindOrigin(Indexer, crc32, size);
				origin=((origin==Doc->url_id)?0:origin);
			}
		}
	}

	if (strcmp(CurURL.filename,"robots.txt")||is_mp3||!CurSrv->check_only_mp3_tag){
		char str[UDMSTRSIZ];
		if(CurSrv->urlweight) {
			UdmUnescapeCGIQuery(str,Doc->url);
			UdmParseText(Indexer,CurSrv,str,CurSrv->urlweight,1);
		} else {
			if(CurSrv->urlhostweight) {
				strcpy(str,CurURL.hostname);
				UdmParseText(Indexer,CurSrv,str,CurSrv->urlhostweight,1);	
			}
			if(CurSrv->urlpathweight) {
				UdmUnescapeCGIQuery(str,CurURL.path);
				UdmParseText(Indexer,CurSrv,str,CurSrv->urlpathweight,1);
			}
			if(CurSrv->urlfileweight) {
				UdmUnescapeCGIQuery(str,CurURL.filename);
				UdmParseText(Indexer,CurSrv,str,CurSrv->urlfileweight,1);
			}
		}   
	}
	
	
	UDM_THREADINFO(Indexer->Conf->ThreadInfo,Indexer->handle,"Updating",Doc->url);
	if(origin){
		UdmLog(Indexer,UDM_LOG_EXTRA,"Duplicate Document with #%d",origin);
		result=UdmDeleteWordFromURL(Indexer,Doc->url_id);
		if((Indexer->Conf->use_crossword)&&(Indexer->Conf->DBMode!=UDM_DBMODE_CACHE)){
			if(result==IND_OK)result=UdmDeleteCrossWordFromURL(Indexer,0,Doc->url_id);
			if(result==IND_OK)result=UdmDeleteCrossWordFromURL(Indexer,Doc->url_id,0);
		}
		if(result==IND_OK)result=UdmUpdateClone(Indexer,Doc->url_id,status,CurSrv->period,content_type_escaped,last_mod_time,crc32);
	}else
	if((!changed)&&(!reindex)){
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
	}else{
		int site_id;
		char site_id_str[UDM_URLSIZE];
		
		if(subj[0]){
			char str[UDMSTRSIZ];
			strcpy(str,subj);
			UdmParseText(Indexer,CurSrv,str,CurSrv->titleweight,1);
		}
		sprintf(site_id_str,"%s://%s/",CurURL.schema,CurURL.hostinfo);
		site_id=UdmStrCRC32(site_id_str);
#ifdef DEBUG_SITE_ID
		fprintf(stderr,"SIDE_ID '%s' -> %08X\n",site_id_str,site_id);
#endif
		
		if(result!=IND_ERROR){
			if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
			if(UdmStoreHrefs(Indexer))
				Indexer->Conf->have_targets=1;
			if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
			if(UdmDBErrorCode(Indexer->db))
				result=IND_ERROR;
			else
				result=IND_OK;
		}	
		if(result!=IND_ERROR){
			if((!CurSrv->check_only_mp3_tag)||(is_mp3)){
				if(result!=IND_ERROR)result=UdmStoreWords(Indexer,Doc->url_id,site_id,CurSrv->category?CurSrv->category:"",CurSrv->tag?CurSrv->tag:"",Doc->status);
				/* Store URLs from cache into database */
				/* This is for StoreCrossWords here    */
				if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_LOCK,UDM_LOCK_TARGET);
				if(UdmStoreHrefs(Indexer))Indexer->Conf->have_targets=1;
				if(UdmDBErrorCode(Indexer->db))result=IND_ERROR;
				if(Indexer->Conf->LockProc)Indexer->Conf->LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
				if(Indexer->Conf->use_crossword&&Indexer->Conf->DBMode!=UDM_DBMODE_CACHE){
					if(result!=IND_ERROR)result=UdmStoreCrossWords(Indexer,Doc->url_id);
				}
			}
		}
		if(result!=IND_ERROR){
			int total,i;
			char str[UDMSTRSIZ]="";
			char lang[3]="";
			char *s;

			/* Prepare text,keywords,description */
			if(!strcasecmp(realURL.schema,"news"))
				Doc->keywords=strdup(from);

			if((s=Doc->text)){
#ifndef NEWS_EXT			
				while(*s){
					if(strchr("\t\n\r",*s))*s=' ';s++;
				}
#endif
				if(strlen(Doc->text)>=UDM_MAXTEXTSIZE)
					Doc->text[UDM_MAXTEXTSIZE-1]=0;
				UdmDBEscStr(Indexer->Conf->DBType,text_escaped,Doc->text);
			}

			if((s=Doc->keywords)){
				while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
				if(strlen(Doc->keywords)>=UDM_MAXKEYWORDSIZE)
					Doc->keywords[UDM_MAXKEYWORDSIZE-1]=0;
				UdmDBEscStr(Indexer->Conf->DBType,keywords_escaped,Doc->keywords);
			}

			if((s=Doc->description)){
				while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
				if(strlen(Doc->description)>=UDM_MAXDESCSIZE)
					Doc->description[UDM_MAXDESCSIZE-1]=0;
				UdmDBEscStr(Indexer->Conf->DBType,descript_escaped,Doc->description);
			}

			if((s=Doc->title)){
				while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
				if(strlen(Doc->title)>=UDM_MAXTITLESIZE)
					Doc->title[UDM_MAXTITLESIZE-1]=0;
				UdmDBEscStr(Indexer->Conf->DBType,title_escaped,Doc->title);
			}

			if (strcmp(CurSrv->lang,"")) {
				strncpy(lang,CurSrv->lang,2);
			}

			/* Guess the language */
			qsort((void*)(Indexer->lang+1),UDM_LANGPERDOC-1,sizeof(UDM_LANG),cmplang);
			total=0;
			for(i=1;i<UDM_LANGPERDOC;i++)
				total+=Indexer->lang[i].count;
			if(total){
				sprintf(str,"total: %d",total);
				for(i=1;i<UDM_LANGPERDOC;i++){
					if(Indexer->lang[i].count)
						sprintf(UDM_STREND(str)," %s:%d (%d%%)",
						Indexer->lang[i].lang,
						Indexer->lang[i].count,
						(int)(Indexer->lang[i].count*100/total));
				}
				UdmLog(Indexer,UDM_LOG_DEBUG,"language: %s",str);
				if(Indexer->lang[1].count>5){
					strncpy(lang,Indexer->lang[1].lang,2);
					lang[2]=0;
				} 
			}
#ifdef NEWS_EXT
			/*
			extract message id from url
			thanks guys, the message id is in realURL.filename !!
			valid message ids have an @ character
			*/

			if(strchr(realURL.filename,'@'))
				UdmDBEscStr(Indexer->Conf->DBType,MessageIdEsc,realURL.filename);
						
			/* get rec_id from my parent out of db (if I have one...) */

			if(strlen(HeaderRefs))
			{
				/* HeaderRefs contains all message ids of my predecessors, space separated*/
				/* my direct parent is the last in the list*/
				if((parent = strrchr(HeaderRefs,' ')))
				{	
					/* parent now points to the space character */
					/* skip it */
					++parent;
				}
				else
				{
					/* there is only one entry in HeaderRefs, so this is my parent */
					parent=HeaderRefs;	
				}	
			}
			result = 0;
			/* if the parent is really a message id, */
			/* get its rec_id from database          */
			if(parent && strlen(parent) && strchr(parent,'@'))
				result = UdmFindMsgID(Indexer,parent);	
			/*
			now register me with my parent
			result is -1 if no parent was found
			*/
#ifdef HEIKODEBUG
			fprintf(stderr,"%s, me: %d, parent: %d\n",parent,Doc->url_id,result);
#endif
			if(result > 0)
				result = UdmRegisterChild(Indexer,result,Doc->url_id);

			result=UdmLongUpdateUrl(Indexer,Doc->url_id,status,
				changed,size,CurSrv->period,
				CurSrv->tag?CurSrv->tag:"",
				last_mod_time,
				text_escaped,title_escaped,
				content_type_escaped,keywords_escaped,
				descript_escaped,crc32,lang,
				CurSrv->category?CurSrv->category:"",
				HeaderDateEsc,
				HeaderSubjectEsc,
				HeaderFromEsc,
				HeaderGroupEsc,
				HeaderRefsEsc,
				MessageIdEsc);
#else
			result=UdmLongUpdateUrl(Indexer,Doc->url_id,status,
				changed,is_mp3?mp3_size:size,CurSrv->period,
				CurSrv->tag?CurSrv->tag:emptystr,last_mod_time,
				text_escaped,title_escaped,
				content_type_escaped,keywords_escaped,
				descript_escaped,crc32,lang,
				CurSrv->category?CurSrv->category:emptystr);
#endif			
		}
	}
	UdmFreeWords(Indexer);
	UdmFreeCrossWords(Indexer);
	FreeDoc(Doc);
	return(result);
}


/************ Misc functions *****************************/



/*
 * Thin public wrapper: passes Indexer straight to UdmClearDB()
 * and returns whatever that call returns, unchanged.
 */
__INDLIB__ int UdmClearDatabase(UDM_AGENT * Indexer){
	int rc;

	rc=UdmClearDB(Indexer);
	return rc;
}


/*
 * Release a result set together with everything it owns.
 *
 * For each of the Res->num_rows entries in Res->Doc the string
 * members (url, content_type, title, text, keywords, description,
 * category) are passed to UDM_FREE, then the Doc array itself,
 * and finally Res.
 *
 * A NULL Res is accepted and does nothing. When Res->Doc is NULL
 * only Res itself is released.
 */
__INDLIB__ void UdmFreeResult(UDM_RESULT * Res){
	size_t row=0;

	/* Nothing to release */
	if(!Res)return;

	if(Res->Doc){
		/* Per-document strings go first, the array holding them last */
		while(row<Res->num_rows){
			UDM_FREE(Res->Doc[row].url);
			UDM_FREE(Res->Doc[row].content_type);
			UDM_FREE(Res->Doc[row].title);
			UDM_FREE(Res->Doc[row].text);
			UDM_FREE(Res->Doc[row].keywords);
			UDM_FREE(Res->Doc[row].description);
			UDM_FREE(Res->Doc[row].category);
			row++;
		}
		UDM_FREE(Res->Doc);
	}

	UDM_FREE(Res);
}
