#!/usr/local/bin/perl
#
# $Header: /home/vikas/src/nocol/perlnocol/RCS/hostmon,v 1.10 1999/10/31 17:01:08 vikas Exp $
#
# Server module for 'hostmon' - monitoring Unix host conditions using NOCOL.
# Connects/gets data from the hostmon-client programs (via telnet or rcp)
# and dumps into NOCOL format.
#
# You need to run the client modules (hostmon-client) on the remote hosts
# being monitored and add the name of the host that runs this server
# program to the '@permithosts' array in the clients.
#
# AUTHOR:  Vikas Aggarwal, vikas@navya.com
#
#	Copyright 1994 Vikas Aggarwal, vikas@navya.com
#
# No warranty is expressed or implied. Permission to copy and use is
# extended to all. Permission to redistribute is granted under the
# following conditions: it is not sold for profit; this copyright
# notice remains intact; the same permissions extend to the recipient;
# and if any changes are made, a notice is added so stating.

#####################
#
# Command Format:
#
#  hostmon
#
#    Automatically kills old process and forks a new one, reading
#    the configuration file in the process.
#
# What it does:
#
#    hostmon is the 'master' server that periodically connects to the
#    hostmon_port of all the Unix machines listed in its configuration
#    file and compares the variable values returned with the thresholds
#    for the variables listed in its config file. It then sets the
#    severity of the variable (NOCOL style) and dumps the output to the
#    NOCOL data-file.
#
#    It can monitor practically *any* variable that has a numeric value.
#    Thresholds can be increasing or decreasing (accordingly the event
#    is flagged if the value exceeds or drops past the thresholds).
#
# Caveat:
#    If multiple variable values exist at any level (e.g. df output is
#    critical for 3 disks for a host), only the first one is indicated
#    (there is no way to list the value of df for all three disks in
#    NOCOL).
#
## Structures used in the program:
#    @thress[] => array of var, host, thresholds read in from config file
#    %thresindex{var} => assoc array of indexes into the @thress array
#    %curvar{$host} =>  var's current  status, value, thres, maxseverity,
#			address, unit
#    %prevdatatime{$host} =>  time of previous data per host
#    %vardatalines => number of lines per variable read in current hosts data
#			Used to extract the worst data value (most critical).
#    %isknownvar{$var} => quick boolean to see if variable is known.
#

#
##
#
# Customization:
#
#	Edit '$ping' and subroutine 'doping' for properly 'pinging' a site.
#	Also does an 'rpcping' (under nocol/support/), so install this
#	program or else change the &rpcping routine to always return '1'.
#
#	Put the IP address/name of the host running this program in the
#	hostmon-client.daemon program (in the @permithosts array). The
#	client only allows data retrieval via telnet from the hosts listed
#	in the @permithosts structure.
#######

############################
## Variables customization #  overrides values in the nocollib.pl library
############################
# Name of the per-host "is the hostmon data usable" event; it is written
# as the first event of every host in the NOCOL datafile.
local ($progvar) = "HostmonData" ;	# Indicator if hostmon-data is good.
# Scratch directory; each host's fetched data is stored in
# $TMPDATADIR/<host>.hostmon before being parsed.
local ($TMPDATADIR) = "/tmp/hostmon";	# All host data files under here.

#########################################
# $debug: any true value prints progress messages on STDERR, values > 1
# additionally print the extended per-line/per-variable dumps.
$debug = 0;
$libdebug = 0;			# toggles on getting SIGUSR1 signal

# $nocolroot = "./";		# for testing/debugging
# The library is loaded AFTER the settings above and BEFORE the defaults
# below, so the 'unless' defaults only apply to values it left unset.
require  "nocollib.pl";

## the following variables can actually be set in nocollib.pl or changed here.
#
	
# Following service name is used if set in /etc/services file
$HOSTMON_SERVICE = "hostmon" unless $HOSTMON_SERVICE ;
$HOSTMON_PORT = 5355 unless $HOSTMON_PORT; # default port if not in services

				# Check ping and tweak syntax in sub 'doping()'
# If no ping program was configured, take whatever 'which ping' prints
# (chop removes the trailing newline of the command output).
if (!$ping) {$ping = `which ping`; chop $ping;}  # SET_THIS
$rpcping = "rpcping" unless $rpcping ;		# set in nocollib.pl

# Remote-copy command used as the fallback way of fetching host data.
# When it matches /scp/ the main loop skips the ping and rpcping probes.
$RCP = "rcp";		# or scp
#$RCP = "scp -q -B";	# scp in quiet & batch mode

    # The sleeptime needs to be larger than the hostmon.client's so that
    # it gets updated fresh data each pass.
    # (A POLLINTERVAL line in the config file overrides this in readconf.)
$sleeptime=(60*15);		# default sleeptime of 15minutes

#########################################

# Start-up sanity checks: abort if the scratch directory cannot be
# created or written, or if the probe programs are not executable.
# NOTE: -x tests the pathname exactly as given (no $PATH search), so
# $rpcping and $ping must be usable paths, not just command names.
-d $TMPDATADIR || mkdir($TMPDATADIR, 0700) || die("Cannot create $TMPDATADIR");
-w $TMPDATADIR || die("Cannot write to $TMPDATADIR");
-x $rpcping || die("Cannot find $rpcping") ;
-x $ping || die("Cannot find $ping");


$prognm = $0 ;			# save program name
# Turn on autoflush for both STDERR and STDOUT; STDOUT is left selected.
select (STDERR); $| = 1;
select (STDOUT); $| = 1 ;	# set unbuffered

# Global handle of the connection to the current host; alrm_handler
# closes it to break out of a hung read.
local ($rsock) = (-1);		# remote host's telnet socket

##
# ping a host to check if it is up and running. Might need some
# tweaking to adjust for different 'ping' styles on different systems.
# Return 0 if down, 1 if up.
#
# Argument: $rhost - name/address of the host to ping.
# Globals : $ping   - ping program (defaults to "ping" when unset),
#           $ostype - cached output of 'uname -s -r -m' (filled in on
#                     the first call), $debug.
#
# Four packets of 100 bytes are sent; the host counts as up when the
# reported packet loss is below 50%.
#
# Can handle the following syntaxes so far:
#     ping host pktsize count           # HPUX & Ultrix
#     ping -s host pktsize count        # SunOS & Solaris
#     ping -c count -s pktsize host     # all others ?
sub doping {
    local ($rhost) = @_ ;
    local ($ping) = "ping" unless $ping;
    local ($value) = 0 ;	          # 1 for up, 0 for down
    local ($cmd);			  # full ping command line

    $ostype= `uname -s -r -m`  unless $ostype;	# OS, revision, arch

    # Pick the command line syntax for this ping flavour.
    if ($ping =~ /multiping/) {
	$cmd = "$ping -s 100 -c 4 $rhost";
    }
    elsif ($ostype =~ /HP-UX/ || $ostype =~ /ULTRIX/) {
	$cmd = "$ping $rhost 100 4";
    }
    elsif ($ostype =~ /SunOS\s+4/ || $ostype =~ /SunOS\s+5/) {
	$cmd = "$ping -s $rhost 100 4";
    }
    else {
	$cmd = "$ping -s 100 -c 4 $rhost";
    }

    # The command goes through the shell on purpose: $ping may carry
    # its own options. $rhost comes from the (trusted) config file.
    if (!open (CMD, "$cmd |")) {
	print STDERR "doping: cannot run '$cmd': $!\n";
	return ($value);
    }

    # PING output= 4 packets transmitted, 3 packets received, 25% packet loss
    # Some ping versions print a fractional figure ("0.0% packet loss"),
    # so accept an optional decimal part as well.
    while (<CMD>) {
	if ( /\s+(\d+\.?\d*)%\s+packet\s+loss/) { 
	    if ($1 < 50) { $value = 1; } # if 1 lost, then 25%
	    last;
	}
    }		# end: while(CMD)
    close (CMD);

    $debug && print STDERR "(dbg) doping return for $rhost =$value\n" ;
    return ($value);

}	# end doping()


## Probe the RPC portmapper of a host with the external $rpcping program
## (5 second timeout option). Returns 1 as soon as an output line
## contains 'running', otherwise 0.
##
sub rpcping {
    local ($rhost) = @_ ;
    local ($rpcping) = "rpcping" unless $rpcping;
    local ($value) = 0 ;	          # 1 for up, 0 for down

    open (CMD, "$rpcping -t 5 $rhost |");
    while (<CMD>) {
	next unless /running/;
	$value = 1 ;
	last ;
    }
    close (CMD);

    if ($debug) {
	print STDERR "(dbg) rpcping return for $rhost =$value\n" ;
    }
    return ($value);

}	# end rpcping()


## read configuration file (named by the global $cfile)
##
#   POLLINTERVAL & STARTHOSTS are keywords.
#   <variable>  <host regex>  <warn-thres> <err-thres> <crit-thres> <comment>
#
# Takes no arguments. Dies if the file cannot be opened. Fills in these
# globals:
#   $sleeptime      - from POLLINTERVAL (values < 60 are taken as minutes)
#   @thress         - one tab-separated record per variable line:
#                     var, host-regex, wthres, ethres, cthres, comment-regex
#   %thresindex     - per variable, the ':'-terminated list of indexes
#                     into @thress (e.g. "0:3:")
#   %isknownvar     - per variable, number of config lines seen
#   @hosts          - every first word of the lines after STARTHOSTS
#                     ('localhost' is used when none are listed)
#   %curvar         - initial status record for $progvar and each variable
#   $numknownvars   - number of distinct variables
#
sub readconf {
    local ($starthosts, $i) = (0, 0);
    local ($var, $re, $wthres, $ethres, $cthres, $cregex); # fields of a line
    $numknownvars = 0 ;

    $debug && print STDERR "Config file= $cfile\n" ;
    open (CONFIG, "< $cfile") || 
	die ("Couldn't open config $cfile, exiting");
    while (<CONFIG>)
    {
	# chomp (not chop): a last line without a trailing newline must
	# not lose its final character (e.g. a truncated hostname).
	chomp;
	if( /^\s*#/ || /^\s*$/ ) {next;} # skip comments & blank lines

	if ( !$starthosts && /^STARTHOSTS/ ) { 
	    $starthosts = 1; next ;  # all other lines are hostnames
	}
	
	if ($starthosts) {
	    if ( /^\s*(\S+)\s*.*$/ )  { push (@hosts, $1); }
	    else { print STDERR "Illegal host line $_, skipping\n" ;}
	    next ;
	}

	if ( /^POLLINTERVAL\s(\d+)/i )  { 
	    if ($1 < 60) {$sleeptime = $1 * 60; } # assume minutes
	    else {$sleeptime = $1; }
	    next;
	}

	# here if reading a variable line:
	#	 VAR host-regex  wthres ethres cthres  [reg exp]
	# Permit negative integers for 2nd/3rd vals for always failing...
	if ( /^(\S+)\s+(\S+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s*(.*)$/ )
	{
	    ($var, $re, $wthres, $ethres, $cthres, $cregex) =
		($1, $2, $3, $4, $5, $6);
	    if ($re eq '*' || $re eq '+') { $re = '.+' } ; # convert '*' => '.+'

	    # Trailing blanks (or a stray CR) are not part of the comment
	    # regex; left in, the regex could never match a data comment.
	    $cregex =~ s/\s+$//;

	    # save the thresholds
	    push (@thress,
		  join ("\t", $var, $re, $wthres, $ethres, $cthres, $cregex));
	    $thresindex{$var} .= "$i:" ; ++$i ;  # save location in @thress array
	    ++$isknownvar{$var};
	    next ;
	}

	# here if bad line
	print STDERR "Bad config line, ignoring- $_\n" ;
    }		# end: while (CONFIG)
    close (CONFIG);

    if ($#hosts < 0) {    # $#hosts is -1 when the array is empty
	print STDERR "No hosts in the config file, setting to 'localhost'\n";
	push (@hosts, "localhost"); # hope its set to the loopback 127.0.0.1
    }
    ## The trailing ':' in the $thresindex lists is harmless: split()
    ## drops the resulting trailing empty field.

    # Store initial values in the %curvar array for
    #   isok value thres maxseverity comment(addr) unit
    $curvar{$progvar} = "1\t0\t1\t$E_INFO\tUnInit\tUnSet" ;
    foreach ( keys %thresindex ) {
	$curvar{$_} =   "1\t0\t1\t$E_INFO\tUnInit\tUnSet" ;
	++$numknownvars ;
    }

    if ($debug > 1) {	# extended debugging
	foreach (keys %thresindex) {
	    print STDERR "(dbg) thresindex{$_}  = $thresindex{$_}\n";
	}
    }
    if ($debug) {
	print STDERR "(dbg) Total variables= $numknownvars\n" ;
	print STDERR "(dbg) Hosts are: ";
	foreach (@hosts) {  print STDERR "$_  "; }
	print STDERR "\n";
	print STDERR "(dbg) Poll/Sleep time= $sleeptime secs\n";
	print STDERR "(dbg) Threshold table is:\n";
	foreach (0..$#thress) { 
	    print STDERR "\t thress[$_] = $thress[$_]\n" ;}
    }

}	# readconf ()

## Foreach data line, see which config line it relates to and compare
## the thresholds.
##
## Argument: $hdfile - path of the file holding one host's data.
## Each line is 'var value units [comment]' and is handed to &do_line.
## The first line must be a TIME line: parsing stops as soon as
## $okdatatime is (still) false after a line has been processed.
## Resets $okdatatime and the per-variable counts in %vardatalines.
## An unreadable file is reported and leaves $okdatatime at 0.
##
sub do_datafile {
    local ($hdfile) = @_ ;

    $okdatatime = 0 ;	# the first data line should be a TIME line

    # zero out the count of datalines for each variable
    foreach (keys %thresindex) { $vardatalines{$_} = 0; }

    if (!open (DFILE, "< $hdfile")) {
	print STDERR "Error: cannot open data file $hdfile: $!\n";
	return ;
    }
    while (<DFILE>)
    {
	# chomp (not chop): the last line may lack a newline and must not
	# lose the final character of its unit/comment field.
	chomp ;			# This is the *dataline*
	# var value units [comment]   ($4 is empty when there is no comment)
	if (/^\s*(\S+)\s+(\S+)\s+(\S+)\s*(\S*)\s*$/)
	{
	    ($debug > 1) && print STDERR "(dbg) DataLine = $_\n" ;
	    &do_line($1, $2, $3, $4) ;
	}
	else { 
	    print STDERR "Error: unknown input data line syntax- $_\n";
	}

	if (!$okdatatime) { last ;} # bad data time

    }		# end: while(DFILE)
    close (DFILE);

}	# end: do_datafile()

## Handle one parsed data line (variable name, value, unit, comment).
##
## A TIME line only sets $okdatatime from &check_time. For any other
## variable the first matching threshold entry (host regex and, when
## present, comment regex) is evaluated with &calc_status and the worst
## result seen so far for that variable is kept in %curvar as
##   isok value thres maxseverity comment unit   (tab separated).
## Unknown variables and variables without a matching ('default') config
## entry are reported once on STDERR and otherwise ignored.
##
sub do_line {
    local ($dvname, $dvalue, $dunit, $dcomment) = @_ ;
    local ($isok, $varthres, $maxsev) = (0, 0, $E_CRITICAL);
    local ($i);
    local ($matched) = 0;
    local ($newrec);		# status record built from this dataline

    if ($dvname eq "TIME") {
	$okdatatime = &check_time($dvalue) ;
	return;
    }

    unless ($isknownvar{$dvname}) {
	# print out error message only ONCE
	unless ($isunknownvar{$dvname}) {
	    print STDERR "do_line: ERROR unknown variable in data for host $host - $_\n" ;
	    $isunknownvar{$dvname}++ ;
	}
	return ;
    }

    # Walk the threshold entries of this variable in config-file order.
    # An entry applies when its host regex matches the current host and
    # its comment regex is empty (the 'default') or matches the comment
    # of the dataline. Only the first applicable entry is used.

    if ($debug > 1) {
	print STDERR "(dbg)thresindex{$dvname}= $thresindex{$dvname}\n";
    }

    foreach $i  (split(/:/, $thresindex{$dvname}))
    {
	local ($junk, $host_regex, $t1,$t2,$t3, $regex) = 
	    split (/\t/, $thress[$i]);
	if ($debug > 1) {
	    print STDERR "(dbg) check vs. thress[$i] = $thress[$i]\n";
	}

	# '|' is the pattern delimiter so that filename slashes are fine
	next unless ($host =~ m|$host_regex|i);
	next unless ($regex eq '' || $dcomment =~ m|$regex|i);

	if ($debug > 1) {
	    print STDERR "(dbg) Matched- thress[$i], $thress[$i]\n";
	}
	($isok, $varthres, $maxsev)= &calc_status ($dvalue, $t1, $t2, $t3);
	$matched++ ;
	last ;	# first match wins
    }	# end foreach()

    if ($matched <= 0)	# no matches, missing default config
    {
	unless ($nodefaultvar{$dvname}) { # warning first time only
	    print STDERR "do_line: ERROR- no 'default' config for $dvname\n";
	    $nodefaultvar{$dvname}++;
	}
	return ;
    }
    $vardatalines{$dvname}++;

    ## Keep the worst of all the status's in all lines with the same
    ## variable name.

    $newrec = join ("\t", $isok, $dvalue, $varthres, $maxsev,
		    $dcomment, $dunit);

    if ($vardatalines{$dvname} == 1)  {  # first dataline for this variable
	if ($debug > 1) {
	    print STDERR "(dbg) Initing curvar{$dvname}\n";
	}
	$curvar{$dvname} = $newrec;
    }
    else			# compare with old values, keep worst
    {
	local ($tisok,$tval,$tvarthres,$tmaxsev,$tdcomment,$tunit) =
	    split(/\t/, $curvar{$dvname}) ;

	if ( $isok < $tisok  || $maxsev < $tmaxsev ) {
	    $curvar{$dvname} = $newrec;
	    if ($debug > 1) {
		print STDERR "(dbg) New curvar{$dvname} values= $curvar{$dvname}\n";
	    }
	}
	elsif ((!$isok) && ($maxsev == $tmaxsev) ) {  # append comment for info
	    $tdcomment = "$tdcomment+$dcomment";
	    $curvar{$dvname} = join ("\t", $tisok, $tval, $tvarthres,
				     $tmaxsev, $tdcomment, $tunit);
	    if ($debug > 1) {
		print STDERR "(dbg) New curvar{$dvname} values= $curvar{$dvname}\n";
	    }
	}
	else {
	    if ($debug > 1) {
		print STDERR "(dbg) curvar{$dvname} not changed\n";
	    }
	}
    }

}	# end: do_line()
	    
## Compare the timestamp of the data just read with the one remembered
## for the current $host. Returns 0 when it is numerically unchanged,
## otherwise remembers the new timestamp and returns 1.
#
sub check_time {
    local ($datatime) = @_ ;

    if ($debug > 1) {
	print STDERR "checking old time $prevdatatime{$host} vs. $datatime\n";
    }

    return (0) if ($prevdatatime{$host} == $datatime);

    $prevdatatime{$host} = $datatime ;
    return (1) ;
}

##
## Fills the first EVENT structure ($progvar = HostmonData) with the reason for
## data not being available for a host, and skips past all other variables
## for a host by seek-ing the input and output data files. Uses $progvar
## as the index. Underlying assumption that this is the first EVENT for
## any host (thus skipping).
##
## Arguments:
##   $taddr  - reason text, stored as the event's site address
##   $tvunit - text stored as the event's variable units
## Uses the already open IEVENTS/OEVENTS handles and must be called when
## both are positioned at the start of the current host's events.
#
sub skip_host {
    local ($taddr, $tvunit) = @_ ;
    # Bytes occupied by the remaining (per-variable) events of this host.
    # NOTE(review): $levent is presumably the size of one event record as
    # set by nocollib.pl - confirm there.
    local ($skipbytes) = int($numknownvars * $levent) ;

    # Read the host's existing $progvar event from the datafile.
    &readevent (IEVENTS, $progvar) ;

    $siteaddr{$progvar} = $taddr ; $varunits{$progvar} = "$tvunit" ;
    &update_event($progvar, 0, 0, $E_CRITICAL) ; # list status as down

    # Write the updated $progvar event back out.
    &writeevent(OEVENTS, $progvar) ;

    # skip all the other VAR fields
    # (whence 1 = relative to the current position, on both handles, so
    # reader and writer stay lined up for the next host)
    ($debug > 1) && print STDERR "(dbg) Skipping $skipbytes bytes\n";
    seek (IEVENTS, $skipbytes, 1);
    seek (OEVENTS, $skipbytes, 1);

}	# end: skip_host()

##
## Signal handler installed for SIGALRM while talking to a remote host:
## closes the global $rsock handle (when set) so that a 'hung' telnet
## read is abandoned.
sub alrm_handler {
    local ($sig) = @_;

    if ($debug) {
	print STDERR "(dbg) Got SIG$sig, probably in remote telnet\n";
    }
    close ($rsock) if ($rsock); # close global socket file handler
}

###################     #################
##
## main
##
## Start-up: run the library start-up routine, read the config file,
## work out the TCP port of the hostmon clients and write one initial
## event per host and variable into the NOCOL datafile.
##

&nocol_startup ;
&readconf ;

# Port from the services database if the service is listed there,
# otherwise the built-in default.
($junk, $junk, $hostmon_port) = getservbyname($HOSTMON_SERVICE, 'tcp');
$hostmon_port = $HOSTMON_PORT  unless $hostmon_port ;

# Put initial events into the output data file.
# The file is truncated here. Without it nothing can ever be reported,
# so failing to create it is fatal instead of being silently ignored.
#
open (OEVENTS, "> $datafile") ||	# for writing to the NOCOL datafile
    die ("Cannot write to NOCOL datafile $datafile: $!");
foreach $host (@hosts)		# initialize the nocol structures
{
    # The first EVENT slot is used to indicate if the data is good, etc.
    # Initially it will be 'UnKnown'
    $varname = $progvar ; $varunits = "Avail" ; # used by &init_event()
    &init_event ($host, "NotInit", $progvar);
    &writeevent(OEVENTS, $progvar) ;
    
    # One event per configured variable, always in sorted order: the
    # polling loop relies on the same order to find the events again.
    foreach (sort keys %thresindex)	# 'sorted' to maintain same sequence
    {
	local ($tisok,$tval,$tthres,$tmaxsev,$tcomment,$tunit) = 
	    split (/\t/, $curvar{$_});
	$tcomment = "NotInit" if ($tcomment eq "") ;
	$varname = $_ ; $varunits = "$tunit" ; # used by &init_event()
	&init_event ($host, $tcomment, $_); # $comment is the site.address
	&writeevent(OEVENTS, $_) ;
    }				# end: foreach (%thresindex)
}
# Buffered write errors (e.g. a full disk) only show up at close time.
close (OEVENTS) ||
    die ("Error writing NOCOL datafile $datafile: $!");

# Loop forever. Try to get the data for each site via telnetting to the
# $hostmon_port or via rcp...
#
# Every pass walks @hosts in the same order as the initial event dump
# and for each host either rewrites all of its events or calls
# &skip_host, so that the read handle (IEVENTS) and the write handle
# (OEVENTS) on the NOCOL datafile advance past the same events.
#
local ($stime, $deltatime);	# outside while() to prevent memory leaks ?
local ($hdfile);		# for making name of hostmon datafile

while (1)			# forever...
{
    $stime = time;          # time starting tests

    # Two independent handles on the same file: one to read the current
    # events, one (opened read/write, so without truncating) to update
    # them in place.
    open (IEVENTS, "< $datafile"); # for reading the NOCOL datafile
    open (OEVENTS, "+< $datafile"); # dont overwrite the NOCOL datafile
    seek (OEVENTS, 0, 0);	# seek to beginning of file
    foreach $host (@hosts)
    {
	$debug && print STDERR "Doing host $host\n";

	# Local scratch file that receives this host's data.
	$hdfile = "$TMPDATADIR/$host.hostmon" ;

	## Try telnetting to the hostmon port
	## ...but first ping the host (not done when $RCP is scp); an
	## unreachable host is marked down and skipped.
	if ($RCP !~ /scp/ && &doping($host) <= 0)
	{
	    $debug && print STDERR "Host $host not pingable, skipping\n" ;
	    &skip_host ("PingFailed", "Down");
	    next;		# host
	}
	# Method 1: connect to the client's port and save what it sends.
	if (! -e $hdfile  || -z $hdfile)
	{
	    local ($tflag) = 0 ; # needed in loop below for parsing telnet data

	    # The handler closes $rsock, which ends the read loop below
	    # if the remote end hangs.
	    $SIG{'ALRM'}  = 'alrm_handler'; # break out of the while loop
	    alarm(15);		# 15 second timeout
	    $rsock = &newSocket($host, $hostmon_port, 'tcp');
	    if (defined($rsock)) {
		open (HDFILE, "> $hdfile") || die "Cannot write to $hdfile";
		select( (select($rsock), $| = 1)[0] ); # set socket unbuffered
		while (<$rsock>)
		{
#		    ($debug > 1)  && print STDERR "(dbg) rtelnet: $_";
		    if (/refused/)  {last;}	 # something went wrong.
		    # Everything before the TIME line is discarded; the
		    # TIME line and all following lines are saved.
		    /^TIME/ && ($tflag = 1); # start of valid data
		    if ($tflag) { print HDFILE $_ ; }
		}
		close ($rsock);
		undef $rsock;
		close (HDFILE);
	    }
	    $SIG{'ALRM'} = 'IGNORE';  alarm (0); # reset alarm
	}	# endif

	## Try RPCping. Attach '*' to remote filename
	## Method 2 (only if method 1 left no data): remote-copy the file
	## that matches /tmp/<host>*.hostmon on the remote host.
	if (! -e $hdfile  || -z $hdfile) {
	    local ($tfile) = "/tmp/" . $host . "*.hostmon" ;

	    # Not done when $RCP is scp.
	    if ($RCP !~ /scp/ &&  &rpcping($host) <= 0)
	    {
		$debug &&  print STDERR "Cannot rpcping Host $host, skipping\n";
		&skip_host ("RPCPingFailed", "Down");
		next;		# host
	    }
	    $debug && print STDERR "(dbg) Trying $RCP $host:$tfile $hdfile\n" ;
	    # Output and exit status of the copy are ignored; success is
	    # judged below by the presence of a non-empty $hdfile.
	    `$RCP $host:$tfile $hdfile >/dev/null 2>&1` ;
	}	# endif

	## Still no data ?? Oh well...
	if (! -e $hdfile || -z $hdfile) {
	    $debug && print STDERR "No datafile for $host, skipping\n";
	    &skip_host ("NoData", "NoUnit");
	    next;		# host
	}	# endif

	## Here if valid data found and in the file $hdfile
	## Parsing fills %curvar and %vardatalines and sets $okdatatime.
	$debug && print STDERR "Parsing datafile for $host\n";
	&do_datafile ($hdfile) ;
	unlink ($hdfile);	# not needed for next pass

	# $okdatatime is false when the file did not start with a TIME
	# line or its timestamp equals the one of the previous pass.
	if (! $okdatatime) {
	    $debug && print STDERR "Bad/old timestamp for $host, skipping\n";
	    &skip_host ("OldData", "Secs");
	    next ;		# host
	}

	## phew !!
	if ($debug > 1)		# print out all the variable values
	{
	    print STDERR "(dbg) $host:\t\tIsOk\tValue\tThres\tMaxSev\tAddr\tUnit\n";
	    foreach (keys %thresindex) {print STDERR "\t$_ :\t$curvar{$_}\n"; }
	}

	## data ready to be written out.
	# First update the $progvar EVENT...
	$debug && print STDERR "Updating host $host\n";

	&readevent (IEVENTS, $progvar);
	
	$siteaddr{$progvar} = "OkData" ;
	$varunits{$progvar} = "Avail" ;
	&update_event($progvar, 1, 1, $E_CRITICAL) ; # status is up

	&writeevent(OEVENTS, $progvar) ;
	
	# Now have to read all the variables for this host from the nocol
	# datafile, update the EVENT structure and write them back out.
	# The 'readevent' etc. nocol lib routines operate on global arrays
	# like $sender, $sitename, etc.
	##
	foreach (sort keys %thresindex)	# 'sorted' to maintain same sequence
	{
	    local ($i) = $_ ;	# index
	    
	    &readevent (IEVENTS, $i);	# pass it the index
	    # A variable without data in this pass is written back as it
	    # was read.
	    if ($vardatalines{$i} > 0)  # if variable was updated...
	    {
		local ($tisok,$tval,$tthres,$tmaxsev,$tcomment,$tunit)=
		    split (/\t/, $curvar{$i}) ;

		# Now update the address, threshold, units
		#
		if ($tcomment ne '') { $siteaddr{$i} = $tcomment; }
		 else  { $siteaddr{$i} = "-" ; } # no blank fields...
		$varthres{$i} = $tthres ;   # update the 'threshold' value
		if ("$tunit" ne '') {$varunits{$i} = "$tunit" ; }
		 else { $varunits{$i} = "NoUnit" ; }

		&update_event($i, $tisok, $tval, $tmaxsev);
	    }
	    &writeevent(OEVENTS, $i) ;
	}			# end: foreach (%thresindex)

    }	# end: foreach (host)

    close (OEVENTS);
    close (IEVENTS);

    # Sleep for whatever is left of the poll interval; no sleep at all
    # when the pass took longer than $sleeptime.
    $deltatime = time - $stime;              # time to do tests
    $debug && print STDERR "(dbg) sleep for= $sleeptime - $deltatime\n";
    if ($sleeptime > $deltatime) { sleep(($sleeptime - $deltatime)) };

}	# end: while(forever) 

