#!/usr/local/gnu/bin/perl
#
# "page-stats v1.3": a Perl-program by Mark Koenen <markko@sci.kun.nl>
# that 'cleverly' checks how often a WWW-page has been accessed.
# Changes by Patrick Atoon <patricka@cs.kun.nl>.
#
# Usage: page-stats.pl -h
#        page-stats.pl [ -b ] [ -i identfile ] [ -l logfile ]
#
# Where: 
# The pages that have to be counted are defined in the 'identfile'.
# 'logfile' gives the exact location of the log-file. 
#
# For details on the ident-file, do "page-stats.pl -h" to view the
# manual page.
#
# Latest version on : http://www.sci.kun.nl/thalia/guide/
# A working example : http://www.sci.kun.nl/thalia/page-stats/
#

# Just some defaults; each can be overridden by a switch below.
$logfile = "/vol/www/ezel/httpd/logs/access_log";
$identfile = "page-stats_en.ident";
$total_number_req = 0;
$benchmark = 0;

# Process switches. Every leading argument that starts with "-" is taken
# as a switch; the first argument that does not ends the processing.
# Switches are recognised by prefix, so "-i" and "-ident" are equivalent.
while (@ARGV && $ARGV[0] =~ /^-/)
{
    my $switch = shift(@ARGV);

    if ($switch =~ /^-i/)
    {
        # Without this check a trailing "-i" would leave the filename
        # undefined and fail later with a puzzling "Cannot open" message.
        die "Switch -i requires a filename.\n" unless (@ARGV);
        $identfile = shift(@ARGV);
    }
    elsif ($switch =~ /^-l/)
    {
        die "Switch -l requires a filename.\n" unless (@ARGV);
        $logfile = shift(@ARGV);
    }
    elsif ($switch =~ /^-h/)
    {
        # Show the built-in manual page and stop.
        print_help();
        exit(0);
    }
    elsif ($switch =~ /^-b/)
    {
        $benchmark = 1;
    }
    else
    {
        die "Unrecognized switch: $switch.\n";
    }
}

# Both input files must be readable before any work is done.
if (! -r $logfile)
{
    die "Cannot open logfile $logfile.\n";
}

if (! -r $identfile)
{
    die "Cannot open identfile $identfile.\n";
}

# Are we benchmarking? Remember the CPU times used so far, so that the
# amount spent from here on can be reported at the end.
($u, $s) = times if ($benchmark);

# Well, the important files are there. Start generating the
# HTML-file

# Let's determine the other filenames by replacing the extension of the
# ident file. Only a dot in the last path component starts an extension:
# a name without any dot is used as is (instead of losing its final
# character), and dots in directory names such as "./" are left alone.
$strippedfile = $identfile;
$strippedfile =~ s/\.[^.\/]*$//;
$htmlfile = $strippedfile . ".html";
$sourcefile = $strippedfile . ".source";

@test_url = ();        # (Partial) URLs to match
@true_idx = ();        # Index of true URL to which (partial) URLs belong
@true_url = ();        # True URLs
@url_desc = ();        # Description of the true URL
@urlcount = ();        # Number of hits for those true URLs

# Read the identifiers-file. Each useful line has the form
#     URL@title@reference[@reference...]
open(IDENT, '<', $identfile)
    || die "Cannot open identfile $identfile: $!\n";

while (my $entry = <IDENT>)
{
    # chomp, not chop: a last line without a newline must keep its final
    # character (which may well be the "*" of a wildcard).
    chomp($entry);

    $entry =~ s/#.*//;      # Throw out comments
    $entry =~ s/\s+$//;     # Throw out ALL whitespace at the end, so that
                            # "ref   # comment" still yields "ref"

    next if ($entry eq "");

    my @fields = split(/\@/, $entry);

    # Fill in the various arrays that administrate stuff. The first two
    # fields describe the page itself; its counter starts at zero.
    my $page_idx = scalar(@true_url);
    push(@true_url, shift(@fields));
    push(@url_desc, shift(@fields));
    push(@urlcount, 0);

    # Every remaining field is a reference that counts towards this page.
    foreach my $reference (@fields)
    {
        push(@true_idx, $page_idx);
        push(@test_url, $reference);
    }
}

close(IDENT);
$num_true_urls = scalar(@true_url);
$num_test_urls = scalar(@test_url);
open(LOG, '<', $logfile) || die "Cannot open logfile $logfile: $!\n";

# Work out once, per reference, whether it is a wildcard and which text
# has to be compared. Doing this inside the loop below would repeat the
# same work for every line of the logfile.
my @ref_is_prefix = ();     # True if the reference ended in "*"
my @ref_text = ();          # Reference without the trailing "*"
my @ref_length = ();        # Length of that text

foreach my $reference (@test_url)
{
    my $is_prefix = (substr($reference, -1) eq "*") ? 1 : 0;
    my $text = $is_prefix ? substr($reference, 0, -1) : $reference;

    push(@ref_is_prefix, $is_prefix);
    push(@ref_text, $text);
    push(@ref_length, length($text));
}

#
# Read the logfile and check if the page is recognized.
#
# This is computationally the heaviest part of the script.
#
LOGLINE: while (my $logline = <LOG>)
{
    # Since HTML pages almost always contain pictures, it is more
    # cost-effective to filter these pictures out before trying to
    # match these lines. Even if each page only contains one
    # picture, this would already save 50% of the lines to match!
    # In real life pages, this percentage will be much higher.
    #
    # If you want to be able to match pictures, comment out the next
    # three lines by placing a "#" before them. (".jpeg" is skipped as
    # well, as the manual page has always promised.)
    next LOGLINE if (index($logline, ".gif ") != -1);
    next LOGLINE if (index($logline, ".jpg ") != -1);
    next LOGLINE if (index($logline, ".jpeg ") != -1);

    # Get URL from logline; they happen to start with " /", how handy!
    # Use index(), it's faster than using s/X/Y/.
    # Test the result BEFORE stepping over the space: index() returns -1
    # when nothing is found, and -1 + 1 would look like a valid offset.
    my $begin = index($logline, " /");
    next LOGLINE if ($begin == -1);
    $begin++;

    # The next space marks the end of the URL. If there is none, the URL
    # runs up to the end of the line.
    my $end = index($logline, " ", $begin);
    my $pageurl;

    if ($end == -1)
    {
        $pageurl = substr($logline, $begin);
        chomp($pageurl);
    }
    else
    {
        $pageurl = substr($logline, $begin, $end - $begin);
    }

    # References are tried in ident-file order; the first hit wins.
    for (my $i = 0; $i < $num_test_urls; $i++)
    {
        my $hit;

        if ($ref_is_prefix[$i])
        {
            # Wildcard: only the beginning of the URL has to agree.
            $hit = (substr($pageurl, 0, $ref_length[$i]) eq $ref_text[$i]);
        }
        else
        {
            # No wildcard: the whole URL has to agree.
            $hit = ($pageurl eq $ref_text[$i]);
        }

        next unless ($hit);

        $urlcount[$true_idx[$i]]++;
        $total_number_req++;
        last;        # No need to check others; continue with next line.
    }
}

close (LOG);

# Find the timestamps of the first and the last request in the logfile.
# head and tail are started without a shell (list-form pipe open), so a
# logfile name with spaces or shell metacharacters is passed on intact.
$firstrequest = "";
$lastrequest = "";

if (open(LOGPEEK, '-|', 'head', '-n', '1', $logfile))
{
    $firstrequest = <LOGPEEK>;
    close(LOGPEEK);
}

if (open(LOGPEEK, '-|', 'tail', '-n', '1', $logfile))
{
    $lastrequest = <LOGPEEK>;
    close(LOGPEEK);
}

# Reduce both lines to the text between "[" and the first whitespace
# after it, i.e. the date and time without the timezone. The match is
# non-greedy so that the FIRST "[" on the line is used, even if the
# requested URL happens to contain a bracket as well.
foreach my $request ($firstrequest, $lastrequest)
{
    $request = "" unless (defined($request));    # Empty logfile
    chomp($request);
    $request =~ s/^.*?\[(\S*)\s.*$/$1/;
}

# Determine the time of creation, in the same layout as the logfile uses.
($sec, $min, $hour, $day, $month, $year, $wday, $yday, $isdst) = localtime();
@MONTHS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep",
           "Oct", "Nov", "Dec");

# localtime() returns the number of years since 1900, not a two-digit
# year, so 1900 has to be added rather than "19" glued in front.
$date = sprintf("%02d/%s/%04d:%02d:%02d:%02d", $day, $MONTHS[$month],
                $year + 1900, $hour, $min, $sec);

# Open HTML-file for writing
open(HTML, '>', $htmlfile)
    || die "Cannot open html file $htmlfile for writing.\n";

# Write SOURCE-file to HTML-file, replacing the variables on the way.
# Variable names are recognised regardless of case.
if (open(SOURCE, '<', $sourcefile))
{
    while (my $line = <SOURCE>)
    {
        # Replace requested variables
        $line =~ s/\$firstrequest/$firstrequest/gi;
        $line =~ s/\$lastrequest/$lastrequest/gi;
        $line =~ s/\$date/$date/gi;

        # Insert top number pages. The list is written BEFORE the rest
        # of the line on which the variable was found.
        if ($line =~ /\$top/i)
        {
            # Get the limit: the digits glued to the last $top on the
            # line (an empty string if there are none).
            my ($limit) = ($line =~ /.*\$top([0-9]*)/i);

            # Don't forget to erase the $topXX bit from the line.
            $line =~ s/\$top[0-9]*//gi;

            write_top($limit);
        }

        # Insert all pages. The test ignores case, just like the
        # replacement and like all other variables.
        if ($line =~ /\$list/i)
        {
            $line =~ s/\$list//gi;
            write_pages();
        }

        print HTML $line;
    }

    close(SOURCE);
}
else
{
    # No source file: generate a default page
    print HTML "<HTML>\n<HEAD>\n<TITLE>Page-statistics</TITLE>\n";
    print HTML "</HEAD>\n<BODY>\n";
    write_pages();
    print HTML "<HR>\n<EM>Page was generated on $date</EM>\n";
    print HTML "</BODY>\n</HTML>\n";
}

# Buffered write errors (full disk, ...) only show up when closing.
close(HTML) || die "Cannot finish writing html file $htmlfile: $!\n";

# When benchmarking, report the CPU time used since the start marker
# ($u, $s) was taken.
if ($benchmark)
{
    my @cpu_now = times;
    my $user_spent = $cpu_now[0] - $u;
    my $system_spent = $cpu_now[1] - $s;

    printf("%8.4f secs user time, %8.4f secs system time.\n",
           $user_spent, $system_spent);
}

exit(0);


###############################
# Subroutines from here on
#

#
# Print all pages: one line per true URL, in ident-file order, with its
# hit count and a link, followed by the total number of requests.
# Leading whitespace of a description is written in front of the link,
# so that it shows up as indentation inside the <PRE> block.
#
sub write_pages
{
    print HTML "<PRE>\n";

    foreach my $page (0 .. $num_true_urls - 1)
    {
        # Separate the description into indentation and actual title.
        my $title = $url_desc[$page];
        my $indent = "";
        $indent = $1 if ($title =~ s/^(\s+)//);

        printf HTML ("%6d %s<A HREF=\"%s\">%s</A>\n",
                     $urlcount[$page], $indent, $true_url[$page], $title);
    }

    # Close the list with the total number of requests.
    print HTML "--------------------------------\n";
    printf HTML ("%6d  Total number of requests\n", $total_number_req);
    print HTML "</PRE>\n";
}

#
# Print the top X: the X most requested pages, most requested first.
# Pages with the same number of hits keep their ident-file order.
#
# Argument: the number of pages to list. Values below zero are treated
# as zero, values above the number of pages as "all pages".
#
sub write_top
{
    my ($lim) = @_;

    # Sanity check
    $lim = 0 if ($lim < 0);
    $lim = $num_true_urls if ($lim > $num_true_urls);

    # Rank the page indices once, instead of searching for the maximum
    # again for every line that is printed. Only lexical variables are
    # used, so no package globals are changed behind the caller's back.
    my @ranking = sort { $urlcount[$b] <=> $urlcount[$a] || $a <=> $b }
                  (0 .. $num_true_urls - 1);

    print HTML "<PRE>\n";

    for (my $rank = 0; $rank < $lim; $rank++)
    {
        my $page = $ranking[$rank];

        # Indentation makes no sense in a sorted list; drop it.
        my $title = $url_desc[$page];
        $title =~ s/^\s*//;

        printf HTML ("%6d <A HREF=\"%s\">%s</A>\n", $urlcount[$page],
                     $true_url[$page], $title);
    }

    print HTML "</PRE>\n";
}

#
# The manual is included in the program, so you can never lose it.
#
# Prints the manual page on the currently selected output handle. The
# text is an interpolating here-document, which is why every literal "@"
# and "$" in it is escaped with a backslash. The defaults and examples
# given in the text have to agree with the code at the top of the file:
# URLs taken from the logfile always start with "/", so references must
# start with "/" too.
#
sub print_help
{
    print <<EOF;

NAME
       page-stats.pl - Check WWW page accesses (v1.3)

SYNOPSIS
       page-stats.pl -h
       page-stats.pl [ -b ] [ -i identfile ] [ -l logfile ]

DESCRIPTION
       page-stats.pl will examine the acceslog of a http daemon and search
       it for occurrences of certain references. These references are then
       counted  and  put into a HTML file that is ready to be displayed to
       the  outside  world  as  a "Page Statistics" page. Each page can be
       selected from the statistics page.

       The  identfile  contains  the  references that should be counted. A
       line in this file should be in the following format:

              URL\@title\@reference[\@reference...]

       which could look like this:

              ~gnu/index.html\@Gnu's pages\@/gnu.html\@/~gnu*

       Comments  are  allowed, and should be preceded by a "#". Everything
       following that character will be ignored. Each line should at least
       contain the following:

       URL    The  URL  of  the  page, as it should be referenced from the
              "Page Statistics" page.

       title  The  title of the page, as you want visitors to see it. Note
              that  leading  spaces  are significant, so it is possible to
              make  use  of indentation for different levels of documents.

       reference
              A reference of how the page might be accessed. For instance,
              if  a  directory  contains  a  file  index.html,  it  can be
              accessed  by  leaving out the "index.html" part, or even the
              "/"  before  it.  If  this  is  the case, put all references
              behind each other, separated by "\@". You may use a wildcard
              "*"  at  the  end  of a string to match only the begin of an
              URL.  A  reference  is compared with the URL as it is logged
              in the access_log, so it must start with a "/".

       The  order  of  the  lines in the identfile matters. Only the first
       match  will be taken into account. Be careful when using wildcards,
       as  they  might filter out hits for lines below. Take a look at the
       (faulty) example below:

              # Wrong; second line will never be reached!
              ~gnu/index.html\@Gnu's pages\@/~gnu*
              ~gnu/info/index.html\@Gnu's info files\@/~gnu/info*

       The  first  line  will  catch  all  URLs  that  start with "/~gnu",
       which  automatically  means  that  URLs  starting with "/~gnu/info"
       are  caught  as  well.  Place  the  second line above the first to
       solve the problem:

              # Right!
              ~gnu/info/index.html\@Gnu's info files\@/~gnu/info*
              ~gnu/index.html\@Gnu's pages\@/~gnu*

       Currently  page-stats.pl  will  skip  lines  in the access_log that
       contain  references to ".gif", ".jpg" or ".jpeg" files, even if you
       specify  matching  URLs.  If  you  need  the  program to be able to
       handle  references  to  those  pictures,  you should outcomment the
       lines as indicated in the code.

       Note  that  once  the  first matching reference is found, the quest
       for  matches  is ended. Only the first page will be recognized as a
       matching reference and its counter will be increased.

       The  HTML  "Page Statistics"  file is created from two files. These
       are the ident file with references to check, and a source file that
       contains  the  basic  HTML  page as desired. The name of the source
       file  is  determined  by replacing the mandatory ".ident" ending of
       the  ident file by ".source". The HTML file that is created will be
       named in the same way, ending in ".html".

       It  is  possible to use certain variables in the source file. These
       variables  will be replaced by page-stats.pl as it rummages through
       the file.

       \$date  The  current  date  and  time  will  be  inserted  for  this
              variable.

       \$firstrequest
              The  date  and  time  of  the  first  request  logged in the
              access_log will be inserted for this variable.

       \$lastrequest
              This  variable is replaced by the last request logged in the
              access_log.

       \$list  This  will  be  replaced  by the complete list of references
              and their number of hits.

       \$topN  This  will insert a sorted list of the N most visited pages,
              where  N  can  be  any  number .  Of course setting a number
              greater  than  the number of references is silly. There must
              be no space between "\$top" and the number.

OPTIONS
       -b     Benchmark;  print  used  user  and  system times when ready.

       -h     Displays this manual page.

       -i identfile
              Specify  the  file  that determines which references to look
              for  in the logfile. This defaults to 'page-stats_en.ident'.

       -l logfile
              Specify  the  access_log  of  the  http  daemon. The default
              location is '/vol/www/ezel/httpd/logs/access_log'.

FILES
       access_log           (generated by httpd)
       <identname>.ident
       <identname>.source   (optional)
       <identname>.html     (generated by page-stats.pl)

SEE ALSO
       httpd(1).
       http://www.sci.kun.nl/thalia/guide/#page-stats
              For the latest version.
       http://www.sci.kun.nl/thalia/page-stats/
              For a working example.

CHANGES
       03-01-1995:  (v1.0) First draft of the program.
       03-17-1995:  (v1.1) Added 'total number of requests' at the bottom
                    of the page.
       05-26-1995:  (v1.2) Added  '\$topN'  and  '\$list'; juggled with the
                    code.  Improved  performance  by  skipping  images in
                    access_log.  Allowed comments in the ident file. Also
                    moved the external README into the code.
       07-17-1995:  (v1.3) You  can  now  use wildcards to define URLs to
                    recognize.  Using arrays to administrate URLs instead
                    of strings.
       
BUGS
       If  the  accesslog is big, and there are many references to check,
       this  program  can  take  very long to complete. It is recommended
       that  both  the size of the accesslog and the number of references
       are kept to acceptable levels.

       The  program  might not work because the path to Perl in the first
       line  of  page-stats.pl  is  wrong.  See if the path is correct by
       doing  'which perl' at your Unix prompt. If it is not correct, you
       will have to edit the first line.

AUTHOR
       Mark Koenen <markko\@sci.kun.nl>,
       changes by Patrick Atoon <patricka\@cs.kun.nl>

EOF
}

