#  indexSite.pl
#
#  Mark L. Irons
#  16-18 March 2001
#
#
#  Process a list of HTML files, indexing the keywords listed
#  in the <META NAME="keywords"> tag.
#
#  The output is an HTML page presenting a human-readable
#  keyword index.
#
#  HTML files with a <META name="robots"> tag that indicates a page
#  shouldn't be indexed (e.g., CONTENT="none" or "noindex") will
#  not be indexed.
#
#  INPUT   a list of HTML files to index.
#
#  OUTPUT  an index, in HTML.
#
#  KNOWN BUGS
#
#  If a term to index is a Perl keyword, is the name of one of the
#  variables in this script, or has special meaning in a regular
#  expression, then this script may create erroneous entries.
#  Avoid using '*', "termsToIndex", and so on as keywords in
#  your HTML pages' META tags.
#
#  REVISION HISTORY
#
#  2003-06-28  Changed keyword and description sorts to be
#              case-insensitive.
#
#--------------------------------------------------------------------
#
#  Patterns to match.
#
#  METAKeywordsPattern looks like
#      '<META NAME="keywords" CONTENT="foo,bar,baz,bug,mujava">'
#
#  All three patterns are precompiled with qr//i: HTML tag and attribute
#  names are case-insensitive, so '<meta name="robots" content="noindex">'
#  must be recognised just like its upper-case spelling. The captured
#  CONTENT value stops at the first closing quote, so other tags that
#  follow on the same line are not swallowed into the capture.
#
$METAKeywordsPattern = qr/<META\s+NAME="keywords"\s+CONTENT="([^"]+)"\s*\/?>/i;
#
#  METADescriptionPattern looks like
#      '<META NAME="description" CONTENT="A complex yet honest page">'
#
$METADescriptionPattern = qr/<META\s+NAME="description"\s+CONTENT="([^"]+)"\s*\/?>/i;
#
#  Pattern for META tags that prevent robots from indexing a page
#  (CONTENT containing NOINDEX or NONE, in any letter case). The match
#  is confined to the robots tag itself and captures nothing.
#
$METARobotNoIndexPattern = qr/<META\s+NAME="robots"\s+[^>]*NO(?:INDEX|NE)[^>]*>/i;
#
#--------------------------------------------------------------------
#
#  Get current date and convert it to ISO format
#
#  Only the day-of-month, month and year fields of localtime are needed.
($day, $month, $year) = (localtime)[3, 4, 5];
$year  += 1900;                             # localtime counts years from 1900
$month  = sprintf("%02d", $month + 1);      # localtime months run 0..11; pad to two digits
$day    = sprintf("%02d", $day);            # pad day to two digits
$ISOdate = join("-", $year, $month, $day);  # YYYY-MM-DD
#
#--------------------------------------------------------------------
#
#  Array of months
#
#  English month names, indexed 0 (January) through 11 (December).
@months = qw(
  January February March     April   May      June
  July    August   September October November December
);
#
#--------------------------------------------------------------------
#
#  Data structures / variables
#
#  Keywords that are never indexed. Matching is exact (case-sensitive),
#  which is why several spellings of the same term are listed.
#
@ignoredKeywords = ( "Mark Irons", "Mark L. Irons", "half", "Half", "mark irons", "mark l. irons" );
#
#  'entries' holds all the keywords to index on. It's an associative
#  array, initially empty. The entries have no values; we just use
#  an associative array to hold the keywords so that we can do fast
#  lookups to see if a given keyword's already in the list.
#
%entries = ( );
#
#  Likewise, 'files' is an associative array that's indexed by a URL.
#  The value for a given URL is the META description of that file.
#  It must be initialised with an empty list: assigning '{ }' would
#  store a hash reference as a single bogus key.
#
%files = ( );
#
#--------------------------------------------------------------------

#
#  Loop over files, processing each.
#

#
#  Read file names, one per line, from the files named on the command
#  line (or from standard input) and collect each file's META keywords
#  and META description.
#
#  Results:
#    %files            file name -> META description
#    %entries          set of every keyword seen (keys only)
#    %filesForKeyword  keyword   -> list of files carrying that keyword
#
#  The per-keyword file lists are kept in a hash instead of in arrays
#  named after the keyword (symbolic references), so a keyword such as
#  "ARGV" or "months" cannot overwrite one of this script's variables.
#
my %isIgnored = map { $_ => 1 } @ignoredKeywords;   # fast "ignore this term?" lookup
my %filesForKeyword;

PROCESSFILE:
while (my $filename = <>) {
  chomp $filename;                      # strip only the newline; chop could eat a real character
  $filename =~ s/^\.\///;               # remove a leading ./ (and nothing elsewhere in the path)
  my $keywords     = "";
  my $description  = "";
  my @termsToIndex = ();                # reset so a file never inherits the previous file's terms
  my $html;
  if (!open($html, '<', $filename)) {   # 3-arg open: the name cannot smuggle in an open mode
    warn "Can't open $filename, skipping: $!\n";
    next PROCESSFILE;
  }
  while (my $line = <$html>) {
    if ($line =~ /$METARobotNoIndexPattern/) {
      close($html);                     # page opted out of indexing: drop it entirely
      next PROCESSFILE;
    }
    elsif ($line =~ /$METAKeywordsPattern/) {   # check for META keywords tag
      $keywords = $1;
      @termsToIndex = ();
      foreach my $term (split(/,/, $keywords)) {
        $term =~ s/^\s+//;              # trim whitespace around each term
        $term =~ s/\s+$//;
        next if $term eq '' || $isIgnored{$term};   # skip empty and ignored terms
        push(@termsToIndex, $term);
      }
    }
    elsif ($line =~ /$METADescriptionPattern/) { # check for META description tag
      $description = $1;
    }
  } # done processing single file
  close($html);                         # close the input file
  #
  #  If the file had keywords or a description, record it.
  #
  if (length($keywords) + length($description) > 0) {
    $files{$filename} = $description;                 # save file info
    foreach my $kw (@termsToIndex) {                  # for each keyword...
      if (!defined $entries{$kw}) {                   #   if not seen before
        $entries{$kw} = '';                           #     add it to keyword list
      }
      push(@{ $filesForKeyword{$kw} }, $filename);    #   add file to that kw's list
    }
  }
} # done all files

#
#  Write the index file.
#

print STDOUT <<"EndOfHTMLPreamble1";
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">

<HTML>

<HEAD>
  <TITLE>Index to Mark L. Irons' Web site</TITLE>

  <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=ISO-8859-1">
  <META NAME="author"       CONTENT="Mark L. Irons">
  <META NAME="created"      CONTENT="2001-03-17">
EndOfHTMLPreamble1
print STDOUT "  <META NAME=\"date\"         CONTENT=\"",$ISOdate,"\">\n";
print STDOUT <<"EndOfHTMLPreamble2";
  <META NAME="description"  CONTENT="Index to Mark L. Irons' Web site">
  <META NAME="keywords"     CONTENT="Mark Irons, Mark L. Irons, home, home page, site, index">

  <LINK REL="STYLESHEET" TYPE="text/css" HREF="base-style.css">
</HEAD>

<BODY>

<!-- breadcrumbs -->
<P class="breadcrumbs">
<SMALL>
  <A HREF=\"index.html\">MLI home</A>
  <IMG SRC="Icons/small.arrow.outline.gif" ALT="->" WIDTH=18 HEIGHT=9 HSPACE=2>
  Site Index
</SMALL>
</P>

<H1>Site Index</H1>

<HR>

EndOfHTMLPreamble2

#
#  Loop over keywords, writing HTML. Keywords are sorted
#  case-insensitively; within a keyword, files are sorted
#  case-insensitively by file name and shown by their description.
#

foreach my $key (sort {uc($a) cmp uc($b)} keys(%entries)) {
  print STDOUT "<P><STRONG>",$key,"</STRONG></P>\n<UL>\n";
  foreach my $fname (sort {uc($a) cmp uc($b)} @{ $filesForKeyword{$key} }) {
    print STDOUT "  <LI><A HREF=\"",$fname,"\">",$files{$fname},"</A></LI>\n";
  }
  print STDOUT "</UL>\n";
}
#
#  Build the human-readable date ("5 March 2001") from the zero-padded
#  day and month numbers.
#
$day =~ s/^0//;                         # drop the padding zero from days 1-9
$month = $months[$month-1];             # month number -> month name
$readableDate = join(" ", $day, $month, $year);

#
#  Close the page: rule, colophon (last-updated date, page URL and
#  copyright line) and the trailing breadcrumbs.
#
print STDOUT <<"EndOfHTMLPostamble";

<HR>

<!-- Footer -->
<DIV class="colophon">
<P><SMALL>Last updated $readableDate<BR><TT class="pageURL">http://www.rdrop.com/~half/siteIndex.html</TT><BR>
All contents of this Web site &copy;2001-$year <A HREF="Personal/Life/ContactInformation.html">Mark L. Irons</A>.</SMALL></P>
</DIV>
<!-- breadcrumbs -->
<P class="breadcrumbs">
<SMALL>
  <A HREF="index.html">MLI home</A>
  <IMG SRC="Icons/small.arrow.outline.gif" ALT="->" WIDTH=18 HEIGHT=9 HSPACE=2>
  Site Index
</SMALL>
</P>

</BODY>
</HTML>
EndOfHTMLPostamble
