#!/bin/perl

#  validate.pl - a CGI script to validate changed HTML & CSS files
#                on a Web site
#
#
#  The purpose of this script is to make checking the validity of a
#  site's HTML & CSS easier. It employs the online HTML & CSS
#  validators offered by the World Wide Web consortium at w3.org to
#  do the validation.
#
#  This script:
#
#    1. Finds all files that have changed since the last time
#       the script was run.
#
#    2. Submits each file to the appropriate validator.
#
#    3. Saves each file's results, after cleaning them up a
#       little.
#
#    4. Returns an HTML page with a list of valid files and a list
#       of files with errors. If there are files with errors, the
#       details of each error are listed.
#
#  If there are no files that have changed since the last time the
#  script was run, a message to that effect is printed.
#
#  If the timestamp file doesn't exist, an error page saying so is
#  printed instead.
#
#  Requirements:
#
#    1. This script must be run from a Web server's CGI directory. It
#       returns an HTML page viewable in any browser.
#
#    2. A dummy file whose timestamp is used to record the last time
#       the script was run. This file must be created manually. See
#       the "globals" section.
#
#    3. The availability of the Unix utilities 'touch' and 'find'.
#       Free versions for Windows are available from CygWin.
#
#  Customization: the lines with !!! comments can be customized.
#
#  This script has been tested with several browsers, Windows 95,
#  the CygWin utilities, Perl 5.6.0, and the TinyWeb server.
#
#  Mark L. Irons
#  2 August 2002


use LWP::Simple;



#----------------------------------------------------------------------
# globals
#----------------------------------------------------------------------

# customize these for your site

$WebHome = "/www/";                                                 # !!!
$validationTimestampFileName = "timestamp.of.last.validation";      # !!!
$validationTimestampFile = $WebHome."timestamp.of.last.validation"; # !!!


# define validators & associated text strings to search their output for
# (these shouldn't need modification)

# HTML

$Validator{"HTML"}      = "http://validator.w3.org/check?uri=";
$GoodMessage{"HTML"}    = "No errors found!</pre>";
$OpenDelimiter{"HTML"}  = "<ul>";
$CloseDelimiter{"HTML"} = "</ul>";

# CSS

$Validator{"CSS"}       = "http://jigsaw.w3.org/css-validator/validator?uri=";
$GoodMessage{"CSS"}     = "<hr><h2>No error or warning found</h2>";
$OpenDelimiter{"CSS"}   = "<div id=\"errors\">";
$CloseDelimiter{"CSS"}  = "\n\n</div>\n\n";



#----------------------------------------------------------------------
# Exit with message if the timestamp file doesn't exist.
#----------------------------------------------------------------------

if (!-e $validationTimestampFile) {
  &printPreamble;
  print <<EndOfNoTimestamp;
<H1>Validation Failed</H1>
<P>The timestamp file <CODE>$validationTimestampFile</CODE> doesn't exist.
You need to create it.</P>
EndOfNoTimestamp
  &printPostamble;
  exit;
}



#----------------------------------------------------------------------
# Get list of files that have changed since the last time this script
# was run
#----------------------------------------------------------------------

open(NEWURLS,"find $WebHome ( -name '*.html' -or -name '*.css' ) -newer $validationTimestampFile |")
  || die "Couldn't find changed files: $!\n";
while (<NEWURLS>) {
  chop;
  push(@URLsToValidate,$_);
}
close(NEWURLS);



#----------------------------------------------------------------------
# If there's nothing to validate, tell the user so, update the
# timestamp, and exit.
#----------------------------------------------------------------------

if (scalar(@URLsToValidate) == 0) {
  &printPreamble;
  &updateTimestamp;
  print <<EndOfNothingToValidate;
<H1>No Files Need Validation</H1>
EndOfNothingToValidate
  &printPostamble;
  exit;
}



#----------------------------------------------------------------------
# Submit each URL to its appropriate validator, and save the results
#----------------------------------------------------------------------

foreach $url (sort @URLsToValidate) {

  # determine the file type, HTML or CSS

  if    ($url =~ /\.html$/) { $filetype = "HTML"; }
  elsif ($url =~ /\.css$/)  { $filetype = "CSS";  }

  # submit to validator

  $_ = get($Validator{$filetype}.$url);

  # process the results
  #
  #

  if (/$GoodMessage{$filetype}/) {
    push(@goodURLs,$url);                          # add to good list
  }
  else {

    # pull out the error messages and save them

    # for HTML, error messages are everything inside <ul></ul>
    # for CSS,  error messages are everything inside <div id="errors"></div>

    $pattern = "($OpenDelimiter{$filetype}.*?$CloseDelimiter{$filetype})";
    /$pattern/s;

    # at the top of each HTML error section, print the URL for clarity

    if ($filetype eq "HTML") {
      $URLerrors{$url} = "<h3>URI : <a href=\"$url\">$url</a></h3>"
    }
    $errorHTML = $1;

    # remove the now-redundant "Errors" header from the CSS results

    $errorHTML =~ s#<h2>Errors</a></h2>##;

    # remove line links from HTML error sections

    $errorHTML =~ s/<a href="#line-(\d+)">\1<\/a>/\1/g;

    # save the error report for each URL

    $URLerrors{$url} .= $errorHTML;
    push(@badURLs,$url);                          # add to bad list
  }
}



#----------------------------------------------------------------------
# Create the page of results
#----------------------------------------------------------------------

&printPreamble;

&updateTimestamp;

print "<H1>Validation Results</H1>\n";

$numgood = scalar(@goodURLs);
$numbad  = scalar(@badURLs);

if ($numgood > 0) {
  print "<H2 style=\"background-color: #E0FFE0; padding: .2em\">Okay</H2>\n<OL>\n";
  while ($url = shift @goodURLs) {
    print "  <LI><A HREF=\"$url\">$url</A></LI>\n";
  }
  print "</OL>\n";
}

if ($numbad > 0) {
  print "<H2 style=\"background-color: #FFE0E0; padding: .2em\">Files Containing Errors</H2>\n<OL>\n";
  $counter = 0;
  while ($url = shift @badURLs) {
    $urllist .= "  <LI style=\"margin: 1px\"><A HREF=\"$url\" NAME=\"Ref$counter\">$url</A> <A HREF=\"#$counter\"style=\"background-color: #606060; color: #FFFFFF; font-family: Arial Black; font-size: smaller\">&nbsp;details&nbsp;</A></LI>\n";
    $errors .= "<A NAME=\"$counter\"></A>";
    $errors .= $URLerrors{$url};
    $errors .= "<A HREF=\"#Ref$counter\" style=\"background-color: #606060; color: #FFFFFF; padding: 1px; font-family: Arial Black\">&nbsp;Back&nbsp;</A><P></P><HR>";
    $counter++;
  }
  print "$urllist\n</OL>\n<HR>\n$errors\n";
}

&printPostamble;



#======================================================================
# SUB updateTimestamp
#======================================================================

# Touch the validation timestamp file; write an error if the attempt
# fails

sub updateTimestamp {
  $result = system("touch $validationTimestampFile");
  if ($result != 0) {
    print "<P style=\"background-color: #FFF0F0; border: thin solid #FFB0B0; padding: 1em\"><STRONG>Couldn't touch the timestamp file <CODE>$validationTimestampFile</CODE>. Please touch this file manually.</STRONG></P>";
  }
}



#======================================================================
# SUB printPreamble
#======================================================================

sub printPreamble {
  print <<EndPreamble;
Content-type: text/html

<HTML>
<HEAD>
  <TITLE>URL Validation Report</TITLE>
  <META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=ISO-8859-1">
</HEAD>
<BODY>

EndPreamble
}


#======================================================================
# SUB printPostamble
#======================================================================

sub printPostamble {
print <<EndPostamble;

</BODY>
</HTML>
EndPostamble
}