#!/usr/bin/perl

#####################################################################
#
# WDG HTML Validator <http://www.htmlhelp.com/tools/validator/>
# by Liam Quinn <liam@htmlhelp.com>
#
# Copyright (c) 1998-2007 by Liam Quinn
# This program is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# Contributors:
#    Darxus@ChaosReigns.com
#
#####################################################################

#####################################################################
# Required libraries #
######################

use CGI;
use LWP::UserAgent;
use SpiderUA;
use HTMLLinkExtractor;
use URI;
use WWW::RobotRules;
use I18N::Charset;
use Unicode::Map8;
use Unicode::String qw(utf8 utf16 ucs2);
use POSIX;

#####################################################################

#####################################################################
# Variables to define #
#######################

my $version = '1.6.5';

# User-Agent string and contact address for spidering validation
my $spiderUA = "WDG_SiteValidator/$version";
my $spiderFrom = 'liam@htmlhelp.com';

# User-Agent string for normal validation
my $normalUA = "WDG_Validator/$version";

# HTTP Accept header to send
my $acceptHeader = 'text/html,application/xhtml+xml;q=0.9,*/*;q=0.5';

# Location of jconv Japanese character encoding converter
my $jconv = '/usr/local/bin/jconv';

# Location of cjkvconv.pl CJK character encoding converter
my $cjkvconv = '/usr/local/bin/cjkvconv.pl';

# SGML search path: directories containing the catalog, DTDs,
#                   SGML declarations, and the file to be validated
my $sgmlSearchPath = '/usr/local/www/data/tools/validator/lib:/tmp:/var/tmp';

# Directory containing templates
my $templates = '/usr/local/www/cgi-bin/templates/validator';

# nsgmls command line
# The SGML declaration and name of the temporary file storing the retrieved
# document will be appended to this string
my $nsgmls = '/usr/local/bin/lq-nsgmls -E50 -s -R';

# Warnings to pass on command-line to nsgmls, if desired
my $nsgmlsWarnings = '-wnon-sgml-char-ref -wmin-tag';
my $nsgmlsXMLWarnings = '-wxml';

# nsgmls "errors" that are not reported unless warnings are requested.
# These are true errors in XML validation, but they should only be
# reported as warnings otherwise.
my %errorAsWarning = (
  ' net-enabling start-tag not supported in {{XML}}' => 1,
  ' unclosed start-tag' => 1,
  ' unclosed end-tag' => 1
);

# Catalog files for HTML and XHTML
my $htmlCatalog = "catalog";
my $xhtmlCatalog = "xhtml.soc";

# Text preceding identification of the document checked
my %documentChecked = (
  # English
  'en' => 'Document Checked'
);

# Text preceding identification of the character encoding
my %characterEncoding = (
  # English
  'en' => '<a href="/tools/validator/charset.html">Character encoding</a>:'
);

# Text preceding the level of HTML checked
my %levelOfHTML = (
  # English
  'en' => '<a href="/tools/validator/doctype.html">Level of HTML</a>:'
);

# Text indicating that only a check for well-formedness was performed
my %wellformednessCheck = (
  #English
  'en' => 'Checked for <strong><a href="http://www.w3.org/TR/REC-xml#dt-wellformed">well-formedness</a> only</strong> (no <a href="/tools/validator/doctype.html">DOCTYPE</a> found)'
);

# Default DOCTYPE for forgetful users
my $defaultDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
   "http://www.w3.org/TR/html4/loose.dtd">';

# Default DOCTYPE if the document contains frames
my $defaultFramesetDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"
   "http://www.w3.org/TR/html4/frameset.dtd">';

# Error for missing DOCTYPE
my %noDoctype = (
  # English error message
  'en' => "missing {{document type declaration}}; assuming {{HTML 4.01 Transitional}}"
);

# Error for missing DOCTYPE in a Frameset document
my %noFramesetDoctype = (
  # English error message
  'en' => "missing {{document type declaration}}; assuming {{HTML 4.01 Frameset}}"
);

# Message if the document is valid
my %noErrors = (
  # English
  'en' => 'Congratulations, no errors!'
);

# Text to precede an error message
my %preError = (
  # English
  'en' => 'Error:'
);

# Text to precede a warning message
my %preWarning = (
  # English
  'en' => 'Warning:'
);

# Heading for errors (not used if warnings are desired)
my %errorsHeading = (
  # English
  'en' => 'Errors'
);

# Heading for errors and warnings (only used if warnings are desired)
my %errorsAndWarningsHeading = (
  # English
  'en' => 'Errors and Warnings'
);

# Heading for input listing
my %inputHeading = (
  # English
  'en' => 'Input'
);

# Text to precede line number
my %lineNumberText = (
  # English
  'en' => 'Line '
);

# Text to precede character number
my %characterNumberText = (
  # English
  'en' => 'character '
);

# Mapping from IANA charset name to preferred MIME name
my %MIMECharset = (
  'ISO_8859-1:1987' => 'ISO-8859-1',
  'ISO_8859-2:1987' => 'ISO-8859-2',
  'ISO_8859-3:1988' => 'ISO-8859-3',
  'ISO_8859-4:1988' => 'ISO-8859-4',
  'ISO_8859-5:1988' => 'ISO-8859-5',
  'ISO_8859-6:1987' => 'ISO-8859-6',
  'ISO_8859-6-E' => 'ISO-8859-6-e',
  'ISO_8859-6-I' => 'ISO-8859-6-i',
  'ISO_8859-7:1987' => 'ISO-8859-7',
  'ISO_8859-8:1988' => 'ISO-8859-8',
  'ISO_8859-8-E' => 'ISO-8859-8-e',
  'ISO_8859-8-I' => 'ISO-8859-8-i',
  'ISO_8859-9:1989' => 'ISO-8859-9',
  'ISO-8859-10' => 'ISO-8859-10',
  'iso-8859-13' => 'ISO-8859-13',
  'iso-8859-14' => 'ISO-8859-14',
  'ISO-8859-15' => 'ISO-8859-15',
  'UTF-8' => 'UTF-8',
  'UTF-16' => 'UTF-16',
  'UTF-16BE' => 'UTF-16BE',
  'UTF-16LE' => 'UTF-16LE',
  'ISO-2022-JP' => 'ISO-2022-JP',
  'Extended_UNIX_Code_Packed_Format_for_Japanese' => 'EUC-JP',
  'EUC-KR' => 'EUC-KR',
  'GB2312' => 'GB2312',
  'Shift_JIS' => 'Shift_JIS',
  'Big5' => 'Big5',
  'windows-1250' => 'windows-1250',
  'windows-1251' => 'windows-1251',
  'windows-1252' => 'windows-1252',
  'windows-1253' => 'windows-1253',
  'windows-1254' => 'windows-1254',
  'windows-1255' => 'windows-1255',
  'windows-1256' => 'windows-1256',
  'windows-1257' => 'windows-1257',
  'windows-1258' => 'windows-1258',
  'KOI8-R' => 'KOI8-R',
  'KOI8-U' => 'KOI8-U',
  'IBM866' => 'cp866',
  'cp874' => 'cp874',
  'CP874' => 'cp874',
  'TIS-620' => 'TIS-620',
  'VISCII' => 'VISCII',
  'VPS' => 'x-viet-vps',
  'TCVN-5712:1993' => 'x-viet-tcvn', # likely IANA name if it's ever registered
  'TCVN-5712' => 'x-viet-tcvn', # name returned by I18N::Charset now through Unicode::Map8
  'ANSI_X3.4-1968' => 'US-ASCII',

  # Some versions of I18N::Charset return incorrect IANA charset names
  'ISO-8859-1' => 'ISO-8859-1',
  'ISO-8859-1-Windows-3.1-Latin-1' => 'ISO-8859-1',
  'ISO-8859-2-Windows-Latin-2' => 'ISO-8859-2',
  'ISO-8859-9-Windows-Latin-5' => 'ISO-8859-9'
);

# Mapping from preferred MIME name to name required by nsgmls
my %encodings = (
  'US-ASCII' => 'ISO-8859-1',
  'ISO-8859-1' => 'ISO-8859-1',
  'ISO-8859-2' => 'ISO-8859-2',
  'ISO-8859-3' => 'ISO-8859-3',
  'ISO-8859-4' => 'ISO-8859-4',
  'ISO-8859-5' => 'ISO-8859-5',
  'ISO-8859-6' => 'ISO-8859-6',
  'ISO-8859-6-e' => 'ISO-8859-6',
  'ISO-8859-6-i' => 'ISO-8859-6',
  'ISO-8859-7' => 'ISO-8859-7',
  'ISO-8859-8' => 'ISO-8859-8',
  'ISO-8859-8-e' => 'ISO-8859-8',
  'ISO-8859-8-i' => 'ISO-8859-8',
  'ISO-8859-9' => 'ISO-8859-9',
  'UTF-8' => 'UTF-8',
  'EUC-JP' => 'EUC-JP',
  'EUC-KR' => 'EUC-KR',
  'GB2312' => 'GB2312',
  'Big5' => 'Big5',
  'Shift_JIS' => 'Shift_JIS',

  # The following character encodings will be converted to UTF-8 for
  # parsing by nsgmls.  UTF-16, UTF-16BE, UTF-16LE, and ISO-2022-JP aren't
  # included here since they are converted to other encodings at an earlier
  # stage.
  'ISO-8859-10' => 'UTF-8',
  'ISO-8859-13' => 'UTF-8',
  'ISO-8859-14' => 'UTF-8',
  'ISO-8859-15' => 'UTF-8',
  'windows-1250' => 'UTF-8',
  'windows-1251' => 'UTF-8',
  'windows-1252' => 'UTF-8',
  'windows-1253' => 'UTF-8',
  'windows-1254' => 'UTF-8',
  'windows-1255' => 'UTF-8',
  'windows-1256' => 'UTF-8',
  'windows-1257' => 'UTF-8',
  'windows-1258' => 'UTF-8',
  'KOI8-R' => 'UTF-8',
  'KOI8-U' => 'UTF-8',
  'cp866' => 'UTF-8',
  'cp874' => 'UTF-8',
  'TIS-620' => 'UTF-8',
  'VISCII' => 'UTF-8',
  'x-viet-vps' => 'UTF-8',
  'x-viet-tcvn' => 'UTF-8'
);


# Hash table of multibyte character encodings supported.  UTF-16, UTF-16BE,
# UTF-16LE, and ISO-2022-JP are not included here since we convert them
# to other encodings for processing.
#
# The value is a regular expression representing a single character
# in the encoding.
my %multibyte = (
  'UTF-8' => '[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|\xE0[\xA0-\xBF][\x80-\xBF]|[\xE1-\xEF][\x80-\xBF][\x80-\xBF]|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]|[\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|[\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]|\xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]',
  'EUC-JP' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]|\x8E[\xA0-\xDF]|\x8F[\xA1-\xFE][\xA1-\xFE]',
  'EUC-KR' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
  'GB2312' => '[\x00-\x7F]|[\xA1-\xFE][\xA1-\xFE]',
  'Big5' => '[\x00-\x7E]|[\xA1-\xFE][\x40-\x7E\xA1-\xFE]',
  'Shift_JIS' => '[\x00-\x7F]|[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]|[\xA0-\xDF]'
);


# Hash table of character encodings that must be converted before validation.
# UTF-16, UTF-16BE, UTF-16LE, and ISO-2022-JP aren't included here since they
# are converted to other encodings at an earlier stage.
my %conversionNeeded = (
  'ISO-8859-10' => 1,
  'ISO-8859-13' => 1,
  'ISO-8859-14' => 1,
  'ISO-8859-15' => 1,
  'windows-1250' => 1,
  'windows-1251' => 1,
  'windows-1252' => 1,
  'windows-1253' => 1,
  'windows-1254' => 1,
  'windows-1255' => 1,
  'windows-1256' => 1,
  'windows-1257' => 1,
  'windows-1258' => 1,
  'KOI8-R' => 1,
  'KOI8-U' => 1,
  'cp866' => 1,
  'cp874' => 1,
  'TIS-620' => 1,
  'VISCII' => 1,
  'x-viet-vps' => 1,
  'x-viet-tcvn' => 1
);

# Versions of HTML associated with a given FPI
my %HTMLversion = (
  'PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.2//EN"' => 'XHTML-MP 1.2',
  'PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.1//EN"' => 'XHTML-MP 1.1',
  'PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN"' => 'XHTML-MP 1.0',
  'PUBLIC "-//W3C//DTD XHTML+RDFa 1.0//EN"' => 'XHTML+RDFa 1.0',
  'PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"' => 'XHTML 1.1 plus MathML 2.0 plus SVG 1.1',
  'PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN"' => 'XHTML 1.1 plus MathML 2.0',
  'PUBLIC "-//W3C//DTD MathML 2.0//EN"' => 'MathML 2.0',
  'PUBLIC "-//W3C//DTD XHTML 1.1//EN"' => 'XHTML 1.1',
  'PUBLIC "-//WAPFORUM//DTD WML 1.3//EN"' => 'WML 1.3',
  'PUBLIC "-//WAPFORUM//DTD WML 1.2//EN"' => 'WML 1.2',
  'PUBLIC "-//WAPFORUM//DTD WML 1.1//EN"' => 'WML 1.1',
  'PUBLIC "-//WAPFORUM//DTD WML 1.0//EN"' => 'WML 1.0',
  'PUBLIC "-//W3C//DTD XHTML Basic 1.0//EN"' => 'XHTML Basic',
  'PUBLIC "ISO/IEC 15445:2000//DTD HyperText Markup Language//EN"' => 'ISO/IEC 15445:2000',
  'PUBLIC "ISO/IEC 15445:2000//DTD HTML//EN"' => 'ISO/IEC 15445:2000',
  'PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"' => 'XHTML 1.0 Strict',
  'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' => 'XHTML 1.0 Transitional',
  'PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"' => 'XHTML 1.0 Frameset',
  'PUBLIC "-//W3C//DTD HTML 4.01//EN"' => 'HTML 4.01 Strict',
  'PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"' => 'HTML 4.01 Transitional',
  'PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN"' => 'HTML 4.01 Frameset',
  'PUBLIC "-//W3C//DTD HTML 4.0//EN"' => 'HTML 4.0 Strict',
  'PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"' => 'HTML 4.0 Transitional',
  'PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"' => 'HTML 4.0 Frameset',
  'PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"' => 'HTML 3.2',
  'PUBLIC "-//W3C//DTD HTML 3.2 Draft//EN"' => 'HTML 3.2',
  'PUBLIC "-//W3C//DTD HTML 3.2//EN"' => 'HTML 3.2',
  'PUBLIC "-//W3C//DTD HTML Experimental 970421//EN"' => 'HTML 3.2 + Style',
  'PUBLIC "-//W3O//DTD W3 HTML 3.0//EN"' => 'HTML 3.0 Draft',
  'PUBLIC "-//IETF//DTD HTML 3.0//EN//"' => 'HTML 3.0 Draft',
  'PUBLIC "-//IETF//DTD HTML 3.0//EN"' => 'HTML 3.0 Draft',
  'PUBLIC "-//IETF//DTD HTML i18n//EN"' => 'HTML 2.0 + i18n',
  'PUBLIC "-//IETF//DTD HTML//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML 2.0//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML Level 2//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML 2.0 Level 2//EN"' => 'HTML 2.0',
  'PUBLIC "-//IETF//DTD HTML Level 1//EN"' => 'HTML 2.0 Level 1',
  'PUBLIC "-//IETF//DTD HTML 2.0 Level 1//EN"' => 'HTML 2.0 Level 1',
  'PUBLIC "-//IETF//DTD HTML Strict//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML 2.0 Strict//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML Strict Level 2//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 2//EN"' => 'HTML 2.0 Strict',
  'PUBLIC "-//IETF//DTD HTML Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1',
  'PUBLIC "-//IETF//DTD HTML 2.0 Strict Level 1//EN"' => 'HTML 2.0 Strict Level 1'
);

# SGML declarations for a given level of HTML
my %sgmlDecl = (
  'XHTML-MP 1.2' => "xhtml-basic10/xml1.dcl",
  'XHTML-MP 1.1' => "xhtml-basic10/xml1.dcl",
  'XHTML-MP 1.0' => "xhtml-basic10/xml1.dcl",
  'XHTML+RDFa 1.0' => "xhtml11/xml1n.dcl",
  'XHTML 1.1 plus MathML 2.0 plus SVG 1.1' => "xhtml11/xml1n.dcl",
  'XHTML 1.1 plus MathML 2.0' => "xhtml11/xml1n.dcl",
  'MathML 2.0' => "xhtml11/xml1n.dcl",
  'XHTML 1.1' => "xhtml11/xml1n.dcl",
  'WML 1.3' => "xhtml1/xhtml1.dcl",
  'WML 1.2' => "xhtml1/xhtml1.dcl",
  'WML 1.1' => "xhtml1/xhtml1.dcl",
  'WML 1.0' => "xhtml1/xhtml1.dcl",
  'XHTML Basic' => "xhtml-basic10/xml1.dcl",
  'ISO/IEC 15445:2000' => "15445.dcl",
  'XHTML 1.0 Strict' => "xhtml1/xhtml1.dcl",
  'XHTML 1.0 Transitional' => "xhtml1/xhtml1.dcl",
  'XHTML 1.0 Frameset' => "xhtml1/xhtml1.dcl",
  'HTML 4.01 Strict' => "HTML4.dcl",
  'HTML 4.01 Transitional' => "HTML4.dcl",
  'HTML 4.01 Frameset' => "HTML4.dcl",
  'HTML 4.0 Strict' => "HTML4.dcl",
  'HTML 4.0 Transitional' => "HTML4.dcl",
  'HTML 4.0 Frameset' => "HTML4.dcl",
  'HTML 3.2' => "HTML32.dcl",
  'HTML 3.2 + Style' => "html-970421.decl",
  'HTML 3.0 Draft' => "HTML3.dcl",
  'HTML 2.0 + i18n' => "i18n.dcl",
  'HTML 2.0' => "html.dcl",
  'HTML 2.0 Strict' => "html.dcl",
  'HTML 2.0 Level 1' => "html.dcl",
  'HTML 2.0 Strict Level 1' => "html.dcl",
  'Unknown' => "custom.dcl",
  'Unknown (XML)' => "xhtml1/xhtml1.dcl"
);

# XHTML DTDs
my %xhtml = (
  'XHTML-MP 1.2' => 1,
  'XHTML-MP 1.1' => 1,
  'XHTML-MP 1.0' => 1,
  'XHTML+RDFa 1.0' => 1,
  'XHTML 1.1 plus MathML 2.0 plus SVG 1.1' => 1,
  'XHTML 1.1 plus MathML 2.0' => 1,
  'MathML 2.0' => 1,
  'XHTML 1.1' => 1,
  'WML 1.3' => 1,
  'WML 1.2' => 1,
  'WML 1.1' => 1,
  'WML 1.0' => 1,
  'XHTML Basic' => 1,
  'XHTML 1.0 Strict' => 1,
  'XHTML 1.0 Transitional' => 1,
  'XHTML 1.0 Frameset' => 1,
  'Unknown (XML)' => 1
);

# XML media types
my %xmlMediaTypes = (
    'text/xml' => 1,
    'application/xml' => 1,
    'application/xhtml+xml' => 1,
    'application/vnd.wap.xhtml+xml' => 1,
    'text/vnd.wap.wml' => 1,
    'text/x-wap.wml' => 1
);

# Files of links for a given level of HTML and a specified language
my %linksFile = (
  # English links
  'en-XHTML-MP 1.2' => "$templates/html40links.txt.en",
  'en-XHTML-MP 1.1' => "$templates/html40links.txt.en",
  'en-XHTML-MP 1.0' => "$templates/html40links.txt.en",
  'en-XHTML+RDFa 1.0' => "$templates/html40links.txt.en",
  'en-XHTML 1.1' => "$templates/html40links.txt.en",
  'en-XHTML Basic' => "$templates/html40links.txt.en",
  'en-ISO/IEC 15445:2000' => "$templates/html40links.txt.en",
  'en-XHTML 1.0 Strict' => "$templates/html40links.txt.en",
  'en-XHTML 1.0 Transitional' => "$templates/html40links.txt.en",
  'en-XHTML 1.0 Frameset' => "$templates/html40links.txt.en",
  'en-HTML 4.01 Strict' => "$templates/html40links.txt.en",
  'en-HTML 4.01 Transitional' => "$templates/html40links.txt.en",
  'en-HTML 4.01 Frameset' => "$templates/html40links.txt.en",
  'en-HTML 4.0 Strict' => "$templates/html40links.txt.en",
  'en-HTML 4.0 Transitional' => "$templates/html40links.txt.en",
  'en-HTML 4.0 Frameset' => "$templates/html40links.txt.en",
  'en-HTML 3.2' => "$templates/html32links.txt.en",
  'en-HTML 3.2 + Style' => "$templates/html32links.txt.en",
  'en-HTML 3.0 Draft' => "$templates/html30links.txt.en",
  'en-HTML 2.0 + i18n' => "$templates/html40links.txt.en",
  'en-HTML 2.0' => "$templates/html20links.txt.en",
  'en-HTML 2.0 Strict' => "$templates/html20links.txt.en",
  'en-HTML 2.0 Level 1' => "$templates/html20links.txt.en",
  'en-HTML 2.0 Strict Level 1' => "$templates/html20links.txt.en",
  'en-Unknown' => "$templates/html40links.txt.en",
  'en-Unknown (XML)' => "$templates/html40links.txt.en"
);

# Full link text to HTML references
my %htmlLink = (
  # English links
  'en-XHTML-MP 1.2' => 'XHTML-MP 1.2',
  'en-XHTML-MP 1.1' => 'XHTML-MP 1.1',
  'en-XHTML-MP 1.0' => 'XHTML-MP 1.0',
  'en-XHTML+RDFa 1.0' => '<a href="http://www.w3.org/TR/rdfa-syntax/">XHTML+RDFa 1.0</a>',
  'en-XHTML 1.1 plus MathML 2.0 plus SVG 1.1' => '<a href="http://www.w3.org/TR/xhtml11/">XHTML 1.1</a> plus <a href="http://www.w3.org/TR/MathML2/">MathML 2.0</a> plus <a href="http://www.w3.org/TR/SVG11/">SVG 1.1</a>',
  'en-XHTML 1.1 plus MathML 2.0' => '<a href="http://www.w3.org/TR/xhtml11/">XHTML 1.1</a> plus <a href="http://www.w3.org/TR/MathML2/">MathML 2.0</a>',
  'en-MathML 2.0' => '<a href="http://www.w3.org/TR/MathML2/">MathML 2.0</a>',
  'en-XHTML 1.1' => '<a href="http://www.w3.org/TR/xhtml11/">XHTML 1.1</a>',
  'en-WML 1.3' => '<abbr class=initialism title="Wireless Markup Language">WML</abbr> 1.3',
  'en-WML 1.2' => '<abbr class=initialism title="Wireless Markup Language">WML</abbr> 1.2',
  'en-WML 1.1' => '<abbr class=initialism title="Wireless Markup Language">WML</abbr> 1.1',
  'en-WML 1.0' => '<abbr class=initialism title="Wireless Markup Language">WML</abbr> 1.0',
  'en-XHTML Basic' => '<a href="http://www.w3.org/TR/xhtml-basic/">XHTML Basic</a>',
  'en-ISO/IEC 15445:2000' => '<a href="http://www.purl.org/NET/ISO+IEC.15445/15445.html">ISO/IEC 15445:2000</a>',
  'en-XHTML 1.0 Strict' => '<a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a> Strict',
  'en-XHTML 1.0 Transitional' => '<a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a> Transitional',
  'en-XHTML 1.0 Frameset' => '<a href="http://www.w3.org/TR/xhtml1/">XHTML 1.0</a> Frameset',
  'en-HTML 4.01 Strict' => '<a href="http://www.w3.org/TR/html401/">HTML 4.01</a> Strict',
  'en-HTML 4.01 Transitional' => '<a href="http://www.w3.org/TR/html401/">HTML 4.01</a> Transitional',
  'en-HTML 4.01 Frameset' => '<a href="http://www.w3.org/TR/html401/">HTML 4.01</a> Frameset',
  'en-HTML 4.0 Strict' => '<a href="/reference/html40/">HTML 4.0</a> Strict',
  'en-HTML 4.0 Transitional' => '<a href="/reference/html40/">HTML 4.0</a> Transitional',
  'en-HTML 4.0 Frameset' => '<a href="/reference/html40/">HTML 4.0</a> Frameset',
  'en-HTML 3.2' => '<a href="/reference/wilbur/">HTML 3.2</a>',
  'en-HTML 3.2 + Style' => '<a href="http://www.w3.org/TR/NOTE-html-970421">HTML 3.2 + Style</a>',
  'en-HTML 3.0 Draft' => '<a href="http://www.w3.org/MarkUp/html3/">HTML 3.0 Draft</a>',
  'en-HTML 2.0 + i18n' => '<a href="http://info.internet.isi.edu/in-notes/rfc/files/rfc2070.txt" title="RFC 2070: Internationalization of the Hypertext Markup Language">HTML 2.0 + <acronym title="Internationalization">i18n</acronym></a>',
  'en-HTML 2.0' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a>',
  'en-HTML 2.0 Strict' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a> Strict',
  'en-HTML 2.0 Level 1' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a> Level 1',
  'en-HTML 2.0 Strict Level 1' => '<a href="http://www.w3.org/MarkUp/html-spec/">HTML 2.0</a> Strict Level 1',
  'en-Unknown' => 'Unknown',
  'en-Unknown (XML)' => 'Unknown (XML)'
);

# Location of HTML fragments
# Each file name must end with ".xx" where "xx" is the two-letter language
# code, but the ".xx" extension must not be given in this variable
my $beginningHTML = "$templates/header.htmlf";
my $beginningHTMLwithInput = "$templates/headerWithInput.htmlf";
my $endingHTML = "$templates/footer.htmlf";

# Maximum number of extra characters to include in the HTML extract on
# either side of the source of the error
my $extraChars = 30;

# Maximum number of URLs to check in batch mode
my $maxURLs = 100;

#####################################################################

#####################################################################
#
# The rest of the script...
#
#####################################################################

# Make URI module use RFC 3986 rule for extra dot-segments
local $URI::ABS_REMOTE_LEADING_DOTS = 1;

# Flush output buffer
$| = 1;

### Get user input ###

my $query = new CGI;

# URL of document to check
my $url = &trim($query->param('url'));

# Whether or not to spider the site
my $spider = $query->param('spider');

# Whether or not to hide output from valid pages
my $hidevalid = $query->param('hidevalid');

# URLs of documents to check (for batch validation)
my $urls = &trim($query->param('urls'));

# Uploaded file
my $file = $query->param('file');

# Direct HTML input
my $area = $query->param('area');

# Whether or not to show user's input
my $input = $query->param('input');

# Character encoding of uploaded file
my $charset = $query->param('charset');

# Is the custom DTD XML?
my $xml = $query->param('xml');

# Whether or not to include warnings
my $warnings = $query->param('warnings');

#############

# Only English is currently supported
my $lang = 'en';

my $document;
my $lastModified = "";
my $outputURL = "";
my %links;
my @documentLinks = ();
my @urls = ();
my $parser;
my $multipleURLs;
my $userAgent;

# It's hard to spider a site without a URL
if ($spider && !$url) {
    undef $spider;
}

# Check which input method was used
if ($url) { # URL input

    # Check if the URL to be validated is the referring URL
    if ($url =~ /^referr?er$/i) {
        my $referer = $ENV{'HTTP_REFERER'};
        if (length($referer) > 0) {
            $url = $referer;
        } else {
            &printHeader('ISO-8859-1', $lang);
            &printFile("$beginningHTML.$lang");
            &error('Your browser did not send a referring <abbr class=initialism title="Uniform Resource Locator">URL</abbr>.');
        }
    }

    # Check if URL is valid
    if ($url =~ m#(?:https?)|(?:ftp)://\S#i) {

        if ($spider) {
            $multipleURLs = 1;
            @urls = ($url);
        } else {

            my $xmlType;

            # Fetch document
            ($url, $charset, $lastModified, undef, $xmlType) = getDocument($url);
            $outputURL = escapeHTML($url);

            if ($xmlType) {
                $xml = $xmlType;
            } else {
                $xml = $query->param('xml');
            }
        }

    } else {
        &printHeader('ISO-8859-1', $lang);
        &printFile("$beginningHTML.$lang");
        &error("Invalid <abbr class=initialism title=\"Uniform Resource Locator\">URL</abbr>: " . escapeHTML($url));
    }

} elsif (defined $area && length $area) { # HTML input directly

    $document = $area;

} elsif (defined $file && length $file) { # HTML file uploaded

    while (<$file>) {
        $document .= $_;
    }
    close $file || &badUploadError;

} elsif (!($urls && $urls =~ /\S/o)) { # No input

    &printHeader('ISO-8859-1', $lang);
    &printFile("$beginningHTML.$lang");
    &error("No input");

}

# Check for batch validation
if ($urls) { # batch validation of URLs
    # Adjust newlines
    if ($urls =~ /\n/o) {
        $urls =~ s/\r//go;
    } else {
        $urls =~ s/\r/\n/go;
    }

    $multipleURLs = 1;
    @urls = split(/\n/, $urls);

} elsif (!$spider) {
    # Add dummy URL so that the validation loop executes once
    push(@urls, 1);
}

if ($multipleURLs) {

    # Print beginning of output page
    &printHeader('UTF-8', $lang);
    if ($input) {
        &printFile("$beginningHTMLwithInput.$lang");
    } else {
        &printFile("$beginningHTML.$lang");
    }
}

my $urlsChecked = 0;
while (@urls) {

    my $url = shift @urls;

    if ($multipleURLs) {

        $document = "";
        $url = &trim($url);
        next unless $url;

        # Check if we're over our limit of URLs
        if ($urlsChecked >= $maxURLs) {
            print "<div class=checkedDocument>\n<p><em>Batch validation is limited to $maxURLs URLs at one time. The remaining URLs were not checked.</em></p>\n</div>\n";
            last;
        }

        # Check if URL is valid
        if ($url =~ m#^(?:https?)|(?:ftp)://\S#io) {

            my ($success, $xmlType);

            # Create an HTML link extractor
            if ($spider) {
                @documentLinks = ();
                $parser = HTMLLinkExtractor->new(\&linkExtractorCallback);
            }

            # Fetch document
            ($url, $charset, $lastModified, $success, $xmlType) =
                getDocument($url);
            $outputURL = escapeHTML($url);

            if ($xmlType) {
                $xml = $xmlType;
            } else {
                $xml = $query->param('xml');
            }

            next unless $success;

        } else {
            print "<div class=checkedDocument><p>Invalid <abbr class=initialism title=\"Uniform Resource Locator\">URL</abbr>: " . escapeHTML($url) . "</p></div>";
            next;
        }

    }


    # Determine character encoding of output page
    my $encodingMsg = " <li>" . $characterEncoding{$lang};

    unless ($charset) {
        # Check for UTF-16 byte-order mark
        if ($document =~ /^\xFE\xFF/o || $document =~ /^\xFF\xFE/o) {
            $charset = 'UTF-16';
        }

        # Check for XML-style charset
        elsif ($document =~ m#^\s*<\?xml\s[^>]*encoding\s*=\s*["']?([^"']+)#iso)
        {
            $charset = $1;
        }

        # Check for a META element specifying the character encoding
        elsif ($document =~ m#<META(\s[^>]*http\-equiv\s*=\s*["']?Content\-Type["']?[^>]*)>#iso) {
            my $metaAttributes = $1;
            if ($metaAttributes =~ m#\scontent\s*=\s*["']?.*[\s;]charset\s*=\s*['"]?([^"']+)#iso) {
                $charset = $1;
            }
        }
    }

    # If we don't know the charset and we're checking XML, use UTF-8.
    unless ($charset) {
        # There is some code repetition here with our DOCTYPE
        # checking code later on.  I need to think more carefully
        # about how best to do DOCTYPE and charset checking
        # for XHTML documents.
        if ($document =~ /<!DOCTYPE([^>]*)>/iso) {

            my $doctypeMeat = $1;
            if ($doctypeMeat =~ /PUBLIC\s+["']([^"']*)["']/iso) {
                my $htmlLevel = $HTMLversion{"PUBLIC \"$1\""};
                if (($htmlLevel && $xhtml{$htmlLevel})
                    || (!$htmlLevel && $xml))
                {
                    $charset = 'UTF-8';
                }
            }
        }
    }

    if ($charset) {

        my $enteredCharset = $charset;

        # Get IANA name for character encoding
        my $charsetName = iana_charset_name($charset);

        # Get preferred MIME name
        if ($charsetName) {
            $charset = $MIMECharset{$charsetName};
        } else { # Check for non-IANA charset
            $charsetName = map8_charset_name($charset);
            if ($charsetName) {
                $charset = $MIMECharset{$charsetName};
            }
        }

        if ($charsetName && $charset) {
            $encodingMsg .= " $charset</li>\n";

            if ($charset =~ /^UTF-16/o) {
                # Convert to UTF-8
                my $unicodeDocument = utf16($document);

                if ($charset eq 'UTF-16') {
                    # Check byte-order
                    $unicodeDocument->byteswap if $unicodeDocument->ord == 0xFFFE;
                } elsif ($charset eq 'UTF-16LE') {
                    $unicodeDocument->byteswap;
                }

                $document = $unicodeDocument->as_string;
                $charset = 'UTF-8';

            # This is a quick hack to add basic support for ISO-2022-JP
            } elsif ($charset eq 'ISO-2022-JP') {

                # Write document to temporary file for conversion
                my ($convTempName, $convTempFH) = getTempFile();
                print $convTempFH "$document";
                close($convTempFH);

                # Convert ISO-2022-JP document to Shift_JIS
                open(JCONV, "$jconv -ij -os < $convTempName |")
                  || &error("Server error converting ISO-2022-JP document");

                $document = "";
                while (<JCONV>) {
                    $document .= $_;
                }
                close(JCONV);

                unlink("$convTempName");

                $charset = 'Shift_JIS';

            }

        } else {
            $encodingMsg .= " " . escapeHTML($enteredCharset)
                . "--<strong>not supported, assuming ISO-8859-1</strong></li>\n";
            $charset = 'ISO-8859-1';
        }

    } else {

        $encodingMsg .= " Unknown; assuming ISO-8859-1</li>\n";
        $charset = 'ISO-8859-1';

    }

    # Print beginning of output page
    unless ($multipleURLs) {
        &printHeader($charset, $lang);
        if ($input) {
            &printFile("$beginningHTMLwithInput.$lang");
        } else {
            &printFile("$beginningHTML.$lang");
        }
    }

    my @errors; # queue of errors
    my @externalErrors; # queue of errors in an external DTD

    # Amount to decrease line count by (i.e., if we add a DOCTYPE)
    my $lineAdjust = 0;

    my $validatorInput;
    my $noValid = 0; # check for validity by default

    # Adjust line-endings (nsgmls doesn't recognize Mac newlines)
    $document =~ s#\r(?!\n)#\n#go;

    # If we're doing a batch validation, convert the input to UTF-8
    if ($multipleURLs) {
        if ($multibyte{$charset} && $charset ne 'UTF-8') {

            my $convType;
            SWITCH: {
                if ($charset eq 'Shift_JIS') {
                    $convType = 'j';
                    last SWITCH;
                }
                if ($charset eq 'GB2312') {
                    $convType = 's';
                    last SWITCH;
                }
                if ($charset eq 'Big5') {
                    $convType = 't';
                    last SWITCH;
                }
                if ($charset eq 'EUC-JP') {
                    $convType = 'e';
                    last SWITCH;
                }
                if ($charset eq 'EUC-KR') {
                    $convType = 'k';
                    last SWITCH;
                }
            }

            # Write document to temporary file for conversion
            my ($convTempName, $convTempFH) = getTempFile();
            print $convTempFH "$document";
            close($convTempFH);

            open(CONV, "$cjkvconv -i$convType -ou8 < $convTempName |") || &error("Server error running cjkvconv");

            $document = "";
            while (<CONV>) {
                $document .= $_;
            }
            close(CONV);

            unlink("$convTempName");

        } elsif ($charset ne 'UTF-8' && $charset ne 'US-ASCII') {

            my $batchmap = Unicode::Map8->new($charset)
                || &error("Server error: Unable to create character encoding map for $charset");

            # Pass through invalid characters
            $batchmap->nostrict;

            # Convert document to UTF-8
            $document = $batchmap->tou($document)->utf8;

        }

        $charset = 'UTF-8';
    }

    # Determine the level of HTML
    my $htmlLevel = 'Unknown';
    if ($document =~ /<!DOCTYPE([^>]*)>/iso) {

        my $doctypeMeat = $1;
        if ($doctypeMeat =~ /PUBLIC\s+["']([^"']*)["']/iso) {
            $htmlLevel = $HTMLversion{"PUBLIC \"$1\""} || 'Unknown';
        }

        $validatorInput = $document;

        if ($htmlLevel eq 'Unknown' && $xml) {
            $htmlLevel = 'Unknown (XML)';
        }

    } else { # Missing DOCTYPE

        # If the document is XML, just check for well-formedness,
        # not validity.
        if ($xml) {

            $htmlLevel = 'Unknown (XML)';
            $validatorInput = $document;
            $noValid = 1;
            
        } else {

            # Add a default DOCTYPE
            my ($insertedDoctype, $doctypeError);
            if ($document =~ /<FRAMESET/io) {
                $insertedDoctype = $defaultFramesetDoctype;
                $doctypeError = $noFramesetDoctype{$lang};
            } else {
                $insertedDoctype = $defaultDoctype;
                $doctypeError = $noDoctype{$lang};
            }

            # Remove byte-order mark from UTF-8 document so we don't have to
            # bother slipping the added DOCTYPE after it.
            if ($charset eq 'UTF-8') {
                $document =~ s/^\xEF\xBB\xBF//o;
            }

            $validatorInput = "$insertedDoctype" . "\n$document";

            # Calculate line adjustment
            $lineAdjust = 2;

            # Add error message
            push(@errors, "::" . (1 + $lineAdjust) . ":0:E: $doctypeError");

        }
    }

    # Determine whether we're dealing with HTML or XHTML
    if ($xhtml{$htmlLevel}) {
        $ENV{'SGML_CATALOG_FILES'} = $xhtmlCatalog;
    } else {
        $ENV{'SGML_CATALOG_FILES'} = $htmlCatalog;
    }

    # Prepare links for the specified level of HTML and language
    &loadLinks($htmlLevel, $lang);

    # Prepare an array of lines in the document for easy access to a given line
    my @lines = split(/\n/, $document);

    # If necessary, convert to a character encoding (UTF-8) recognized by nsgmls
    my $map;
    if (!($multipleURLs) && $conversionNeeded{$charset}) {

        $map = Unicode::Map8->new($charset)
            || &error("Server error: Unable to create character encoding map for $charset");

        # Pass through invalid characters
        $map->nostrict;

        # Convert document to UTF-8
        $validatorInput = $map->tou($validatorInput)->utf8;

    }

    # Put the document in a temporary file
    my ($tempname, $tempfh) = getTempFile();
    print $tempfh "$validatorInput";
    close($tempfh);

    my $noValidCmd = '';
    if ($noValid) {
        $noValidCmd = '-wno-valid';
    }

    my $warningsCmd = '';
    if ($warnings) {
        if ($xml || $xhtml{$htmlLevel}) {
            $warningsCmd = "$nsgmlsXMLWarnings";
        } else {
            $warningsCmd = "$nsgmlsWarnings";
        }
    }

    # Run the validator
    $ENV{'SGML_SEARCH_PATH'} = $sgmlSearchPath;
    $ENV{'SP_CHARSET_FIXED'} = 1;
    $ENV{'SP_ENCODING'} = $encodings{$charset};
    open(NSGMLS, "$nsgmls -b$encodings{$charset} $noValidCmd $warningsCmd $sgmlDecl{$htmlLevel} $tempname 2>&1 |")
        || &error("Server error running nsgmls");

    my $maxErrors = 0;

    # Create a queue of errors
    while (<NSGMLS>) {
        chomp;

        # Convert character encodings, if necessary
        if (defined($map)) {
            $_ = $map->to8(utf8($_)->ucs2);
        }

        my @error = split(/:/, $_, 6);

        if ($error[1] eq 'I') {
            if ($error[2] =~ /maximum number of errors/o) {
                $maxErrors = 1;
            }
        } elsif ($error[4] eq 'E' || $error[4] eq 'X') {

            # With warnings enabled in non-XML validation, some "errors"
            # reported by nsgmls are probably better reported as "warnings"
            # since they are only reported with warnings enabled.
            if ($warnings && !($xml || $xhtml{$htmlLevel})) {
                if ($errorAsWarning{$error[5]}) {
                    $error[4] = 'W';

                    # lq-nsgmls uses an XML-specific message for one of
                    # these warnings.  Let's try something more helpful
                    # for HTML.  [This still doesn't seem very good;
                    # anyone have suggestions for improvement?]
                    if ($error[5] eq " net-enabling start-tag not supported in {{XML}}") {
                        $error[5] = " net-enabling start-tag; possibly missing required quotes around an attribute value";
                    }

                    $_ = join(':', @error);
                }
            }

            push(@errors, $_);

            # If the DOCTYPE is bad, bail out
            last if ($error[5] eq " unrecognized {{DOCTYPE}}; unable to check document");

        } elsif ($error[4] eq 'W') {

            unless ($error[5] eq " characters in the document character set with numbers exceeding 65535 not supported")
            {
                # Should we separate warnings more explicitly from errors?
                # For now let's lump them together.
                push(@errors, $_);
            }

        } elsif ($error[1] =~ /^<URL>/o) { # error from external DTD

            push(@externalErrors, $_);

        } elsif (length($error[4]) > 1 # Allow secondary messages about preceding error
            && $error[3] ne 'W') # Prevent error about SGML declaration not implied with -wxml
        { 

            push(@errors, $_);

        }

    }
    close(NSGMLS);

    # Delete temporary file
    unlink $tempname;

    # Remove byte-order mark from UTF-8 document in case we output the
    # line containing the byte-order mark
    if ($charset eq 'UTF-8') {
        $lines[0] =~ s/^\xEF\xBB\xBF//o;
    }

    if ($#errors > -1 || $#externalErrors > -1 || !$hidevalid) {
        # Print the URL of the document checked
        print "<div class=checkedDocument>\n" if $multipleURLs;
        print "<h2>$documentChecked{$lang}</h2>\n<ul>\n";
        if (length $outputURL) {
            print " <li>URL: <a href=\"$outputURL\">$outputURL</a></li>\n";
        } elsif (defined $file && length $file) {
            print " <li>File: " . escapeHTML($file) . "</li>\n";
        }
        print " <li>Last modified: " . escapeHTML($lastModified) . "</li>\n" if $lastModified;

        # Print character encoding information
        print "$encodingMsg";

        # Print level of HTML checked
        print " <li>$levelOfHTML{$lang} <strong>" . $htmlLink{"${lang}-$htmlLevel"} . "</strong></li>\n";
        
        if ($noValid) {
            print " <li>$wellformednessCheck{$lang}</li>\n";
        }
        print "</ul>\n";
    }

    # Report errors
    if ($#errors > -1 || $#externalErrors > -1) {

        if ($warnings) {
            print "<h2>$errorsAndWarningsHeading{$lang}</h2>\n";
        } else {
            print "<h2>$errorsHeading{$lang}</h2>\n";
        }

        print "<ul>\n";

        foreach (@externalErrors) {
            my @error = split(/:/, $_, 7);

            # Determine URL containing the error
            my $errorURL;
            if ($error[1] =~ /<URL>(.+)/o) {
                $errorURL = "$1:$error[2]";
            }

            my $lineNumber = $error[3];
            my $character = $error[4] + 1;

            my $escapedURL = escapeHTML($errorURL);
            print "<li><a href=\"$escapedURL\">$escapedURL</a>, " .
              lc($lineNumberText{$lang}) .
              "$lineNumber, $characterNumberText{$lang}$character: ";

            if ($error[6]) {
                print escapeHTML($error[6]);
            } else {
                print escapeHTML($error[5]);
            }
            print "</li>\n";
        }

        foreach (@errors) {
            my @error = split(/:/, $_, 6);

            # I don't think this should happen, but I'm not sure
            next if $#error < 4;

            # Determine line number and character of error
            my $lineNumber = $error[2] - $lineAdjust;
            next unless $lineNumber > 0;
            my $character = $error[3] + 1;

            if ($input) {
                my $urlNumber = "";
                if ($multipleURLs) {
                    $urlNumber = "${urlsChecked}-";
                }
                print "<li><a href=\"\#L$urlNumber$lineNumber\" onclick=\"highlight('$urlNumber$lineNumber')\">$lineNumberText{$lang}$lineNumber</a>";
            } else {
                print "<li>$lineNumberText{$lang}$lineNumber";
            }
            print ", $characterNumberText{$lang}$character:\n";

            my $oneChar = ($multibyte{$charset} || '.');

            # Extract relevant section of HTML source.

            # Perl segfaults on the extraction regexp with UTF-8 encoding and
            # very long lines (as seen with http://www.msn.com).  Just skip
            # the extract if conditions are ripe for such a segfault.
            if ($character < 10000 || $oneChar eq '.') {
                my ($line, $preMatch, $maxMatch, $spacesToAdd, $extract, $insertedSpaces, $tabcount, $lineLength);

                $line = superChomp($lines[$lineNumber-1]);
                $lineLength = ulength($line, $oneChar);
                $preMatch = max(0, $character - $extraChars);
                $maxMatch = 2 * $extraChars;

                if ($oneChar eq '.') {
                    $extract = substr($line, $preMatch, $maxMatch);
                } else {
                    ($extract) = ($line =~ /
                        (?:$oneChar)
                        {$preMatch}
                        ((?:$oneChar)
                        {1,$maxMatch})/x);
                }
                $spacesToAdd = $error[3];

                # Expand tabs in the first part of the string to ensure that
                # our character pointer lines up correctly
                ($insertedSpaces, $tabcount) = (0, 0);
                if ($extract =~ /\t/o) {
                    my ($firstPart, $secondPart) =
                        ($extract =~ /^(
                        (?:$oneChar)
                        {0,$spacesToAdd})
                        (.*)$/sx);
                    ($insertedSpaces, $tabcount, $firstPart) = tabExpand($firstPart);
                    $extract = "$firstPart$secondPart";
                    $spacesToAdd = $spacesToAdd - $tabcount + $insertedSpaces;
                }

                if (length($extract) > 0) {

                    $extract = "<code class=html>" . escapeHTML($extract) . "</code>";

                    # Check if the line was truncated for the extract
                    if ($preMatch > 0) {
                        $extract = "... $extract";
                        $spacesToAdd = $extraChars + 3 - $tabcount + $insertedSpaces;
                    }
                    if ($preMatch + $maxMatch < $lineLength) {
                        $extract = "$extract ...";
                    }

                    # Link element names in extract
                    $extract = linkElements($extract);

                    print "<pre>$extract\n";
                    print ' ' x $spacesToAdd;
                    print "<strong>^</strong></pre>\n";
                }
            }

            # Prepare error message, adding emphasis and links where appropriate
            my $errorMsg;
            if ($error[5]) {
                $errorMsg = superChomp(escapeHTML($error[5]));
            } else {
                $errorMsg = superChomp(escapeHTML($error[4]));
            }
            while ($errorMsg =~ m#\{\{(?:&quot;)?(.+?)(?:&quot;)?\}\}#gos) {
                my $linkText = $1;
                my $lcLinkText = lc($linkText);
                if ($links{$lcLinkText}) {
                    $errorMsg =~ s#\{\{(&quot;)?$linkText(&quot;)?\}\}# $1<a href="$links{$lcLinkText}">$linkText</a>$2#;
                } else {
                    $errorMsg =~ s#\{\{(&quot;)?$linkText(&quot;)?\}\}# $1$linkText$2#;
                }
            }

            # Workaround for the incorrect display of the following error:
            #    value of attribute "NOWRAP" cannot be ""; must be one of
            #    "NOWRAP"
            $errorMsg =~ s#&quot;&quot;#""#go;

            $errorMsg =~ s#&quot;(.+?)&quot;#<strong class=html>$1</strong>#g;

            print "<p>";
            if ($error[4] eq 'E' || $error[4] eq 'X') { # Error message
                print "$preError{$lang}";
            } elsif ($error[4] eq 'W') { # warning
                print "$preWarning{$lang}";
            }
            print "$errorMsg</p></li>\n";

        }
        print "</ul>\n";

        if ($maxErrors) {
            print "<p><em>The maximum number of errors was reached. Further errors in the document have not been reported.</em></p>\n";
        }

    } else { # no errors

        print "<p class=congratulations><strong>$noErrors{$lang}</strong></p>\n" unless $hidevalid;

    }

    # Show input if desired
    if ($input && (!$hidevalid || $#errors > -1 || $#externalErrors > -1)) {
        my $cite = "";
        $cite = " cite=\"$outputURL\"" if $outputURL;
        print "<h2>$inputHeading{$lang}</h2>\n<blockquote$cite><pre>";

        my $line;
        my $lineCount = 1;

        # Determine maximum number of digits for a line number
        my $maxNumLength = length($#lines + 1);

        foreach $line (@lines) {

            $line = superChomp($line);

            # Add spaces to right-align line numbers
            my $addedSpaces = $maxNumLength - length($lineCount);
            print ' ' x $addedSpaces;

            my $urlNumber = "";
            if ($multipleURLs) {
                $urlNumber = "${urlsChecked}-";
            }

            print "<span id=line$urlNumber$lineCount><a name=L$urlNumber$lineCount>$lineCount</a>   <code class=html>" . linkElements(escapeHTML($line)) . "</code></span>\n";
            $lineCount++;
        }
        print "</pre></blockquote>\n";
    }

    if ($multipleURLs) {
        $urlsChecked++;
        print "</div>\n" if (!$hidevalid || $#errors > -1 || $#externalErrors > -1);

        # Update list of links to spider
        my $link;
        foreach $link (@documentLinks) {
            if ($link->scheme =~ /^(?:https?)|(?:ftp)$/i) {
                push(@urls, $link);
            }
        }

    }
}

print "<p>Checked $urlsChecked page" . ($urlsChecked != 1 ? "s" : "") . ".</p>\n" if ($multipleURLs);

# Output footer
&printFile("$endingHTML.$lang");


# Fetch a document and return it
# Takes the URL as the first argument
# The URL is assumed to have been checked for basic validity (e.g., that it
# begins with "http://" or "ftp://").
# Calls &error if the document cannot be retrieved
sub getDocument {

    my $url = shift;

    if (!defined $userAgent) {
        if ($spider) {
            $userAgent = SpiderUA->new($spiderUA, $spiderFrom);
            $userAgent->delay(0);
        } else {
            $userAgent = new LWP::UserAgent;
            $userAgent->agent("$normalUA");
        }

        $userAgent->parse_head(0); # We'll parse the HEAD ourselves if needed.
        $userAgent->protocols_allowed(['http','https','ftp']);
    }

    # Prepare request
    my $request = new HTTP::Request 'GET' => $url;
    if (defined $acceptHeader) {
        $request->header(Accept => $acceptHeader);
    }

    # Receive response
    my $response;
    if ($spider) {
        $response = $userAgent->request($request, \&requestCallback);
    } else {
        $response = $userAgent->request($request);
    }

    # Determine URL of document.  This may be different from the original
    # request URL if we were redirected.
    if (defined $response->request) {
        $url = $response->request->url;
    }

    # Check return status
    if ($response->is_success) {

        # Bail out if we're spidering and we found a non-HTML/XML/SGML document
        return 0 if $spider && (!checkContentType($response->content_type)
            || ($response->content_encoding &&
                $response->content_encoding ne 'identity'));

        # Determine character encoding of document
        my $contentType = $response->header('Content-Type');
        my $charset = "";

        if ($contentType && $contentType =~ /[\s;]charset\s*=\s*"?([^,"\s]+)/io) {
            $charset = $1;
        }

        # Grab Last-Modified header
        my $lastModified = $response->header('Last-Modified');

        # Expand found links' URLs to absolute ones if spidering
        if ($spider) {
            # Parse the HEAD to pick up the <base> tag.  We do this
            # explicitly instead of using LWP::UserAgent's parse_head
            # feature because LWP only does the parse for text/html.
            # We need to check additional types such as application/xhtml+xml.
            require HTML::HeadParser;
            my $headParser = HTML::HeadParser->new($response->{'_headers'});
            $headParser->parse($document);
            $headParser->eof;

            my $base = $response->base;
            @documentLinks = map { $_ = URI->new_abs($_, $base); } @documentLinks;
        } else {
            # If we're not spidering, set the document to the content held by
            # the response object.  If we are spidering, the content is stored
            # as it's received and parsed.
            $document = $response->content;
        }

        return ($url, $charset, $lastModified, 1,
            isXMLType($response->content_type));

    } else {
        return 0 if ($spider && $response->message eq 'Forbidden by spider rules'); 

        &printHeader('ISO-8859-1', $lang) unless $multipleURLs;
        &printFile("$beginningHTML.$lang") unless $multipleURLs;
        my $outputURL = escapeHTML($url);
        &error("Error retrieving <a href=\"$outputURL\">$outputURL</a>: " . escapeHTML($response->message));
    }

    return 0;

}

# Return an error message
# Exit unless we have multiple URLs to validate
# The error message must be given as the first argument
sub error {

    my $error_message = shift;

    if ($multipleURLs) {
        print "<div class=checkedDocument><p>$error_message</p></div>\n";
    } else {
        print "<p>$error_message</p>\n";
        &printFile("$endingHTML.$lang");
        exit;
    }
}

# Trim leading and trailing whitespace from a string
# Takes a string as the first argument and returns the new string
sub trim {
    my $str = shift || return;
    $str =~ s/^\s+//go;
    $str =~ s/\s+$//go;
    return $str;
}

# Print HTTP headers
# Optional first argument is the character encoding of the HTML
# document; the default is ISO-8859-1
# Optional second argument is the language of the response
sub printHeader {

    my $characterEncoding = shift || 'ISO-8859-1';
    my $language = shift;

    print "Content-Type: text/html; charset=$characterEncoding";
    print "\015\012"; # CRLF

    print "Content-Language: $language\015\012" if $language;

    print "\015\012"; # CRLF

}

# Print the contents of a file
# The file name must be specified as the first argument
sub printFile {

    my $file = shift || return;

    open(FILE, "$file") || die("Unable to open file: $!");
    while (<FILE>) {
        print "$_";
    }
    close FILE;

}


# Encode unsafe characters in a file URL
sub encodeFileURL {

  my $url = shift || return;

  $url =~ s/ /\%20/go;
  $url =~ s/"/\%22/go;
  $url =~ s/\#/\%23/go;

  return $url;

}

# Return the maximum of two numbers
sub max {
    if ($_[0] > $_[1]) {
        return $_[0];
    } else {
        return $_[1];
    }
}

# Expand tabs in a string
# Return a list of the number of spaces inserted, the number of
# tabs removed, and the expanded string
# (This is a modified version of Text::Tabs::expand.)
sub tabExpand
{
    my @l = @_;

    my $tabstop = 8;
    my $totalSpacesAdded = 0;
    my $totalTabCount = 0;

    for $_ (@l) {
        my $spacesAdded;
        my $tabCount;
        while (s/(^|\n)([^\t\n]*)(\t+)/
            $1. $2 . (" " x 
                ($spacesAdded = ($tabstop * ($tabCount = length($3))
                - (length($2) % $tabstop))))
            /gsex)
        {
            $totalSpacesAdded += $spacesAdded;
            $totalTabCount += $tabCount;
        }
    }

    return ($totalSpacesAdded, $totalTabCount, @l);
}


# Populate the global hash table of links based on the level of HTML
# specified in the first argument and the language specified in the
# second argument
sub loadLinks {

    my $htmlLevel = shift || 'HTML 4.01 Transitional';
    my $lang = shift || 'en';

    my $lfile = $linksFile{"${lang}-$htmlLevel"};
    if ($lfile) {

        open(LINKS, "$lfile") || return;
        # Read links in and populate the hash table
        while (<LINKS>) {
            chomp;
            if (/(.+)\t(.+)/o) {
                $links{$1} = $2;
            }
        }

        close(LINKS);

    }

}


# Link element names in HTML code to the appropriate reference page
# The first argument is the input string
# Returns the string with links inserted
sub linkElements {

    my $code = shift || return;

    while ($code =~ m#&lt;([^\s&]+)#go) {
        my $linkText = $1;
        my $lcLinkText = lc($linkText);
        if ($links{$lcLinkText}) {
            $code =~ s#&lt;$linkText([\s&])#&lt;<a href="$links{$lcLinkText}">$linkText</a>$1#;
        }
    }

    return $code;
}


# Remove any newline characters (\r or \n) at the end of a string
# First argument is the string
# Returns the new string
sub superChomp {

    my $str = shift || return;
    $str =~ s/[\r\n]+$//o;
    return $str;

}


# Return an error message after a bad file upload
sub badUploadError {

    &printHeader('ISO-8859-1', $lang);
    &printFile("$beginningHTML.$lang");
    &error('Your browser does not appear to support form-based file upload. Please try one of our <a href="/tools/validator/">alternate methods of validation</a>.');

}


# Return the number of characters in a potentially-multibyte character string
# First argument is the string
# Second argument is a regular expression denoting a single character
# If the string is single-byte, the second argument should be '.' or omitted
sub ulength {

    my $str = shift || return 0;
    my $oneChar = (shift || '.');

    my $length = 0;

    if ($oneChar eq '.') {  # single-byte

        $length = length($str);

    } else {            # multibyte

        while ($str =~ /$oneChar/gos) {
            $length++;
        }

    }

    return $length;
}

# Callback to sift through extracted links in spider mode
sub linkExtractorCallback {
    foreach (@_) {
        # Remove fragment identifiers
        s/\#.*$//os;
        if ($_) {
            push(@documentLinks, $_);
        }
    }
}

# Return true if we like the Content-Type, false otherwise
# First argument must be the Content-Type, minus any parameters
sub checkContentType {
    my $type = shift;

    return ($type eq 'text/html'
        || $type eq 'text/xml'
        || $type eq 'application/xml'
        || $type eq 'application/xhtml+xml'
        || $type eq 'application/vnd.wap.xhtml+xml'
        || $type eq 'text/sgml'
        || $type eq 'application/sgml'
        || $type eq 'text/vnd.wap.wml'
        || $type eq 'text/x-wap.wml');
}

# Return true if the Content-Type indicates an XML document, false otherwise
# First argument must be the Content-Type, minus any parameters
sub isXMLType {
    my $type = shift;

    return ($xmlMediaTypes{$type} || $type =~ /\+xml$/oi);
}

# Callback to check if a response is an HTML document when spidering
sub requestCallback {
    # Check if request is text/html
    my $contentType = $_[1]->content_type;
    if ($contentType) {
        if (checkContentType($contentType)) {
            $document .= $_[0];
            $parser->parse($_[0]);
        } else {
            die "Resource is not an HTML document";
        }
    }
}

# Create temporary file securely
# Returns the name and file handle of the created file
sub getTempFile {
    my $filename;
    do {
        $filename = POSIX::tmpnam();
    } until sysopen(FH, $filename, O_RDWR|O_CREAT|O_EXCL, 0666);

    return ($filename, \*FH);
}

# Escapes a string for output in HTML
sub escapeHTML {
    my $str = shift;
	$str =~ s/&/&amp;/gos;
	$str =~ s/</&lt;/gos;
	$str =~ s/>/&gt;/gos;
	$str =~ s/"/&quot;/gos;
	return $str;
}

