package SpiderUA;

require LWP::RobotUA;
@ISA = qw(LWP::RobotUA);
$VERSION = "0.9.1";

require HTTP::Response;

use URI ();
use strict;

=head1 NAME

SpiderUA - A class for Web spiders

=head1 SYNOPSIS

  use SpiderUA;
  $ua = SpiderUA->new('my-spider/0.1', 'me@foo.com');
  ...
  # use it just like LWP::RobotUA
  $res = $ua->request($req);

=head1 DESCRIPTION

This class implements a user agent that is suitable for spidering
Web sites.  The class inherits from I<LWP::RobotUA> so that it behaves
as a Web robot should.  Additionally, I<SpiderUA> does not visit the
same URL twice, and it does not visit a URL that is outside the
scope of the original URL.

For example, if the original URL requested is
"http://www.example.com/foo/bar.html", then I<SpiderUA> will return
a 403 response--"Forbidden by spider rules"--for
"http://offsite.example.com/" and "http://www.example.com/baz.html"
while allowing "http://www.example.com/foo/" and
"http://www.example.com/foo/bar/baz.html". 

=head1 METHODS

The SpiderUA is a sub-class of LWP::RobotUA and implements the
same methods. The constructor differs slightly:

=over 4

=cut


=item $ua = SpiderUA->new($agent_name, $from)

Your spider's name and the mail address of the human responsible for
the spider (i.e. you) are required by the constructor.

=cut

sub new
{
    my($class,$name,$from) = @_;

    my $self = new LWP::RobotUA $name, $from;
    $self = bless $self, $class;

    $self;
}

sub request
{
    my($self, $request, $arg, $size) = @_;

    # If this is the first request, we need to set the start_uri, but we
    # don't want to set it until after the call to SUPER::request in case
    # the request is redirected.
    my $firstRequest = 0;
    unless (exists $self->{'first_request_flagged'}) {
        $firstRequest = 1;
        $self->{'first_request_flagged'} = 1;
    }

    my $res = $self->SUPER::request($request, $arg, $size);

    if ($firstRequest) {
        if (defined $res->request) {
            $self->{'start_uri'} = $res->request->url->canonical;
        } else {
            $self->{'start_uri'} = $request->url->canonical;
        }

        if ($self->{'start_uri'}->path =~ m#(.*/)#s) {
            $self->{'start_uri_base_path'} = $1;
        }
    }

    $res;
}

sub simple_request
{
    my($self, $request, $arg, $size) = @_;

    my $uri = $request->url->canonical;
    $request->url($uri);

    if (exists $self->{'start_uri'}) {
        # Check if our spider should visit this URI.  We visit a
        # URI if each of the following is true:
        # 1. the host and port match the starting URI;
        # 2. the URI contains the starting URI's base path;
        # 3. we have not previously visited the URI.

        unless ($uri->host_port eq $self->{'start_uri'}->host_port
            && index($uri->path, $self->{'start_uri_base_path'}) == 0
            && !exists $self->{'visited'}{$uri->as_string})
        {
            return new HTTP::Response
              &HTTP::Status::RC_FORBIDDEN, 'Forbidden by spider rules';
        }
    }

    # Perform the request
    my $res = $self->SUPER::simple_request($request, $arg, $size);

    $self->{'visited'}{$uri->as_string} = 1;

    $res;
}

1;

=back

=head1 SEE ALSO

L<LWP::RobotUA>, L<LWP::UserAgent>

=head1 COPYRIGHT

Copyright 2001-2002 Liam Quinn.

This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut

