#! /usr/bin/perl
###############################################################################
#
#  Squid-Prefetch (v0.5)
#
#  Written by Brian White <bcwhite@pobox.com>.
#  This program has been placed in the public domain (the only true "free").
#
###############################################################################

use URI;
use Net::HTTP;


@ConfFiles		= qw(./squid-prefetch.conf $ENV{HOME}/.squid-prefetch /etc/squid-prefetch.conf);
$FetchPattern	= 'http://.*(\.(html?|te?xt)|/[^\.]*)';

$AccessFile		= "";
$LastAccess		= 0;
%Config;
%Squid;
%DoneTime;
%DoneCount;
%DoneFetch;
%DonePrefetch;
@DoneList;
@LinkList;



###############################################################################



sub uniq
{
	return () unless @_;

	my($last) = "";
	my(@new)  = ();
	foreach (@_) {
		next unless $_;
		if ($_ ne $last) {
			$last = $_;
			push(@new,$_);
		}
	}
	return @new;
}



sub RandomizeArray
{
	my($array,$count) = @_;
	my $i;

	$count = @$array unless ($count > 0 && $count <= @$array);
	for ($i=$[; $i < $count; $i++) {
		my $random = int(rand($count));
		my $temp;

		$temp 			= $$array[$random];
		$$array[$random]= $$array[$i];
		$$array[$i]		= $temp;
	}

	return scalar @$array;
}



###############################################################################



sub ReadConfig
{
	my($file,$conf) = @_;
	my($parm,$valu);

	open(CONF,"<$file") || die "Error: could not read config file '$file' ($!)\n";
	while (<CONF>) {
		chomp;
		s/\#.*$//;
		next if m/^\s*$/;

		if (($parm,$valu) = m/^(\w+)\s+(.*?)\s*$/) {
			$conf->{$parm} = $valu;
		}
	}
	close(CONF);
}


sub ConfigValue
{
	my($parm,$default) = @_;

	return $Config{$parm} if (exists $Config{$parm});
	return $Squid{$parm}  if (exists $Squid{$parm});
	return $default;
}



###############################################################################



sub ReadAccessLog
{
	my(@pages);

	if (time() - $LastAccess > 900) {
		open(ACCESS,"<$AccessFile") || die "Error: could not read access log $AccessFile ($!)\n";
		seek(ACCESS,0,2); # go to end of file
		$LastAccess = time();
	}

	while (<ACCESS>) {
		$LastAccess = time();
		@_ = split;
		next unless ($_[3] =~ m!/2! && $_[5] eq "GET" && $_[6] =~ m!http://! && $_[9] =~ m!^text/!);
		push @pages,$_[6];
	}

	return @pages;
}



sub FetchUrl
{
	my($url) = @_;
	my($data,@links);
	my($host,$path) = ($url =~ m!http://(?:[^/]+@)?([^/]+)(/.*?)(\#.*)?$!);
	unless ($host && $path) {
		print STDERR "Warning: could not parse URL $url\n";
		return;
	}

	# limit how long we will spend doing the fetch
	local $SIG{ALRM} = sub { die "Error: Timeout fetching $url\n" };
	alarm(3);

	my $http = Net::HTTP->new(PeerHost => $ProxyHost, PeerPort => $ProxyPort, Host => $host, SendTE => 0, KeepAlive => 0);
	$http->write_request("GET" => "http://$host$path", "Accept" => "text/html", "Cache-Control" => "only-if-cached", "User-Agent" => "Squid-Prefetch");
	my ($code,$mesg,%hdrs) = $http->read_response_headers();
	print "\nfetch: $url: $code ($mesg)\n";
	print "  $k  ->  $v\n" while (($k,$v) = each %hdrs);

	alarm(0);

	if ($code != 200) {
		print STDERR "Warning: fetch returned code $code ($mesg) for $url\n";
		return;
	}
	if ($hdrs{"Content-Type"} !~ m!^text/html($|;)!) {
		print STDERR "Warning: fetch returned non-html content-type \"",$hdrs{"Content-Type"},"\" for $url\n";
		return;
	}
	if ($hdrs{"Cache-Control"} =~ m/\bno-cache\b/) {
		print STDERR "Warning: no-cache directive for $url\n";
		return;
	}
	if ($hdrs{"X-Cache"} =~ m/^MISS\b/ || $hdrs{"X-Cache-Lookup"} =~ m/^MISS\b/) {
		print STDERR "Warning: squid didn't cache $url\n";
		return;
	}

	alarm(5);
	while (1) {
		my $bufr;
		my $size = $http->read_entity_body($bufr,4096);
#		print "\n$size : $bufr";
		last unless ($size > 0);

		$data .= $bufr;

		while ($data =~ m!<a[^>]+href\s*=\s*(\"|\'|)([^\"\'>\s]+)\1[^>]*>!gis) {
			my $uri = URI->new($2);
			my $lnk = $uri->abs($url);
			my $frg = ($lnk =~ s/(\#.*)$//); next if ($frg && !$FetchFragments);
			my $opt = ($lnk =~ s/(\?.*)$//); next if ($opt && !$FetchOptions);
			my($lnkh,$lnkp) = ($lnk =~ m!http://(?:[^/]+@)?([^/]+)(/.*)$!);
			next if (exists $DoneTime{$lnk});
			next if ($lnk !~ m!^$FetchPattern$!oi);
			next if ($lnkh ne $host && !$FetchCrossSite);
			print "found $lnk\n";
			unshift @links,$lnk;
		}

		$data =~ s!^.*($|<)!!;
	}
	alarm(0);

	@links = uniq(sort(@links));
	RandomizeArray(\@links);
	push @LinkList,@links;
}



sub PrefetchUrl
{
	my($url) = @_;
	my($total);
	my($host,$path) = ($url =~ m!http://(?:[^/]+@)?([^/]+)(/.*)$!);
	unless ($host && $path) {
		print STDERR "Warning: could not parse URL $url\n";
		return;
	}

	# limit how long we will spend doing the fetch
	local $SIG{ALRM} = sub { die "Error: Timeout fetching $url\n" };
	alarm(3);

	my $http = Net::HTTP->new(PeerHost => $ProxyHost, PeerPort => $ProxyPort, Host => $host, SendTE => 1, KeepAlive => 0);
	$http->write_request("GET" => "http://$host$path", "Accept" => "text/*", "User-Agent" => "Squid-Prefetch");
	my ($code,$mesg,%hdrs) = $http->read_response_headers();
	print "\nprefetch: $url: $code ($mesg)\n";
#	print "  $k  ->  $v\n" while (($k,$v) = each %hdrs);

	alarm(0);

	if ($code != 200) {
		print STDERR "Warning: fetch returned code $code ($mesg) for $url\n";
		return;
	}
	if ($hdrs{"Content-Type"} !~ m!^text/!) {
		print STDERR "Warning: fetch returned non-text content-type \"$hdrs{Content-Type}\" for $url\n";
		return;
	}
	if (exists $hdrs{"Content-Length"} && $hdrs{"Content-Length"} > $FetchMaxSize) {
		print STDERR "Warning: fetch returned oversize content-length ",$hdrs{"Content-Length"}," for $url\n";
		return;
	}

	alarm(5);
	while (1) {
		my $bufr;
		my $size = $http->read_entity_body($bufr,4096);
		last unless ($size > 0);

		$total += $size;
		last if ($total > $FetchMaxSize);
	}
	alarm(0);
}



###############################################################################


# read our config file
foreach $file (@ConfFiles) {
	if (-r $file) {
		ReadConfig($file,\%Config);
	}
}

# read Squid config file
ReadConfig(ConfigValue("squid_config_file","/etc/squid/squid.conf"),\%Squid);

# determine config information
$AccessFile		= ConfigValue("cache_access_log","/var/log/squid/access.log");
$ProxyHost		= ConfigValue("http_proxy","127.0.0.1");
$ProxyPort		= ConfigValue("http_port",3128);
$HistorySize	= ConfigValue("max_history_size",5000);
$HistoryAge		= ConfigValue("max_history_age",24*60*60);
$FetchPattern	= ConfigValue("prefetch_regex",$FetchPattern);
$FetchOptions	= ConfigValue("prefetch_options",0);
$FetchFragments	= ConfigValue("prefetch_fragments",1);
$FetchMaxSize	= ConfigValue("prefetch_maxsize",65536);
$FetchCrossSite	= ConfigValue("prefetch_cross",0);



# prefetch pages
while (1) {
	# read access log
	my @urls = ReadAccessLog();
	my $time = time();
	my @todo = ();

	# determine candidate pages that have recently been fetched
	while (@urls) {
		my $url = shift @urls;
		my $frg = ($url =~ s/(\#.*)$//); next if ($frg);
		my $opt = ($url =~ s/(\?.*)$//); next if ($opt);
		next if ($url !~ m!^$FetchPattern$!oi);

#		print STDERR "Note: user fetch of seen page $url\n" if ($DoneTime{$url} && !$DonePrefetch{$url});

		# remember this URL
		$DoneTime{$url} = $time;
		$DoneCount{$url}++;
		push @DoneList,$url;

		# ignore those pages that appear because we prefetched them
		if (exists $DonePrefetch{$url}) {
			delete $DonePrefetch{$url};
			next;
		}

		# determine if it's age makes it a candidate
		my $age = $DoneFetch{$url};
		next if ($time - $age < $HistoryAge);

		# add it to the todo list
		delete $DoneFetch{$url};
		push @todo,$url.$opt;
	}

	# remember any prefetched pages not found in log (because fetch failed)
	foreach (keys %DonePrefetch) {
		$DoneTime{$_} = $time;
		$DoneCount{$_}++;
		push @DoneList,$_;
		delete $DonePrefetch{$_};
		print STDERR "Warning: no log info for prefetch of $_ (donetime=$DoneTime{$_})\n";
	}

	# keep the todo list down to a reasonable size
	shift @todo while (scalar @todo > 1000);

	# fetch and analyze page from todo list
	while (@todo) {
		my $url = pop @todo;

		# ignore those pages we've already done prefetch for
		next if (exists $DoneFetch{$url});

		# fetch one page and analyze for links (saved to @LinkList)
		$DoneFetch{$url} = $time;
		eval { FetchUrl($url); };
		last;
	}

	# Keep list of links to a reasonable size
	shift @LinkList while (scalar @LinkList > 100);

	# prefetch one link from list
	while (@LinkList) {
		my $url = pop @LinkList;
		next if (exists $DoneTime{$url});

		$DonePrefetch{$url} = $time;
		eval { PrefetchUrl($url); };
		last;
	}

	# reduce the history size to be within limits
	while (scalar @DoneList > $HistorySize) {
		my $url = shift @DoneList;
		if (--$DoneCount{$url} <= 0) {
			print STDERR "Note: removing $url from history...\n";
			delete $DoneCount{$url};
			delete $DoneTime{$url};
			delete $DoneFetch{$url};
			delete $DonePrefetch{$url};
		}
	}

	# wait a moment before starting all over again
	sleep(1);
}
