#!/usr/bin/perl
use strict;
my ($proxyList, $quiet, $pauseOnCompletion, $dnsInstalled, 
	%options, $progname, $VERSION, @inFiles, $outFile, 
	$nameServer, $sel, $res, $maxQueries, %hostQ, %hostQuerySkt, 
	%count, $noPortOk);
	my $listEmpty;
use Getopt::Long;
use IO::Select;

## remove the '#' from statements below to activate them, or add 
#	one to deactivate them. 

$progname = $0;
$progname =~ s,.*[/\\],,;  # use basename only
$progname =~ s/\.\w*$//; # strip extension, if any

$VERSION = sprintf("%d.%02d", q$Revision: 1.32 $ =~ /(\d+)\.(\d+)/);

my @getopt_args = (
    'h',	# print usage
    'v',	# print version
    'f=s',  # read a configuration file
    'a',    # allow host only (don't require host:port, or host port etc.)
#   'w',    # wait for windows user on completion
    'q',	# no extra info, beeps etc.
	);

Getopt::Long::config("noignorecase", "bundling");
unless (GetOptions(\%options, @getopt_args)) {
    usage();
	}
if ($options{'v'}) {
    my $DISTNAME = 'findProxy ' . $VERSION;
    die <<"EOT";
This is findProxy $VERSION ($DISTNAME)

Author: wayne\@nym.alias.net

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
EOT

};

usage() if $options{'h'} || !@ARGV;

@inFiles = split(/\+/, $ARGV[0]);	#from the command line
$outFile=($ARGV[1] or "STDOUT");	#default to STDOUT

setConfigFileparams();

# some useful defaults if the user doesn't add them on the command 
# line or in the config file

$noPortOk = ($options{'a'} or $noPortOk or 0);	#allowing host instead of 
								#host:port etc. done with port=0
								#if this is not set, we ignore any addresses 
								#with no port. 

$quiet = ($options{'q'} or $quiet or 0);
#$pauseOnCompletion = ($options{'w'} or $pauseOnCompletion or 0); #this prog 
#												#writes to a file, 
#												#so don't need this on
$nameServer=($nameServer or '198.7.0.1'); #alter.net-2
#$nameServer=('194.170.1.7'); #emirates

############### don't modify anything below here  ###############
############### unless you know what you're doing ###############

unless ($dnsInstalled = eval('require Net::DNS'))	{
	print "Not using DNS name resolution because the Net::DNS module\n";
	print "is not installed";
	print "If you use ActiveState Perl, try \n";
	print "ppm install Net-DNS\n";
	print "in a command window\n"};

my %tested=();

my $nrTested=0;
%count=();
for $proxyList (@inFiles) {
#	$count{$proxyList}++;
	open(LIST, "<$proxyList") or die "can't open $proxyList:$!";
	undef $/;	#slurp it in
	my $list=<LIST>."\n";	#make sure there's a new 
					#line for the regex on the end
	close(LIST);
	$/="\n"; # ferkrissake, put it back
	unless ($quiet) {print "$list\n"};

	# This stuff is gross. Note the order is important. 
	# One day I'll do it right, with HTML::Parser
	# At the moment, they're not exactly robust - a small 
	# change to the format of any of these pages  might 
	# screw the regexes up.

	#for http://proxycheck.spylog.ru/
	$list =~ s/<td><font size=-1>\&nbsp;/ /ig;
	#for proxys4all message board
	$list =~ s/\n(\: )+/\n/ig;
	#for http://www8.big.or.jp/~000/CyberSyndrome/ 
	$list =~ s/<li><a href="JavaScript:OpenWindow\('//ig;
	#for http://allfreeweb.hypermart.net/cgi-bin/dbase/db.cgi
	$list =~ s/<TD bgcolor="#999999">//ig;
	$list =~ s/<font face="'Trebuchet MS',Arial,Verdana,sans-serif"//ig;
	$list =~ s/size="2"><b>//ig; 
	#for http://www.hackzone.ru/member/nethack/proxies.htm
	$list =~ s/\r\n        tppabs="http.*?\r\n/\r\n/ig; 
	#for http://www.hackzone.ru/member/nethack/proxies.htm
	$list =~ s/<td bgcolor="#0000\w\w">/ /ig; 
	#for some of MVlads lists
	$list =~ 
		s/(<li>)?<a href="(cgi-bin\/)?fp.pl\?hosts=[^\&]*\&ports=\d+">//ig; 
	$list =~ 
		s|22</a> <a href="shdb.pl\?key=||ig;
	#for http://www.uic.nnov.ru/~visy/internet/eng/sponsor/proxy.htm
	$list =~ s/<\/TD>\r\n\r\n<TD>\r\n<CENTER>/ /ig;	
	$list =~ s/\d+ stat //ig;
	#for http://www.hackzone.ru/member/nethack/proxies.htm
	$list =~ s/<\/td>[\r\n]+        //ig;	
	#for http://ssdd.virtualave.net/all00.html
	$list =~ s/<\/td>[\r]?\n\s+<td>\&nbsp<\/td>[\r]?\n\s+<td>/ /ig; 
	#for http://www.cl.spb.ru/sparta/list1.htm
	$list =~ s/<\/p>\r\n    <p>/\r\n/ig;	
	#for general html lists
	$list =~ s/<LI>//ig;			
	#for general html tables
	$list =~ s/<TR>//ig;  			
	$list =~ s/<[\/]?TD>/ /ig; $list =~ s/<TD ALIGN=RIGHT>//ig;
	#for my posted proxy.pac file format
	$list =~ s/proxies\[proxyNr\+\+\]//ig;	
	#for cyberarmy format
	$list =~ s/port//ig; $list =~ s/<!--\d\d\d\d--><b>//ig;	
	#for proxy.pac again, but maybe elsewhere too
	$list =~ s/["=]//ig; $list =~ s|\n//|\n|ig;		
	#for http://proxy.nikto.net/all_list.htm
	$list =~ s/<br>//ig; $list =~ s/<[\/]?b>/ /ig;	
	#for some posted lists
	$list =~ s/> //ig;
	#for statproxy report list
	$list =~ s/^proxy //ig;

	while 
#	($list =~ m|^\s*([^\s:\(]+\.[^\s:]+)[\s:]+[+-]?(\d+).*[\n\r]+|mg)
	($list =~ m|^\s*([^\s:\(\n\r]+\.[^\s:\n\r]+)[ \t:]*[+-]?(\d*).*?[\n\r]+|mg)
		{
		my ($host, $port) = ($1, $2);
#		print "$host...$port\n";
		if (!$port and $noPortOk) {$port=-1}; #unknown port
		next unless (($host) and ($port));
#			next if ($host =~ /^bess-/);	#:-)
		next if (($port<-1) or ($port>65535) or 
			($host !~ /^[a-zA-Z0-9\.\-]+$/));
		$count{$proxyList}++; #count valid proxies in this file, incl dupes
		next if ($tested{"$host:$port"});
		if ($host) {
			$tested{"$host:$port"}=1;
			$nrTested++;
			};
		};
#	print "total proxy:port entries (before DNS resolution) ";
#	print "after $proxyList: $nrTested\n";
	};
	
	
my %hosts=();	#for dupe checking and sorting
print "\n*********************\n";
print "sortProxy v$VERSION\n";
foreach (keys(%count)) {
	print "valid proxies in $_: $count{$_}\n";
	};
if ($nrTested==0) {
	print "No proxies to sort in the specified list(s)\n";
	print "Maybe there are none there or maybe the format is one\n";
	print "I don't understand - tell wayne\n";
	}
else	{
	if ($dnsInstalled)	{
		$res = new Net::DNS::Resolver;
		$res->nameservers($nameServer); #can't trust the default one
#		$res->usevc(1); #try tcp only
		$res->retrans(2); # retransmission time (default is 5)
		$res->retry(4); # nr of retries (default is 4)
		my $timeout=10; #time within which *no* socket can be read
		$maxQueries=50; #this simply determines the number of sockets which 
						#can get hung up before we can't resolve any more!
						#Windows has a limit (I'm told) of 64, silently 
						#ignoring any more. win9* is probably even worse.
		$sel = new IO::Select();
		#send the first $maxQueries queries
		$listEmpty=0;
		for (1..$maxQueries) {&addQuery; last if ($listEmpty)}; 
		$listEmpty=0;
		while (!$listEmpty) {
			while (my @ready = $sel->can_read($timeout)) {
				foreach my $sock (@ready) {
					my $packet = $res->bgread($sock);
					my $hp=$hostQ{$sock};
	#				print "hp from hash on socket read: $hp\n";
					&processPacket($packet, $res, $hp);
	#				$packet->print;
					$sel->remove($sock);
					$sock = undef;
					&addQuery;
					};
				};
			#when we drop through here, all the available sockets are 
			#probably blocked (and unlikely to ever receive the reply 
			#they're waiting for), so all we can do is reset the whole 
			#lot. Course it's possible they have all completed :-)
			#but that's handled by the listEmpty loop, so we just 
			#clear the sockets here
#			print "timeout: count: ", $sel->count, "\n";
			foreach my $sock ($sel->has_exception(5)) {
				print "socket exceptions encountered!\n";
				};
			for my $sock ($sel->handles) {
				$sel->remove($sock);
				undef($sock);
				for (1..$maxQueries) {&addQuery}; #send the next $maxQueries queries
				};
			};
		}
	else {	#no dns installed, fqdns are just '1'?
		};
	
	};
%count=();
#now sort by IP address (alphabetically) and print
open(OUT, ">$outFile") or die "can't open output file $outFile: $!";
printf OUT "%-16s %-6s%-30s\n", 'host', 'port', 'FQDN';
foreach my $hp (sort keys %hosts)	{
	my ($host, $port)=split(/:/, $hp); 
	$port='?' if ($port==-1); #make sure our output format is parseable 
								#by the other *Proxy programs
	my $hostFqdn=($hosts{$hp} or 'none');
#	printf OUT "%-16s:%-6u%-50s\n", $host, $port, $hostFqdn;
	printf OUT "%-16s:%-6s%-50s\n", $host, $port, $hostFqdn;
	$count{'result'}++;
	};
close OUT;
print "valid, unduped proxies in $outFile: $count{'result'}\n";
print STDOUT "\7\7"; # two beeps to finish even if -q used :-)
#wait for the MS crowd ...
#if ($pauseOnCompletion) {print STDOUT "press enter to exit "; <STDIN> };	
1;

sub processPacket {
	my ($packet, $res, $hp) = @_;
	my ($host, $port)=split(/:/, $hp); 
	my $hostFqdn='';
	if ($packet) {
		foreach my $rr ($packet->answer) {
			my $type=$rr->type;
#			print "$host:$port type $type\n";
			if ($type eq "A")	{
				$hostFqdn.=$host.", ";
				$host=$rr->address;
				}
			elsif ($type eq "PTR")	{
				$hostFqdn.=$rr->ptrdname.", ";
				};
			}
		}
	else {
		if ($host=~/^\d+\.\d+\.\d+\.\d+$/) {
#			$hostFqdn = "dns lookup failed: ".$res->errorstring;
			$hostFqdn='none';	#we have an IP address, but can't 
								#print a fqdn. No problems.
			}
		else {
#			$hostFqdn = $host." dns lookup failed: ".$res->errorstring;
			unless ($res->errorstring =~ /no nameservers/) {
				print "warning: can't resolve $host - removing \n";
				print "from the output file\n";
				$host='';	#we have an fqdn which does not resolve 
							#to an IP address. This one will simply 
							#be removed from the output file
							#this might happen because we're not 
							#connected to the Internet
				};
			};
		};
	undef $packet;
#	print "$host:$port $hostFqdn\n";
	if ($host) {($hosts{"$host:$port"}, undef) = $hostFqdn=~/^(.*?)(, )?$/};
	};

sub addQuery {
#	print $sel->count, " listEmpty: $listEmpty\n";
	return if ($listEmpty);
	return if ($sel->count >= $maxQueries);
	my ($host, $port);
	my ($hp, undef) = each(%tested); #next host from the list
	unless ($hp) {$listEmpty=1; return};
	($host, $port)=split(/:/, $hp);
	my $skt = $res->bgsend($host); #will do PTR if it needs to	
#	print "added: $host:$port\n";
	$sel->add($skt); #this returns the number added, but it's always one here 
	$hostQ{$skt}=$hp;	#keep socket fh so we can get with host:port when 
								#resolver finishes
	};
	
sub setConfigFileParams {
	if ($options{'f'}) { #user specified a configuration file
		open(CONFIG, "<$options{'f'}") or die "can't open the config file";
		my @config=<CONFIG>;
		close(CONFIG);
		foreach (@config) {
			chomp;
			next if (/^#/);
			my ($var,$val) = split(/=>/);
			next if (eval("defined(".$var.")"); #make sure the command line options 
								#override any config file settings
			eval("$var=".$val); #set the variables to config file values
			};
		};
	};


sub usage	{	# command line options
    die <<"EOT";
Usage: $progname [-options] <input file>{[+<input file>]} <output file>
    -v              Show program version
    -h              Print this message
    -a              Allow addresses only (no port required) 
    -f <file>       Read a configuration file (should contain a fast 
                    nameserver address at least)
    -q              Quiet - don't list the proxy list (default noisy)

examples: sortproxy -q largeFile.txt+extraFile.txt outFile.txt
          perl sortproxy.pl -q largeFile.txt+extraFile.txt outFile.txt
EOT

};

__END__

=head1 NAME

sortProxy

=head1 SYNOPSIS

sortProxy [-h] [-v] [-a] [-f file] [-q] [-w] <file name>{[+<file name>]} 
	<output file name>

=head1 OPTIONS

=over 4

=item -v              Show program version
    
=item -h              Print this message

=item -a              Allow list full of addresses only (no port required)
    
=item -q              Quiet - don't list the proxy list, and no beeps etc.
                     (default noisy)
                     
=back

=head1

=head1 DESCRIPTION

This program will merge and sort lists of web proxies in local files. 
Proxies are sorted by IP address, dupes are removed. When run while 
connected to the Internet, it will include domain names in the 
results, otherwise it removes non-numeric addresses.

=head1 REQUIREMENTS:

=over 4

=item * a system with perl (http://www.activestate.com/ for win32 systems, 
http://www.perl.org/ for linux)

=item * perl module Net::DNS 

=back

=head1

=head1 VERSION RELEASE HISTORY

=over 4

=item * 00/12/17 v1.0 initial release

=item * 00/12/18 v1.1 released: DNS reliability improved for non-UAE, 
but maybe speed compromised for this.

=item * 00/12/22 v1.20 released: added configuration file capability 
and parallel DNS lookups. This blazes along (even using a USA DNS 
server)!

=item * 00/12/23 v1.21 released: minor fixes (didn't work with a 
list of less than number of available sockets)

=item * 00/12/28 v1.30 released to beta testers: now handles proxy 
lists with no ports given (option -a)

=item * 00/12/29 v1.31 minor fixes

=item * 00/1/3 v1.32 reorganized command line and config params. 

=back

=head1

=head1 TO RUN IT

=over 4

=item * start it from the command shell by typing:

 perl sortProxy.pl [options] <file>[+<file>]...[+<file>] <outputFile>
 
 Note that any spaces in your filename need to be escaped, 
 or the filename will get munged by the shell before sortProxy even 
 sees  it. You can enclose  the filename in double quotes in MS 
 Windows or single quotes  in Unix.
 
=item * print this documentation in a pretty format by typing:

 perldoc sortProxy.pl
 
=back

=head1 TO DO

=over 4

=item * option to specify that the input format is the findProxy output 
format, so this program can parse the reference time and fqdn. That will 
allow sorting by ref time, or save execution time by using the fqdn 
instead of DNS lookup. Knowing the IP address and fqdn for *all* the 
proxies means that you can sort properly without being connected to 
the Internet as well.

=back

=head1

=head1 KNOWN PROBLEMS

=over 4

=item * none

=back

=head1

=head1 VARIABLES INSIDE THE CODE

$pauseOnCompletion: set this if you don't want the program to 
exit (mainly so that people can double-click this thing 
and keep the command window open at the end).

=head1 AUTHOR

wayne@nym.alias.net (http://www.angelfire.com/wy/1waynes/)

=cut

