#! /usr/local/bin/perl # $Id: metaprox.pl,v 1.91 2004/07/04 14:29:34 user Exp $ =head1 NAME metaprox - an HTTP meta-proxy (i.e., a proxy proxy) =head1 SYNOPSIS metaprox [--port=8338] [--database=/tmp/foo] [--daemon] =head1 DESCRIPTION Are you having problems accessing certain web sites through privacy-enhancing web proxies like squid, ijb and privoxy? Perhaps you'd like to go through proxies for some web sites and not others? If so, you're in luck. This program allows you to do just that. This program works as an HTTP proxy. In other words, you configure your web browser to send requests to it. It examines the request URL and looks up the domain name in an internal table. This table, coupled with a default, determines which proxy, if any, to which it should forward requests. If it should not use a proxy, it forwards it to the ultimate destination directly. This proxy is different than others in that it does not require nor use a configuration file. This proxy recognizes certain HTTP requests as commands for it to execute. Currently it recognizes URLs with any of the hostnames of the machine it is running on, as long as the port number is correct. In other words, if you are running this proxy on your local machine, then telling your web browser to fetch "http://localhost:8338/" will be recognized by metaprox and you will see the metaprox GUI in your browser. You may always use "metaprox" as a special case, even if that is not one of your machine's hostnames. You should avoid configuring any domains as not requiring a proxy in your browser or via the no_proxy environment variable. This program is designed to replace that functionality. This is a win because you can use multiple web browsers and not worry about configuring them individually. =head1 INSTALLATION You will probably need to run the following commands to use this program: # perl -MCPAN -e shell cpan> force install POE cpan> force install POE::Component::Client::HTTP Sadly, I have never gotten POE to install without using "force" keyword. =head1 USE First start the program: yourhost$ ./metaprox.pl --daemon If you have a firewall on this machine, configure it to allow incoming connections to whatever port metaprox listens to (the default is 8338). Next, configure your browser to use the proxy wherever you ran it. For some browsers, you can configure them through environment variables: anyhost$ export http_proxy=http://yourhost:8338/ anyhost$ your_browser If you are launching your browser from a GUI, you may want to put this environment variable in your startup scripts (.profile). To configure metaprox, simply access the following URL: http://metaprox:8338/ =head1 RECREATING DATABASE To store the database, run this command: lynx -source 'http://metaprox:8338/script' > /tmp/foo.sh Then, edit it as desired and run it to restore the database: sh /tmp/foo.sh =head1 TODO Allow picking proxies for portions of a site (match URLs instead of domains). Consider treating default entry more generally as a proxy for domain "". Allow chaining proxies. Allow picking a proxy at random from a list. Determine which HTTP headers are necessary to get a particular page to load. Get SSL working. Automatically determine the capabilities of proxies. =head1 COPYRIGHT Copyright 2004 auto92089@hushmail.com This program is free for non-commercial use. Contact author for terms of commercial use. =cut use strict; use warnings; use integer; # This is the default listen port. my $port = 8338; # This is the default database name. my $database_name = "metaprox.dat"; # Should we detach from the terminal? my $daemon = 0; # Allow a user to override the port on the command line. use Getopt::Long; # Process command-line options. my $result = GetOptions("port=i" => \$port, "database=s" => \$database_name, "daemon" => \$daemon); BEGIN { package ProxyMap; use Storable; sub new { my ($class) = @_; my $self = {}; $self->{"map"} = {}; $self->{"list"} = []; bless $self, $class; return $self; } ## These routines handle the domain-to-proxy mapping. sub add_mapping { my ($self, $domain, $proxy) = @_; my $old_proxy = $self->{"map"}->{$domain}; $self->{"map"}->{$domain} = $proxy; return $old_proxy; } sub get_mapping { my ($self, $domain) = @_; return $self->{"map"}->{$domain}; } sub delete_mapping { my ($self, $domain) = @_; return delete $self->{"map"}->{$domain}; } # Save our state to a file. sub save { my ($self, $file) = @_; store $self, $file; } # Load from an external file. sub load { my ($self, $file) = @_; my $hashref = retrieve($file); %$self = %$hashref; } # Return current state as a human-readable string. sub stringify { my ($self) = @_; use Data::Dumper; return Data::Dumper->Dump([$self], ["self"]); } # Return the current state as an HTML table. sub htmlify { my ($self) = @_; my $html = ""; $html .= "\n"; foreach (sort keys %{$self->{"map"}}) { my $proxy = $self->{"map"}; $html .= "\n"; } $html .= "
Domain NameProxy
$_$proxy
\n"; return $html; } # Get the proxy appropriate for a given domain. # This is guaranteed to return a defined value. # XXX We could store the default as an entry with empty key. sub get_proxy { my ($self, $request_uri) = @_; my $domain = $request_uri->host; # From "www.perl.com", create list ("www.perl.com", "perl.com", "com"). sub parent_domains { my ($domain) = @_; my @parts = split(/\./, $domain); my @doms = (); while (@parts) { push(@doms, join('.', @parts)); shift(@parts); } return(@doms); } my $proxy; foreach (parent_domains($domain)) { defined($proxy = $self->get_mapping($_)) and return $proxy; } return $self->get_default; } # Return domain:proxy pairs in a reasonable order. sub dump_sorted_mapping { my ($self) = @_; my @mapping = (); foreach my $key (sort keys %{$self->{"map"}}) { push(@mapping, [ $key, $self->{"map"}->{$key} ]); } return @mapping; } ## These routines manage the proxy list. sub add_list { my ($self, $proxy) = @_; return if scalar(grep { $_ eq $proxy } @{$self->{"list"}}); push(@{$self->{"list"}}, $proxy); } sub del_list { my ($self, $proxy) = @_; @{$self->{"list"}} = grep { $_ ne $proxy } @{$self->{"list"}}; } sub dump_list { my ($self) = @_; return @{$self->{"list"}}; } ## These routines control the default proxy. # The HTTP proxy to be used by the HTTP Client is set on every HTTP # request, so we need not do anything here. sub set_default { my ($self, $proxy) = @_; $self->{"default"} = $proxy; } sub get_default { my ($self) = @_; return $self->{"default"}; } } my $proxy_map = ProxyMap->new(); # Daemonize self if not in debug mode. if ($daemon) { my $r = fork; defined($r) or die "could not fork: $!\n"; exit 0 if $r; # TODO: setsid(); chdir("/") or die "could not chdir to /: $!\n"; close(STDIN) or die "could not close STDIN: $!\n"; close(STDOUT) or die "could not close STDOUT: $!\n"; # NB: Uncommenting the following line causes the first client request # to not have its file descriptor closed. Both the POE kernel # and the TCP server sometimes write to STDERR. # close(STDERR) or die "could not close STDERR: $!\n"; } use POE qw(Component::Client::HTTP Component::Server::TCP Filter::HTTPD); use HTTP::Response; # Spawn a web client to fetch requests through. POE::Component::Client::HTTP->spawn( Alias => 'ua' ); # Load or initialize the ProxyMap. if (-r $database_name) { $proxy_map->load($database_name); } else { # Break encapsulation to save the default proxy. my $ua_session = $poe_kernel->alias_resolve("ua"); my $ua_heap = $ua_session->get_heap(); my $proxy_list = $ua_heap->{"proxy"}; if (defined($proxy_list)) { # Arbitrarily pick the first one. my $encoded_proxy = $$proxy_list[0]; my $uri = URI->new(); $uri->scheme("http"); $uri->host($$encoded_proxy[0]); $uri->port($$encoded_proxy[1]); $uri->path("/"); $proxy_map->set_default($uri); } else { $proxy_map->set_default(""); } # Break encapsulation to pre-load any no_proxy sites. my $no_proxy = $ua_heap->{"no_proxy"}; if (defined($no_proxy)) { foreach (@$no_proxy) { $proxy_map->add_mapping($_, ""); } } } # Spawn a web server. POE::Component::Server::TCP->new ( Alias => "web_server", Port => $port, ClientFilter => 'POE::Filter::HTTPD', ClientInput => \&handle_http_request, InlineStates => { got_response => \&handle_http_response }, ); POE::Kernel->run(); $proxy_map->save($database_name); exit 0; # This package implements the client interface to the proxy. BEGIN { package ProxyCommand; # Dispatch table for commands sent by client. my $commands = { "add" => \&cmd_add, "query" => \&cmd_query, "delete" => \&cmd_delete, "config" => \&cmd_config, "" => \&cmd_config, "use" => \&cmd_use, "add_proxy" => \&cmd_add_proxy, "del_proxy" => \&cmd_del_proxy, "dump" => \&cmd_dump, "script" => [ "text/plain", \&cmd_script ], }; sub cmd_add { my (%q) = @_; # XXX Make this a nice error page. (defined($q{"domain"}) and defined($q{"proxy"})) or die "add: not enough arguments @_"; # Update the global mapping. $proxy_map->add_mapping($q{"domain"}, $q{"proxy"}); return cmd_config() . "
Added proxy \"" . $q{"proxy"} .
	    "\" for domain " . $q{"domain"} . "
\n"; } sub cmd_query { return $proxy_map->htmlify(); } sub cmd_delete { my (%q) = @_; (defined($q{"domain"})) or die "del: not enough arguments @_"; # This is the old proxy, defined if one actually existed. my $old_proxy = $proxy_map->delete_mapping($q{"domain"}); my $html = cmd_config(); if (defined($old_proxy)) { $html .= "
Deleted proxy \"$old_proxy\" for "
		. $q{"domain"} . "
\n"; } return $html; } sub cmd_use { my (%q) = @_; (defined($q{"proxy"})) or die "use: not enough arguments @_"; my $proxy = $q{"proxy"}; $proxy_map->set_default($proxy); return cmd_config(); } sub cmd_add_proxy { my (%q) = @_; die "add_proxy: not enough arguements @_" unless defined $q{"proxy"}; $proxy_map->add_list($q{"proxy"}); $proxy_map->set_default($q{"proxy"}); return cmd_config(); } sub cmd_del_proxy { my (%q) = @_; die "del_proxy: not enough arguments @_" unless defined $q{"proxy"}; $proxy_map->del_list($q{"proxy"}); return cmd_config() . "
Proxy deleted from list.
"; } # Dump a series of URLs, which if followed, would re-create the # current state of the database. sub cmd_dump { my $html = ""; my @plist = $proxy_map->dump_list; $html .= join("
\n", map { "$_" } @plist); $_ = $proxy_map->get_default(); $html .= "
\n$_
\n"; foreach ($proxy_map->dump_sorted_mapping) { my ($dom, $prox) = @$_; $html .= ""; $html .= "proxy for $dom
\n"; } return $html; } sub cmd_script { my $text = "#! /bin/sh\n"; $text .= "CMD=\"lynx -source\"\n"; $text .= "NULL=/dev/null\n"; $text .= "BASE=\"http://metaprox:8338/\"\n\n"; foreach ($proxy_map->dump_list) { $text .= "\$CMD \$BASE\'add_proxy?proxy=$_\' >> \$NULL\n"; } $_ = $proxy_map->get_default; $text .= "\$CMD \$BASE\'use?proxy=$_\' >> \$NULL\n"; foreach ($proxy_map->dump_sorted_mapping) { my ($dom, $prox) = @$_; $text .= "\$CMD \$BASE\'add?proxy=$prox&domain=$dom\' >> \$NULL\n"; } return $text; } # Show internal state of the ProxyMap and give a web-based UI # for the other commands. sub cmd_config { my $html = ""; $html .= "\nDefault proxy: " . $proxy_map->get_default(); $html .= "
\nProxy List:\n
"; my @plist = $proxy_map->dump_list; $html .= join(' ', map { "$_" } @plist); $html .= "\n
Add and Use Proxy:\n
"; $html .= <<"EOF";
EOF $html .= "\n
\n"; $html .= ""; $html .= "\n"; foreach ([ "Last URL", $ProxyCommand::last_request ], [ "Last HTML", $ProxyCommand::last_html ], [ "Last Clicked", $ProxyCommand::last_clicked ]) { if (defined $_->[1]) { my $domain = $_->[1]->host; $html .= "\n"; $html .= <<"EOF"; EOF } } $html .= "
DescriptionDomainProxyCommand
$_->[0]$domain
\n"; $html .= "\n
\n
"; $html .= ""; $html .= <<"EOF"; EOF foreach ($proxy_map->dump_sorted_mapping) { my ($dom, $prox) = @$_; my $appearance = ($prox eq "") ? "[no proxy]" : $prox; $html .= <<"EOF"; EOF } $html .= "
DomainProxyCommand
$dom $appearance
"; return $html; } # Note that this is a class method, not an instance method. sub command { my ($request) = @_; my $req_uri = $request->uri; # Get pathname (stuff after port but before question mark). my $cmd = $req_uri->path; # Strip the leading slash. $cmd =~ s!^/!!; # Prepare most of a response. my $html_hdr = "MetaProx\n\n"; my $html_ftr = ""; # Create a response for this command. my $response = HTTP::Response->new(); use HTTP::Status; my $command_descriptor = $commands->{$cmd}; my $command_type = "text/html"; # default type my $command_ref = $command_descriptor; # default my $text; # full response # Process any commands we have received. if (defined $command_descriptor) { # If the command descriptor is an array, the first element # contains its type (usually text/plain). if (ref $command_descriptor eq 'ARRAY') { $command_type = $$command_descriptor[0]; $command_ref = $$command_descriptor[1]; } # Get all the name/value pairs from after the question mark. # If the user repeats a key with different values, # this might give either answer (it's undefined). # So don't do that. my %q = $req_uri->query_form; my $body = &$command_ref(%q); if ($command_type eq "text/html") { $text = $html_hdr . $body . $html_ftr; } else { $text = $body; } $response->code(RC_OK); $response->message("Command Accepted"); } else { $text = $html_hdr . "Invalid command." . $html_ftr; $response->code(RC_METHOD_NOT_ALLOWED); $response->message("Command Rejected"); } $response->header('Content-Type' => $command_type); $response->content($text); return $response; } } # Everything from here down is POE stuff. # Handle HTTP requests from the client. Pass them to the HTTP # client component for further processing. Optionally dump the # request as text to STDOUT. sub handle_http_request { my ( $kernel, $heap, $request ) = @_[ KERNEL, HEAP, ARG0 ]; # If the request is really a HTTP::Response, then it indicates a # problem parsing the client's request. Send the response back so # the client knows what's happened. if ( $request->isa("HTTP::Response") ) { $heap->{client}->put($request); $kernel->yield("shutdown"); return; } sub get_valid_host_names { my ($name) = @_; my ($fullname,$aliases,$addrtype,$length,@addrs) = gethostbyname $name; return ("localhost", "metaprox", $fullname, split ' ', $aliases); } # TODO: Avoid using external programs. my $hostname = `uname -n`; chomp $hostname; my @hostnames = get_valid_host_names($hostname); my $req_uri = $request->uri; # Some browsers send relative URIs when accessing metaprox. my $base_uri = URI->new(); $base_uri->scheme("http"); $base_uri->host_port($request->header("Host")); # Convert those relative URIs into an absolute one. my $r = $req_uri->abs($base_uri); sub is_in { my $el = shift; return scalar(grep $_ eq $el, @_); } # Find out if it was a command to this proxy. # If we were sent a relative URL, the browser knows this is not a proxy. # Otherwise, try to match all parts of the URL. # NOTE: Do not use != as == is magical and detects relative URIs. if (!($r == $req_uri) or ($r->scheme eq "http" and $r->port eq $port and is_in($r->host, @hostnames))) { my $response = ProxyCommand::command($request); # Return $reponse to the client instead of fetching anything. $heap->{client}->put($response); $kernel->yield("shutdown"); return; } # Client::HTTP doesn't support keep-alives yet. $request->header( "Connection", "close" ); $request->header( "Proxy-Connection", "close" ); $request->remove_header("Keep-Alive"); # See what proxy we should use. # XXX Should we set the proxy only if it has changed? my $proxy = $proxy_map->get_proxy($r); # Break encapsulation to access the specified proxy. my $ua_session = $kernel->alias_resolve("ua"); my $ua_heap = $ua_session->get_heap(); # If we should use a particular proxy for this domain, do it. if ($proxy eq "") { # If specified as an empty string, use no proxy at all. delete $ua_heap->{"proxy"}; } else { # If user specified a proxy, use that instead of default. my $proxy_uri = URI->new($proxy); # This is a listref of proxies, each being a listref. $ua_heap->{"proxy"} = [ [ $proxy_uri->host, $proxy_uri->port ] ]; } $kernel->post( "ua" => "request", "got_response", $request ); } # Handle HTTP responses from the POE::Component::Client::HTTP we've # spawned at the beginning of the program. Send each response back # to the client that requested it. Optionally display the response # as text. sub handle_http_response { my ( $kernel, $heap ) = @_[ KERNEL, HEAP ]; my $http_request = $_[ARG0]->[0]; my $http_response = $_[ARG1]->[0]; # I should probably use get/set methods but I'm lazy. $ProxyCommand::last_request = $http_request->uri; $ProxyCommand::last_html = $http_request->uri if $http_response->content_type eq "text/html"; $ProxyCommand::last_clicked = $http_request->uri if defined $http_request->referer; # Avoid sending the response if the client has gone away. $heap->{client}->put($http_response) if defined $heap->{client}; # Shut down the client's connection when the response is sent. $kernel->yield("shutdown"); }