#!perl -w
#
# Form2WSDL Converter
# March 2005
# by Ross Shannon URL requested: “".$url."”. URL examined: “" .$examinedUrl."” Content-Type: “".$contentType."”.
After following HTTP redirects we were served and examined the page: “" .$examinedUrl."”.
";
&outputFormsInformation;
# Don't output any WSDL if no forms existed on the page
if (scalar(@forms)) {
&outputWSDL;
}
print "
";
&outputOriginalSource;
&outputTidiedSource;
&print_footer();
exit(0);
##################################################################
# Functions
# Fetches document and returns it. Assumes basic validation check has been done on the URL.
sub getDocument {
my $url = shift;
# Configure the UA
$userAgent = LWP::UserAgent->new;
$userAgent->agent("$botUA");
$userAgent->from("$botFrom");
$userAgent->timeout(20);
$userAgent->proxy(['http', 'ftp', 'https'], 'http://proxy5.ucd.ie:8585');
$userAgent->no_proxy('localhost', 'htmlsource',);
# Send request
$request = HTTP::Request->new('GET', $url);
# Get response
$response = $userAgent->request($request);
# Check on the URL we've gotten back. This may be different to the URL was asked for due to redirects etc.
if (defined $response->request) {
$examinedUrl = $response->request->url;
}
# Check how the bot got on
if ($response->is_success) {
# Determine character encoding of document (handy if we need to deal with encoding, erk!)
$contentType = $response->header('Content-Type');
unless (checkContentType($contentType)) {
&print_error("This file was returned to us with the Content-Type “".$response->header('Content-Type')."” which this program does not accept. This commonly happens if you supply the URL of an image or file in a data format that is not a form of SGML. If you are sure that this is not the case, check your server setup to make sure this file is being served correctly.", 1);
}
# Grab Last-Modified header
my $lastModified = $response->header('Last-Modified');
# Store the response text
$document = $response->content;
} else {
# Something's gone wrong with the request
&print_error("Could not get page “$url”. Please check that the address is valid and that the page exists.
The server’s response was: “".$response->status_line."”", 1); } } # Writes fetched page to file, runs it through tidy and outputs the tidied code sub tidyDocument { my $url = shift; # Write page to file open(SOURCEFILE, ">sourcefile.txt"); print SOURCEFILE $document; close SOURCEFILE; # run through tidy, save in place (online: /usr/local/bin/tidy ) open(TIDY, "tidy.exe -f tidyerrors.txt -config tidyconfig.txt -i -wrap 150 -asxhtml -n -m sourcefile.txt 2>&1 |") || &print_warning("Could not tidy the document. This means the markup we’re parsing may not be of optimum quality, but will probably not affect the output."); close(TIDY); } # Parses the page for HTML forms sub extractForms { @forms = HTML::Form->parse(&readFile("sourcefile.txt"), $response->base()); # Find elements that require more complex parsing (enumerated types) # Whisk through the forms on the page and push any complex elements into the 2d array # We'll iterate through that again when we're writing the WSDL my $i = 0; my $j = 0; foreach my $form (@forms) { foreach my $input ($form->inputs) { if ($input->type eq "radio" or $input->type eq "option") { $complexElements[$i][$j++] = $input; } } $i++; $j = 0; } # $complexElements[0][0] now holds the first complex type in the first form, etc } # Creates a parse tree from the HTML file and then traverses is to find useful elements # such as meta refresh commands and labels for form elements sub makeParseTree { my $tree = HTML::TreeBuilder->new(); #empty tree $tree->parse_file("sourcefile.txt"); my $h = $tree; $h->elementify(); # $h should now be my @children = $h->content_list; foreach my $rootel (@children) { if ($rootel->tag() eq "head") { # We've found the
element my @headchildren = $rootel->content_list; foreach my $headel (@headchildren) { if ($headel->tag() eq "title") { my @a = $headel->content_list; $pageTitle = $a[0]; } # Now search for meta refresh elsif ($headel->tag() eq "meta" and $headel->attr('http-equiv') eq "refresh") { $_ = &trim($headel->attr('content')); # Syntax: 5; url=newurl.html /^[0-9]*; url=(.*)/; $metaRefreshUrl = $1; my $pageUrl = URI::URL->new($metaRefreshUrl, $examinedUrl); $metaRefreshUrl = $pageUrl->abs; # Resolve to absolute URL } } } else { $h = $rootel; # Make it the tag. Slight optimization for further traversals } } # Run through the 'work' array ($complexElements) and grab the necessary elements for my $form (0 .. $#complexElements) { for my $elem (0 .. $#{$complexElements[$form]}) { ##print "element $form-$elem is ", $complexElements[$form][$elem]->type, "
"; my $tempelem; my @tempelems; # Get radio values if ($complexElements[$form][$elem]->type eq "radio") { @tempelems = $h->look_down("_tag", "input", "type", "radio", "name", $complexElements[$form][$elem]->name); # This was the bastard causing all the trouble. I was calling the array # in scalar context, unbeknownst to me. Turns out sticking brackets around # the @tempelems array copies it to a new anonymous array which is then # passed by reference. Well, obviously. $complexValues[$form][$elem] = [ @tempelems ]; } # Get select values # Clever hack: instead of treating option values differently to radio buttons # later, we simply add a name attribute to each one, which makes them # behave exactly as a group of radio buttons. if ($complexElements[$form][$elem]->type eq "option") { $tempelem = $h->look_down("_tag", "select", "name", $complexElements[$form][$elem]->name); ##print $tempelem->attr('_tag'); @tempelems = $tempelem->look_down("_tag", "option"); for my $i (0 .. $#tempelems) { $tempelems[$i]->attr("name", $complexElements[$form][$elem]->name); } $complexValues[$form][$elem] = [ @tempelems ]; } ##print "Values for $form-$elem-0, aka ", $complexValues[$form][$elem][0]->attr("name"), " : ", $complexValues[$form][$elem][0]->attr("value"), "
"; } } # Get submit values # Get all forms (again...), as we only want submits from one form at a time my @f = $h->look_down("_tag", "form"); for my $form (0 .. $#forms) { ##print $forms[$form]->action; ##print $f[$form]->attr('action'); my @tempelems = $f[$form]->look_down("_tag", "input", "type", "submit"); my @tempimageelems = $f[$form]->look_down("_tag", "input", "type", "image"); ##print @tempimageelems[0]->attr('type'); push @tempelems, @tempimageelems; @submits[$form] = [ @tempelems ]; } ##print "\n", $submits[0][0]->attr('name'); } # Trim leading and trailing whitespace from a string # Takes a string as the first argument and returns the new string sub trim { my $str = shift || return; $str =~ s/^\s+//go; $str =~ s/\s+$//go; return $str; } # Prints error page. If second argument is 1, the script'll die right here. sub print_error { my $message = shift; my $shoulddie = shift; print "$message
\n"; print "Warning: $message
\n"; print "" . trim($query->escapeHTML($document)) . ""; print "
" . $query->escapeHTML(&readFile("sourcefile.txt")) . ""; print "
There are no forms on this page.
"; } elsif ($fc == 1) { print "There is one form on this page.
"; } else { print "There are ".$fc." forms on this page.
"; } print "\n"; my $i = 1; foreach my $form (@forms) { print "\n".$form->method."
\n"; print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n"; #print "<schema targetNamespace=\"http://$domain.com/schemas\" #xmlns=\"http://www.w3.org/2000/10/XMLSchema\">\n"; #print "</schema>\n"; print "<definitions name=\"".$domain."\"\n targetNamespace=\"urn:".$domain."\"\n xmlns=\"http://schemas.xmlsoap.org/wsdl/\"\n xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\"\n xmlns:soap=\"http://schemas.xmlsoap.org/wsdl/soap/\">\n"; # Types #print "<types>\n"; #print "<!-- no complex types yet -->\n"; #print "</types>\n\n"; # Messages (i.e. form elements) my $i = 1; foreach my $form (@forms) { print "\n<message name=\"Message$i\">\n"; foreach my $input ($form->inputs) { if ($input->type eq "radio" or $input->type eq "select" or $input->type eq "option") { # It's an enumerated type, so we need to work it &writeEnumerated($input->name, $i, $input->type); } else { unless ($input->type eq "submit" or $input->type eq "image") { # It's a simple type, so just fire in the name and work out an xsd type print " <part name=\"".$input->name."\" type=\"xsd:".evaluate_type($input->name, $input->type, $input->value)."\"/>\n" if $input->name and $input->type ne "reset"; } } } # Finally, write submit button choice, if there is one &writeSubmits($i); print "</message>\n"; $i++; } # Output webpage message print "\n<message name=\"MessageOut\">\n"; print " <part name=\"Webpage\" type=\"xsd:string\"/>\n"; print "</message>\n"; # Ports (i.e. operations, i.e. forms) print "\n<portType name=\"".$domain."Port\">\n"; my $i = 1; foreach my $form (@forms) { print " <operation name=\"FormNumber$i\">\n"; print " <input message=\"Message$i\"/>\n"; print " <output message=\"MessageOut\"/>\n"; print " </operation>\n"; $i++; } print "</portType>\n"; # Binding print "\n<binding name=\"".$domain."Binding\" type=\"typens:".$domain."Port\">\n"; print " <soap:binding style=\"rpc\" transport=\"http://schemas.xmlsoap.org/soap/http\"/>\n"; my $i = 1; foreach my $form (@forms) { print " <operation name=\"FormNumber$i\">\n"; print " <soap:operation soapAction=\"urn:".$domain."Action\"/>\n"; print " <input>\n"; print " <soap:body use=\"encoded\" namespace=\"urn:".$domain."\" encodingStyle=\"http://schemas.xmlsoap.org/soap/encoding/\"/>\n"; print " </input>\n"; print " <output>\n"; print " <soap:body use=\"encoded\" namespace=\"urn:".$domain."\" encodingStyle=\"http://schemas.xmlsoap.org/soap/encoding/\"/>\n"; print " </output>\n"; print " </operation>\n"; $i++; } print "</binding>\n"; print "\n<service name=\"".$domain."Service\">\n"; print " <port name=\"".$domain."Port\" binding=\"typens:".$domain."Binding\">\n"; print " <soap:address location=\"http://www.yourhtmlsource.com/projects/Form2WSDL/\n"; print " invoke.pl?url=".$examinedUrl."\"/>\n"; print " </port>\n"; print "</service>\n"; print "\n</definitions>\n"; print ""; } #Writes the WSDL for complex enumerated elements like radio buttons, selects and multi-submits sub writeEnumerated { my $name = shift; my $formNumber = shift; $formNumber--; # we started the count at 1, thus 'formNumber1' etc. my $type = shift; print " <simpleType name=\"$name\">\n"; print " <restriction base=\"xsd:string\">\n"; my $j = 0; while ($complexValues[$formNumber][$j]) { if ($complexValues[$formNumber][$j][0]->attr('name') eq $name) { my $i = 0; while ($complexValues[$formNumber][$j][$i]) { print " <enumeration value=\"",$complexValues[$formNumber][$j][$i]->attr("value"), "\"></enumeration>\n"; $i++; } } $j++; } print " </restriction>\n"; print " </simpleType>\n"; } # Write the submits for a form as an enumeration, IF there's more than one sub writeSubmits { my $formNumber = shift; $formNumber--; # start at 0 # Don't print anything if there's only one submit, as that doesn't represent a choice if ($#{$submits[$formNumber]} > 0) { print "\n <simpleType name=\"submit\">\n"; print " <restriction base=\"xsd:string\">\n"; my $j = 0; while ($submits[$formNumber][$j]) { print " <enumeration value=\"", $submits[$formNumber][$j]->attr('name'), "\"></enumeration>\n"; $j++; } print " </restriction>\n"; print " </simpleType>\n"; } } # Evaluates the type of a message sub evaluate_type { my $name = shift; my $type = shift; my $value = shift; if ($type eq "checkbox" or ($value eq "true" or $value eq "false")) { return "boolean"; } elsif ($value =~ /^[0-9]+$/ and $type ne "radio") { return "int"; } elsif ($value =~ /^[0-9]+.[0-9]+$/ and $type ne "radio") { return "float"; } elsif ($value =~ /^[http https ftp]+:/ and $type ne "password") { return "anyURI"; } elsif ($value =~ /^[0-9]{2,4}[\/ -][0-9]{2}[\/ -][0-9]{2,4}$/) { return "date"; } return "string"; } # Prints standard start to all pages sub print_header { print "Content-type: text/html; charset=ISO-8859-1\n\n"; # if the script is being asked to convert the referring document, we extract this URL now my $badref = 0; if ($url =~ /referr?er/) { if ($ENV{'HTTP_REFERER'}) { $url = $ENV{'HTTP_REFERER'}; } else { $badref = 1; } } print <
Your user agent is “$browser”
Your conversion occured at $timestamp. Thank you and good night.
HERE }