#!perl -w # # Form2WSDL Converter # March 2005 # by Ross Shannon # ################################################################################ # Pull in required Perl modules use strict; use CGI; use URI::URL; use CGI qw(:all); use CGI::Carp qw(fatalsToBrowser); use LWP::UserAgent; use HTML::Form; use HTML::TreeBuilder; use HTML::Element; ################################################################################ # Set constants and initialise global variables my $doc_root = $ENV{'DOCUMENT_ROOT'}; # filesystem location my $timestamp = scalar localtime(); # report timestamp # User-Agent string and contact address for robot my $version = "0.9"; my $botUA = "Form2WSDL/$version (http://www.yourhtmlsource.com/projects/Form2WSDL/)"; my $botFrom = 'ross.shannon@gmail.com'; # A little metadata... my $browser = $ENV{'HTTP_USER_AGENT'}; ################################################################################ # Variables, variables everywhere my $userAgent; my $document; my $request; my $response; my @forms; my @complexElements; my @complexValues; my @submits; my $parsetree; ################################################################################ # Get the data sent to this script from the form my $query = new CGI; # URL of document to check my $url = &trim($query->param('url')); # Eventual URL, taking redirects into account my $examinedUrl; my $contentType; my $pageTitle; my $metaRefreshUrl; ################################################################################ # Do some processing &print_header(); # First check if it's URL input. Most probably will be, but we'll leave the door open. if ($url) { # Check if URL is valid if ($url =~ m#(?:https?://)|(?:ftp://)\S#i) { #URL is valid (enough), go ahead &getDocument($url); &tidyDocument($url); &extractForms($url); &makeParseTree($url); } else { # URL is invalid, print error message and exit. &print_error("Invalid URL: “" . $query->escapeHTML($url) . "”", 1); } } else { &print_error("No URL specified.", 1); } ################################################################################ # Start Output. We'll only get here if there haven't been any 'quickdeath' errors so far. # Primary metadata print "
"; unless ($url eq $examinedUrl) { print "

URL requested: “".$url."”.
After following HTTP redirects we were served and examined the page: “" .$examinedUrl."”.

"; } else { print "

URL examined: “" .$examinedUrl."

"; } if ($metaRefreshUrl) { print_warning("This page was set up to automatically redirect to “$metaRefreshUrl” after a number of seconds. It’s recommended that you also convert that page."); } print "Page Title: “$pageTitle”" if $pageTitle; ##print "

Content-Type: “".$contentType."”.

"; print "
"; print "
"; &outputFormsInformation; # Don't output any WSDL if no forms existed on the page if (scalar(@forms)) { &outputWSDL; } print "
"; &outputOriginalSource; &outputTidiedSource; &print_footer(); exit(0); ################################################################## # Functions # Fetches document and returns it. Assumes basic validation check has been done on the URL. sub getDocument { my $url = shift; # Configure the UA $userAgent = LWP::UserAgent->new; $userAgent->agent("$botUA"); $userAgent->from("$botFrom"); $userAgent->timeout(20); $userAgent->proxy(['http', 'ftp', 'https'], 'http://proxy5.ucd.ie:8585'); $userAgent->no_proxy('localhost', 'htmlsource',); # Send request $request = HTTP::Request->new('GET', $url); # Get response $response = $userAgent->request($request); # Check on the URL we've gotten back. This may be different to the URL was asked for due to redirects etc. if (defined $response->request) { $examinedUrl = $response->request->url; } # Check how the bot got on if ($response->is_success) { # Determine character encoding of document (handy if we need to deal with encoding, erk!) $contentType = $response->header('Content-Type'); unless (checkContentType($contentType)) { &print_error("This file was returned to us with the Content-Type “".$response->header('Content-Type')."” which this program does not accept. This commonly happens if you supply the URL of an image or file in a data format that is not a form of SGML. If you are sure that this is not the case, check your server setup to make sure this file is being served correctly.", 1); } # Grab Last-Modified header my $lastModified = $response->header('Last-Modified'); # Store the response text $document = $response->content; } else { # Something's gone wrong with the request &print_error("Could not get page “$url”. Please check that the address is valid and that the page exists.

\n

The server’s response was: “".$response->status_line."”", 1); } } # Writes fetched page to file, runs it through tidy and outputs the tidied code sub tidyDocument { my $url = shift; # Write page to file open(SOURCEFILE, ">sourcefile.txt"); print SOURCEFILE $document; close SOURCEFILE; # run through tidy, save in place (online: /usr/local/bin/tidy ) open(TIDY, "tidy.exe -f tidyerrors.txt -config tidyconfig.txt -i -wrap 150 -asxhtml -n -m sourcefile.txt 2>&1 |") || &print_warning("Could not tidy the document. This means the markup we’re parsing may not be of optimum quality, but will probably not affect the output."); close(TIDY); } # Parses the page for HTML forms sub extractForms { @forms = HTML::Form->parse(&readFile("sourcefile.txt"), $response->base()); # Find elements that require more complex parsing (enumerated types) # Whisk through the forms on the page and push any complex elements into the 2d array # We'll iterate through that again when we're writing the WSDL my $i = 0; my $j = 0; foreach my $form (@forms) { foreach my $input ($form->inputs) { if ($input->type eq "radio" or $input->type eq "option") { $complexElements[$i][$j++] = $input; } } $i++; $j = 0; } # $complexElements[0][0] now holds the first complex type in the first form, etc } # Creates a parse tree from the HTML file and then traverses is to find useful elements # such as meta refresh commands and labels for form elements sub makeParseTree { my $tree = HTML::TreeBuilder->new(); #empty tree $tree->parse_file("sourcefile.txt"); my $h = $tree; $h->elementify(); # $h should now be my @children = $h->content_list; foreach my $rootel (@children) { if ($rootel->tag() eq "head") { # We've found the element my @headchildren = $rootel->content_list; foreach my $headel (@headchildren) { if ($headel->tag() eq "title") { my @a = $headel->content_list; $pageTitle = $a[0]; } # Now search for meta refresh elsif ($headel->tag() eq "meta" and $headel->attr('http-equiv') eq "refresh") { $_ = &trim($headel->attr('content')); # Syntax: 5; url=newurl.html /^[0-9]*; url=(.*)/; $metaRefreshUrl = $1; my $pageUrl = URI::URL->new($metaRefreshUrl, $examinedUrl); $metaRefreshUrl = $pageUrl->abs; # Resolve to absolute URL } } } else { $h = $rootel; # Make it the tag. Slight optimization for further traversals } } # Run through the 'work' array ($complexElements) and grab the necessary elements for my $form (0 .. $#complexElements) { for my $elem (0 .. $#{$complexElements[$form]}) { ##print "

element $form-$elem is ", $complexElements[$form][$elem]->type, "

"; my $tempelem; my @tempelems; # Get radio values if ($complexElements[$form][$elem]->type eq "radio") { @tempelems = $h->look_down("_tag", "input", "type", "radio", "name", $complexElements[$form][$elem]->name); # This was the bastard causing all the trouble. I was calling the array # in scalar context, unbeknownst to me. Turns out sticking brackets around # the @tempelems array copies it to a new anonymous array which is then # passed by reference. Well, obviously. $complexValues[$form][$elem] = [ @tempelems ]; } # Get select values # Clever hack: instead of treating option values differently to radio buttons # later, we simply add a name attribute to each one, which makes them # behave exactly as a group of radio buttons. if ($complexElements[$form][$elem]->type eq "option") { $tempelem = $h->look_down("_tag", "select", "name", $complexElements[$form][$elem]->name); ##print $tempelem->attr('_tag'); @tempelems = $tempelem->look_down("_tag", "option"); for my $i (0 .. $#tempelems) { $tempelems[$i]->attr("name", $complexElements[$form][$elem]->name); } $complexValues[$form][$elem] = [ @tempelems ]; } ##print "

Values for $form-$elem-0, aka ", $complexValues[$form][$elem][0]->attr("name"), " : ", $complexValues[$form][$elem][0]->attr("value"), "

"; } } # Get submit values # Get all forms (again...), as we only want submits from one form at a time my @f = $h->look_down("_tag", "form"); for my $form (0 .. $#forms) { ##print $forms[$form]->action; ##print $f[$form]->attr('action'); my @tempelems = $f[$form]->look_down("_tag", "input", "type", "submit"); my @tempimageelems = $f[$form]->look_down("_tag", "input", "type", "image"); ##print @tempimageelems[0]->attr('type'); push @tempelems, @tempimageelems; @submits[$form] = [ @tempelems ]; } ##print "\n", $submits[0][0]->attr('name'); } # Trim leading and trailing whitespace from a string # Takes a string as the first argument and returns the new string sub trim { my $str = shift || return; $str =~ s/^\s+//go; $str =~ s/\s+$//go; return $str; } # Prints error page. If second argument is 1, the script'll die right here. sub print_error { my $message = shift; my $shoulddie = shift; print "
\n

Error

\n\n"; print "

$message

\n"; print "
"; if ($shoulddie == 1) { &print_footer(); exit; } } # Prints amber warning for errors requiring relatively minor attention. sub print_warning { my $message = shift; print "
\n"; print "

Warning: $message

\n"; print "
"; } # Checks if returned content-type is part of a list we'll accept, false otherwise sub checkContentType { my $ctype = shift; return ($ctype =~ /text\/html/ || $ctype =~ /text\/xml/ || $ctype =~ /application\/xml/ || $ctype =~ /application\/xhtml\+xml/ || $ctype =~ /text\/sgml/ || $ctype =~ /application\/sgml/); } # Print the contents of a file sub readFile { my $file = shift || return; my $input = ""; open(FILE, "$file") || die("Unable to open file: $!"); while () { $input .= "$_"; } close FILE; return $input; } ######################################################################### # Output Functions # # Outputs the original source code sub outputOriginalSource { print "

Original Source Listing

"; print "

"; print "
"; print "
" . trim($query->escapeHTML($document)) . "
"; print "
"; #Free up memory undef($document); } # Outputs the file after tidy has been run on it sub outputTidiedSource { #print $query->escapeHTML(&readFile("tidyerrors.txt")); print "

Tidied Source Listing

"; print "

"; print "
"; print "
" . $query->escapeHTML(&readFile("sourcefile.txt")) . "
"; print "
"; } # Outputs details about the page's forms sub outputFormsInformation { print "\n\n

Form Information

\n"; # Count the number of forms on the page my $fc = scalar(@forms); if ($fc == 0) { print "

There are no forms on this page.

"; } elsif ($fc == 1) { print "

There is one form on this page.

"; } else { print "

There are ".$fc." forms on this page.

"; } print "\n"; my $i = 1; foreach my $form (@forms) { print "\n
\n"; print "
$i.  Action: ".$form->action."
\n"; print "
Method: ".$form->method."
\n"; ##print "
Encoding: ".$form->enctype."
\n"; $i++; print "
Inputs: ".$form->inputs."\n
    \n"; # input count foreach my $input ($form->inputs) { print "
  1. "; print "Name: “".$input->name."” \n" if $input->name; print "Type: “".$input->type."”\n"; print " Value: “".$input->value."”\n" if $input->value; print "
  2. "; } print "
\n"; } print "\n\n\n"; } # Outputs the WSDL for this page sub outputWSDL { # Rip apart the URL until we have the domain name sans extension, which we can then use to name things $url =~ m|(\w+)://([^/:]+)(:\d+)?/(.*)|; my $domainName = $2; $domainName =~ m/(www\.)?([^\.]+)(\.com|\.net|\.org)/; my $domain = $2; print "\n

WSDL Output

\n"; print "
\n";
  print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n";
  #print "<schema targetNamespace=\"http://$domain.com/schemas\"
       #xmlns=\"http://www.w3.org/2000/10/XMLSchema\">\n";
  #print "</schema>\n";
  
  print "<definitions name=\"".$domain."\"\n  targetNamespace=\"urn:".$domain."\"\n  xmlns=\"http://schemas.xmlsoap.org/wsdl/\"\n  xmlns:xsd=\"http://www.w3.org/2001/XMLSchema\"\n  xmlns:soap=\"http://schemas.xmlsoap.org/wsdl/soap/\">\n";
  
  # Types
  #print "<types>\n";
  #print "<!-- no complex types yet -->\n";
  #print "</types>\n\n";
  
  # Messages (i.e. form elements)
  my $i = 1;
  foreach my $form (@forms) {
    print "\n<message name=\"Message$i\">\n";
    foreach my $input ($form->inputs) {
    
      if ($input->type eq "radio" or $input->type eq "select" or $input->type eq "option") {
        # It's an enumerated type, so we need to work it
        &writeEnumerated($input->name, $i, $input->type);
        
      } else {
        unless ($input->type eq "submit" or $input->type eq "image") { 
          # It's a simple type, so just fire in the name and work out an xsd type
          print "  <part name=\"".$input->name."\" type=\"xsd:".evaluate_type($input->name, $input->type, $input->value)."\"/>\n" if $input->name and $input->type ne "reset";
        }
      }
    }
    # Finally, write submit button choice, if there is one
    &writeSubmits($i);
    print "</message>\n";
    $i++;
  }
  # Output webpage message
  print "\n<message name=\"MessageOut\">\n";
  print "  <part name=\"Webpage\" type=\"xsd:string\"/>\n";
  print "</message>\n";
  
  # Ports (i.e. operations, i.e. forms)
  print "\n<portType name=\"".$domain."Port\">\n";
  
  my $i = 1;
  foreach my $form (@forms) {
    print "  <operation name=\"FormNumber$i\">\n";
    print "    <input message=\"Message$i\"/>\n";
    print "    <output message=\"MessageOut\"/>\n";
    print "  </operation>\n";
    $i++;
  }
  print "</portType>\n";
  
  # Binding
  print "\n<binding name=\"".$domain."Binding\" type=\"typens:".$domain."Port\">\n";
  print "  <soap:binding style=\"rpc\" transport=\"http://schemas.xmlsoap.org/soap/http\"/>\n";
  my $i = 1;
  foreach my $form (@forms) {
    print "  <operation name=\"FormNumber$i\">\n";
    print "    <soap:operation soapAction=\"urn:".$domain."Action\"/>\n";
    print "    <input>\n";
    print "      <soap:body use=\"encoded\" namespace=\"urn:".$domain."\" encodingStyle=\"http://schemas.xmlsoap.org/soap/encoding/\"/>\n";
    print "    </input>\n";
    print "    <output>\n";
    print "      <soap:body use=\"encoded\" namespace=\"urn:".$domain."\" encodingStyle=\"http://schemas.xmlsoap.org/soap/encoding/\"/>\n";
    print "    </output>\n";
    print "  </operation>\n";
    $i++;
  }
  print "</binding>\n";
  
  print "\n<service name=\"".$domain."Service\">\n";
  print "  <port name=\"".$domain."Port\" binding=\"typens:".$domain."Binding\">\n";
  print "    <soap:address location=\"http://www.yourhtmlsource.com/projects/Form2WSDL/\n";  
  print "         invoke.pl?url=".$examinedUrl."\"/>\n";
  print "  </port>\n";
  print "</service>\n";
  
  print "\n</definitions>\n";
  print "
"; } #Writes the WSDL for complex enumerated elements like radio buttons, selects and multi-submits sub writeEnumerated { my $name = shift; my $formNumber = shift; $formNumber--; # we started the count at 1, thus 'formNumber1' etc. my $type = shift; print " <simpleType name=\"$name\">\n"; print " <restriction base=\"xsd:string\">\n"; my $j = 0; while ($complexValues[$formNumber][$j]) { if ($complexValues[$formNumber][$j][0]->attr('name') eq $name) { my $i = 0; while ($complexValues[$formNumber][$j][$i]) { print " <enumeration value=\"",$complexValues[$formNumber][$j][$i]->attr("value"), "\"></enumeration>\n"; $i++; } } $j++; } print " </restriction>\n"; print " </simpleType>\n"; } # Write the submits for a form as an enumeration, IF there's more than one sub writeSubmits { my $formNumber = shift; $formNumber--; # start at 0 # Don't print anything if there's only one submit, as that doesn't represent a choice if ($#{$submits[$formNumber]} > 0) { print "\n <simpleType name=\"submit\">\n"; print " <restriction base=\"xsd:string\">\n"; my $j = 0; while ($submits[$formNumber][$j]) { print " <enumeration value=\"", $submits[$formNumber][$j]->attr('name'), "\"></enumeration>\n"; $j++; } print " </restriction>\n"; print " </simpleType>\n"; } } # Evaluates the type of a message sub evaluate_type { my $name = shift; my $type = shift; my $value = shift; if ($type eq "checkbox" or ($value eq "true" or $value eq "false")) { return "boolean"; } elsif ($value =~ /^[0-9]+$/ and $type ne "radio") { return "int"; } elsif ($value =~ /^[0-9]+.[0-9]+$/ and $type ne "radio") { return "float"; } elsif ($value =~ /^[http https ftp]+:/ and $type ne "password") { return "anyURI"; } elsif ($value =~ /^[0-9]{2,4}[\/ -][0-9]{2}[\/ -][0-9]{2,4}$/) { return "date"; } return "string"; } # Prints standard start to all pages sub print_header { print "Content-type: text/html; charset=ISO-8859-1\n\n"; # if the script is being asked to convert the referring document, we extract this URL now my $badref = 0; if ($url =~ /referr?er/) { if ($ENV{'HTTP_REFERER'}) { $url = $ENV{'HTTP_REFERER'}; } else { $badref = 1; } } print < Form2WSDL Conversion Results for “$url” HERE my $openfile = "includes/headstuff.inc"; open(SSIFILE,"$openfile") || die "Cannot read from $openfile: $!"; while() { print $_; } close(SSIFILE); if ($badref) { &print_error("No referrer was found.", 1); } } sub print_footer { print <

A Little Metadata...

Your user agent is “$browser”

Your conversion occured at $timestamp. Thank you and good night.

HERE }