diff options
author | Elan Ruusamäe | 2010-10-05 11:02:36 (GMT) |
---|---|---|
committer | Elan Ruusamäe | 2010-10-05 11:02:36 (GMT) |
commit | 32ec50ee7712682492d86479478cf2d384f3ead5 (patch) | |
tree | ba1540e88e8fd02ad95518382a294951b3a2e6ae | |
parent | 63cc24802cf6eee92570d42d9d0de5d9654d0237 (diff) | |
download | pldnotify-32ec50ee7712682492d86479478cf2d384f3ead5.zip pldnotify-32ec50ee7712682492d86479478cf2d384f3ead5.tar.gz |
- use perl HTML::Tree to extract <A> and <IFRAME> from source
Changed files:
pldnotify.awk -> 1.94
-rw-r--r-- | pldnotify.awk | 60 |
1 files changed, 54 insertions, 6 deletions
diff --git a/pldnotify.awk b/pldnotify.awk index 9a01bcb..70fa742 100644 --- a/pldnotify.awk +++ b/pldnotify.awk @@ -193,6 +193,22 @@ function postfix_link(url, link, oldlink) { return link } +# use perl HTML::TreeBuilder module to extract links from html +# it returns TAGNAME LINK in output which is pretty stright forward to parse in awk +function extract_links_cmd(tmpfile) { + return "perl -MHTML::TreeBuilder -e ' \ + my $content = join q//, <>; \ + my $root = new HTML::TreeBuilder; \ + $root->parse($content); \ + my $links_r = $root->extract_links(); \ + \ + for (@{$root->extract_links(qw(a iframe))}) { \ + my($link, $element, $attr, $tag) = @$_; \ + print $tag, q/ /, $link, $/; \ + } \ + ' " tmpfile +} + # get all <A HREF=..> tags from specified URL function get_links(url,filename, errno,link,oneline,retval,odp,wholeodp,lowerodp,tmpfile,cmd) { @@ -272,6 +288,43 @@ function get_links(url,filename, errno,link,oneline,retval,odp,wholeodp,lowero retval = ("WGET ERROR: " errno ": " wholeerr) return retval } + system("rm -f " tmpfileerr) + + urldir = url; + sub(/[^\/]+$/, "", urldir) + +if (USE_PERL) { + cmd = extract_links_cmd(tmpfile) + while (cmd | getline) { + tag = $1 + link = substr($0, length(tag) + 2) + + if (tag == "iframe") { + d("Frame: " link) + if (url !~ /\//) { + link = (urldir link) + d("Frame->: " link) + } + + if (link_seen(link)) { + continue + } + retval = (retval " " get_links(link)) + } + + if (link_seen(link)) { + continue + } + + retval = (retval " " link) + d("href(): " link) + } + close(cmd) + system("rm -f " tmpfile) + + d("Returning: [" retval "]") + return retval +} wholeodp = "" d("Reading success response...") @@ -280,12 +333,7 @@ function get_links(url,filename, errno,link,oneline,retval,odp,wholeodp,lowero # d("Response: " wholeodp) } d("Reponse read done...") - system("rm -f " tmpfile) - system("rm -f " tmpfileerr) - - urldir = url; - sub(/[^\/]+$/, "", urldir) while (match(wholeodp, /<([aA]|[fF][rR][aA][mM][eE])[ \t][^>]*>/) > 0) { d("Processing links...") @@ -359,7 +407,7 @@ function get_links(url,filename, errno,link,oneline,retval,odp,wholeodp,lowero } } - d("Returning: " retval) + d("Returning: [" retval "]") return retval } |