- use perl HTML::Tree to extract <A> and <IFRAME> from source

author Elan Ruusamäe <glen@pld-linux.org>

Tue, 5 Oct 2010 11:02:36 +0000 (11:02 +0000)

committer cvs2git <feedback@pld-linux.org>

Sun, 24 Jun 2012 12:13:13 +0000 (12:13 +0000)
author Elan Ruusamäe <glen@pld-linux.org>
Tue, 5 Oct 2010 11:02:36 +0000 (11:02 +0000)
committer cvs2git <feedback@pld-linux.org>
Sun, 24 Jun 2012 12:13:13 +0000 (12:13 +0000)
diff --git a/pldnotify.awk b/pldnotify.awk

index 9a01bcb9465525eef11e04091a7554a4eeb2deb7..70fa742b6e65e2dddba2dc720e092aabad6042be 100644 (file)
--- a/pldnotify.awk
+++ b/pldnotify.awk
@@ -193,6 +193,22 @@ function postfix_link(url, link,   oldlink) {
         return link
  }
  
+# Build a shell command that runs perl HTML::TreeBuilder over the file named by
+# tmpfile and prints one "TAGNAME LINK" line per <a>/<iframe> link, which is
+# straightforward to parse in awk; eof() must follow parse() to finish the tree.
+function extract_links_cmd(tmpfile) {
+       return "perl -MHTML::TreeBuilder -e ' \
+       my $content = join q//, <>; \
+       my $root = HTML::TreeBuilder->new; \
+       $root->parse($content); \
+       $root->eof; \
+       for (@{$root->extract_links(qw(a iframe))}) { \
+               my($link, $element, $attr, $tag) = @$_; \
+               print $tag, q/ /, $link, $/; \
+       } \
+       ' " tmpfile
+}
+
  # get all <A HREF=..> tags from specified URL
  function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowerodp,tmpfile,cmd) {
  
@@ -272,6 +288,43 @@ function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowero
                 retval = ("WGET ERROR: " errno ": " wholeerr)
                 return retval
         }
+       system("rm -f " tmpfileerr)
+
+       urldir = url;
+       sub(/[^\/]+$/, "", urldir)
+
+if (USE_PERL) {
+       cmd = extract_links_cmd(tmpfile)
+       while (cmd | getline) {
+               tag = $1
+               link = substr($0, length(tag) + 2)
+
+               if (tag == "iframe") {
+                       d("Frame: " link)
+                       if (url !~ /\//) {
+                               link = (urldir link)
+                               d("Frame->: " link)
+                       }
+
+                       if (link_seen(link)) {
+                               continue
+                       }
+                       retval = (retval " " get_links(link))
+               }
+
+               if (link_seen(link)) {
+                       continue
+               }
+
+               retval = (retval " " link)
+               d("href(): " link)
+       }
+       close(cmd)
+       system("rm -f " tmpfile)
+
+       d("Returning: [" retval "]")
+       return retval
+}
  
         wholeodp = ""
         d("Reading success response...")
@@ -280,12 +333,7 @@ function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowero
  #              d("Response: " wholeodp)
         }
         d("Reponse read done...")
-
         system("rm -f " tmpfile)
-       system("rm -f " tmpfileerr)
-
-       urldir = url;
-       sub(/[^\/]+$/, "", urldir)
  
         while (match(wholeodp, /<([aA]|[fF][rR][aA][mM][eE])[ \t][^>]*>/) > 0) {
                 d("Processing links...")
@@ -359,7 +407,7 @@ function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowero
                 }
         }
  
-       d("Returning: " retval)
+       d("Returning: [" retval "]")
         return retval
  }
author	Elan Ruusamäe <glen@pld-linux.org>
	Tue, 5 Oct 2010 11:02:36 +0000 (11:02 +0000)
committer	cvs2git <feedback@pld-linux.org>
	Sun, 24 Jun 2012 12:13:13 +0000 (12:13 +0000)