- use perl HTML::Tree to extract <A> and <IFRAME> from source

Changed files: pldnotify.awk -> 1.94
author: Elan Ruusamäe 2010-10-05 11:02:36 (GMT)
committer: Elan Ruusamäe 2010-10-05 11:02:36 (GMT)
commit: 32ec50ee7712682492d86479478cf2d384f3ead5 (patch)
tree: ba1540e88e8fd02ad95518382a294951b3a2e6ae
parent: 63cc24802cf6eee92570d42d9d0de5d9654d0237 (diff)
download: pldnotify-32ec50ee7712682492d86479478cf2d384f3ead5.zip
pldnotify-32ec50ee7712682492d86479478cf2d384f3ead5.tar.gz
1 files changed, 54 insertions, 6 deletions
diff --git a/pldnotify.awk b/pldnotify.awk
index 9a01bcb..70fa742 100644
--- a/pldnotify.awk
+++ b/pldnotify.awk
@@ -193,6 +193,22 @@ function postfix_link(url, link,   oldlink) {
 	return link
 }
 
+# Build a shell command that runs perl's HTML::TreeBuilder over the HTML file
+# `tmpfile` and prints one "TAGNAME LINK" line per <a>/<iframe> link, which is
+# straightforward to parse in awk. eof() is called to finalize the parsed tree.
+function extract_links_cmd(tmpfile) {
+	return "perl -MHTML::TreeBuilder -e ' \
+	my $content = join q//, <>; \
+	my $root = HTML::TreeBuilder->new; \
+	$root->parse($content); \
+	$root->eof; \
+	for (@{$root->extract_links(qw(a iframe))}) { \
+		my($link, $element, $attr, $tag) = @$_; \
+		print $tag, q/ /, $link, $/; \
+	} \
+	' " tmpfile
+}
+
 # get all <A HREF=..> tags from specified URL
 function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowerodp,tmpfile,cmd) {
 
@@ -272,6 +288,43 @@ function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowero
 		retval = ("WGET ERROR: " errno ": " wholeerr)
 		return retval
 	}
+	system("rm -f " tmpfileerr)
+
+	urldir = url;
+	sub(/[^\/]+$/, "", urldir)
+
+if (USE_PERL) {
+	cmd = extract_links_cmd(tmpfile)
+	while (cmd | getline) {
+		tag = $1
+		link = substr($0, length(tag) + 2)
+
+		if (tag == "iframe") {
+			d("Frame: " link)
+			if (url !~ /\//) {
+				link = (urldir link)
+				d("Frame->: " link)
+			}
+
+			if (link_seen(link)) {
+				continue
+			}
+			retval = (retval " " get_links(link))
+		}
+
+		if (link_seen(link)) {
+			continue
+		}
+
+		retval = (retval " " link)
+		d("href(): " link)
+	}
+	close(cmd)
+	system("rm -f " tmpfile)
+
+	d("Returning: [" retval "]")
+	return retval
+}
 
 	wholeodp = ""
 	d("Reading success response...")
@@ -280,12 +333,7 @@ function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowero
 #		d("Response: " wholeodp)
 	}
 	d("Reponse read done...")
-
 	system("rm -f " tmpfile)
-	system("rm -f " tmpfileerr)
-
-	urldir = url;
-	sub(/[^\/]+$/, "", urldir)
 
 	while (match(wholeodp, /<([aA]|[fF][rR][aA][mM][eE])[ \t][^>]*>/) > 0) {
 		d("Processing links...")
@@ -359,7 +407,7 @@ function get_links(url,filename,   errno,link,oneline,retval,odp,wholeodp,lowero
 		}
 	}
 
-	d("Returning: " retval)
+	d("Returning: [" retval "]")
 	return retval
 }
author	Elan Ruusamäe	2010-10-05 11:02:36 (GMT)
committer	Elan Ruusamäe	2010-10-05 11:02:36 (GMT)
commit	32ec50ee7712682492d86479478cf2d384f3ead5 (patch)
tree	ba1540e88e8fd02ad95518382a294951b3a2e6ae
parent	63cc24802cf6eee92570d42d9d0de5d9654d0237 (diff)
download	pldnotify-32ec50ee7712682492d86479478cf2d384f3ead5.zip pldnotify-32ec50ee7712682492d86479478cf2d384f3ead5.tar.gz