From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 15 Nov 2013 14:49:48 +0100 Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to allow graceful handling of broken URLs --- CHANGES.txt | 4 +++ src/lxml/html/__init__.py | 49 +++++++++++++++++++++++++------ src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++--- 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py index ea88d2b..dd52611 100644 --- a/src/lxml/html/__init__.py +++ b/src/lxml/html/__init__.py @@ -294,15 +294,21 @@ class HtmlMixin(object): ## Link functions ######################################## - def make_links_absolute(self, base_url=None, resolve_base_href=True): + def make_links_absolute(self, base_url=None, resolve_base_href=True, + handle_failures=None): """ Make all links in the document absolute, given the ``base_url`` for the document (the full URL where the document - came from), or if no ``base_url`` is given, then the ``.base_url`` of the document. + came from), or if no ``base_url`` is given, then the ``.base_url`` + of the document. If ``resolve_base_href`` is true, then any ``<base href>`` tags in the document are used *and* removed from the document. If it is false then any such tag is ignored. + + If ``handle_failures`` is None (default), a failure to process + a URL will abort the processing. If set to 'ignore', errors + are ignored. If set to 'discard', failing URLs will be removed. 
""" if base_url is None: base_url = self.base_url @@ -311,24 +317,48 @@ class HtmlMixin(object): "No base_url given, and the document has no base_url") if resolve_base_href: self.resolve_base_href() - def link_repl(href): - return urljoin(base_url, href) + + if handle_failures == 'ignore': + def link_repl(href): + try: + return urljoin(base_url, href) + except ValueError: + return href + elif handle_failures == 'discard': + def link_repl(href): + try: + return urljoin(base_url, href) + except ValueError: + return None + elif handle_failures is None: + def link_repl(href): + return urljoin(base_url, href) + else: + raise ValueError( + "unexpected value for handle_failures: %r" % handle_failures) + self.rewrite_links(link_repl) - def resolve_base_href(self): + def resolve_base_href(self, handle_failures=None): """ Find any ``<base href>`` tag in the document, and apply its values to all links found in the document. Also remove the tag once it has been applied. + + If ``handle_failures`` is None (default), a failure to process + a URL will abort the processing. If set to 'ignore', errors + are ignored. If set to 'discard', failing URLs will be removed. 
""" base_href = None - basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}) + basetags = self.xpath('//base[@href]|//x:base[@href]', + namespaces={'x': XHTML_NAMESPACE}) for b in basetags: base_href = b.get('href') b.drop_tree() if not base_href: return - self.make_links_absolute(base_href, resolve_base_href=False) + self.make_links_absolute(base_href, resolve_base_href=False, + handle_failures=handle_failures) def iterlinks(self): """ @@ -434,6 +464,7 @@ class HtmlMixin(object): base_href, resolve_base_href=resolve_base_href) elif resolve_base_href: self.resolve_base_href() + for el, attrib, link, pos in self.iterlinks(): new_link = link_repl_func(link.strip()) if new_link == link: diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt index 43dd99d..dd400b7 100644 --- a/src/lxml/html/tests/test_rewritelinks.txt +++ b/src/lxml/html/tests/test_rewritelinks.txt @@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``:: +If the document contains invalid links, you may choose to "discard" or "ignore" +them by passing the respective option into the ``handle_failures`` argument:: + + >>> html = lxml.html.fromstring ('''\ + ...
+ ... test2 + ...
''') + + >>> html.make_links_absolute(base_url="http://my.little.server/url/", + ... handle_failures="discard") + + >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode')) +
+ test2 +
+ Check if we can replace multiple links inside of the same text string:: >>> html = lxml.html.fromstring ("""\ @@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string:: >>> html.make_links_absolute () - >>> try: _unicode = unicode - ... except NameError: _unicode = str - - >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode)) + >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode')) Test -- 1.8.4.3