-From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001
-From: Stefan Behnel <stefan_ml@behnel.de>
-Date: Fri, 15 Nov 2013 14:49:48 +0100
-Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to
- allow graceful handling of broken URLs
-
----
- CHANGES.txt | 4 +++
- src/lxml/html/__init__.py | 49 +++++++++++++++++++++++++------
- src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++---
- 3 files changed, 61 insertions(+), 13 deletions(-)
-
-diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
-index ea88d2b..dd52611 100644
---- a/src/lxml/html/__init__.py
-+++ b/src/lxml/html/__init__.py
-@@ -294,15 +294,21 @@ class HtmlMixin(object):
- ## Link functions
- ########################################
-
-- def make_links_absolute(self, base_url=None, resolve_base_href=True):
-+ def make_links_absolute(self, base_url=None, resolve_base_href=True,
-+ handle_failures=None):
- """
- Make all links in the document absolute, given the
- ``base_url`` for the document (the full URL where the document
-- came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
-+ came from), or if no ``base_url`` is given, then the ``.base_url``
-+ of the document.
-
- If ``resolve_base_href`` is true, then any ``<base href>``
- tags in the document are used *and* removed from the document.
- If it is false then any such tag is ignored.
-+
-+ If ``handle_failures`` is None (default), a failure to process
-+ a URL will abort the processing. If set to 'ignore', errors
-+ are ignored. If set to 'discard', failing URLs will be removed.
- """
- if base_url is None:
- base_url = self.base_url
-@@ -311,24 +317,48 @@ class HtmlMixin(object):
- "No base_url given, and the document has no base_url")
- if resolve_base_href:
- self.resolve_base_href()
-- def link_repl(href):
-- return urljoin(base_url, href)
-+
-+ if handle_failures == 'ignore':
-+ def link_repl(href):
-+ try:
-+ return urljoin(base_url, href)
-+ except ValueError:
-+ return href
-+ elif handle_failures == 'discard':
-+ def link_repl(href):
-+ try:
-+ return urljoin(base_url, href)
-+ except ValueError:
-+ return None
-+ elif handle_failures is None:
-+ def link_repl(href):
-+ return urljoin(base_url, href)
-+ else:
-+ raise ValueError(
-+ "unexpected value for handle_failures: %r" % handle_failures)
-+
- self.rewrite_links(link_repl)
-
-- def resolve_base_href(self):
-+ def resolve_base_href(self, handle_failures=None):
- """
- Find any ``<base href>`` tag in the document, and apply its
- values to all links found in the document. Also remove the
- tag once it has been applied.
-+
-+ If ``handle_failures`` is None (default), a failure to process
-+ a URL will abort the processing. If set to 'ignore', errors
-+ are ignored. If set to 'discard', failing URLs will be removed.
- """
- base_href = None
-- basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
-+ basetags = self.xpath('//base[@href]|//x:base[@href]',
-+ namespaces={'x': XHTML_NAMESPACE})
- for b in basetags:
- base_href = b.get('href')
- b.drop_tree()
- if not base_href:
- return
-- self.make_links_absolute(base_href, resolve_base_href=False)
-+ self.make_links_absolute(base_href, resolve_base_href=False,
-+ handle_failures=handle_failures)
-
- def iterlinks(self):
- """
-@@ -434,6 +464,7 @@ class HtmlMixin(object):
- base_href, resolve_base_href=resolve_base_href)
- elif resolve_base_href:
- self.resolve_base_href()
-+
- for el, attrib, link, pos in self.iterlinks():
- new_link = link_repl_func(link.strip())
- if new_link == link:
-diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
-index 43dd99d..dd400b7 100644
---- a/src/lxml/html/tests/test_rewritelinks.txt
-+++ b/src/lxml/html/tests/test_rewritelinks.txt
-@@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
- </body>
- </html>
-
-+If the document contains invalid links, you may choose to "discard" or "ignore"
-+them by passing the respective option into the ``handle_failures`` argument::
-+
-+ >>> html = lxml.html.fromstring ('''\
-+ ... <html><body><div>
-+ ... <a href="http://fancybase.com]Buy">test2</a>
-+ ... </div></body></html>''')
-+
-+ >>> html.make_links_absolute(base_url="http://my.little.server/url/",
-+ ... handle_failures="discard")
-+
-+ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
-+ <html><body><div>
-+ <a>test2</a>
-+ </div></body></html>
-+
- Check if we can replace multiple links inside of the same text string::
-
- >>> html = lxml.html.fromstring ("""\
-@@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string::
-
- >>> html.make_links_absolute ()
-
-- >>> try: _unicode = unicode
-- ... except NameError: _unicode = str
--
-- >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
-+ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
- <html>
- <head>
- <title>Test</title>
---
-1.8.4.3
-