1 From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001
2 From: Stefan Behnel <stefan_ml@behnel.de>
3 Date: Fri, 15 Nov 2013 14:49:48 +0100
4 Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to
5 allow graceful handling of broken URLs
9 src/lxml/html/__init__.py | 49 +++++++++++++++++++++++++------
10 src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++---
11 3 files changed, 61 insertions(+), 13 deletions(-)
13 diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
14 index ea88d2b..dd52611 100644
15 --- a/src/lxml/html/__init__.py
16 +++ b/src/lxml/html/__init__.py
17 @@ -294,15 +294,21 @@ class HtmlMixin(object):
19 ########################################
21 - def make_links_absolute(self, base_url=None, resolve_base_href=True):
22 + def make_links_absolute(self, base_url=None, resolve_base_href=True,
23 + handle_failures=None):
25 Make all links in the document absolute, given the
26 ``base_url`` for the document (the full URL where the document
27 - came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
28 + came from), or if no ``base_url`` is given, then the ``.base_url``
31 If ``resolve_base_href`` is true, then any ``<base href>``
32 tags in the document are used *and* removed from the document.
33 If it is false then any such tag is ignored.
35 + If ``handle_failures`` is None (default), a failure to process
36 + a URL will abort the processing. If set to 'ignore', errors
37 + are ignored. If set to 'discard', failing URLs will be removed.
40 base_url = self.base_url
41 @@ -311,24 +317,48 @@ class HtmlMixin(object):
42 "No base_url given, and the document has no base_url")
44 self.resolve_base_href()
45 - def link_repl(href):
46 - return urljoin(base_url, href)
48 + if handle_failures == 'ignore':
49 + def link_repl(href):
51 + return urljoin(base_url, href)
54 + elif handle_failures == 'discard':
55 + def link_repl(href):
57 + return urljoin(base_url, href)
60 + elif handle_failures is None:
61 + def link_repl(href):
62 + return urljoin(base_url, href)
65 + "unexpected value for handle_failures: %r" % handle_failures)
67 self.rewrite_links(link_repl)
69 - def resolve_base_href(self):
70 + def resolve_base_href(self, handle_failures=None):
72 Find any ``<base href>`` tag in the document, and apply its
73 values to all links found in the document. Also remove the
74 tag once it has been applied.
76 + If ``handle_failures`` is None (default), a failure to process
77 + a URL will abort the processing. If set to 'ignore', errors
78 + are ignored. If set to 'discard', failing URLs will be removed.
81 - basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
82 + basetags = self.xpath('//base[@href]|//x:base[@href]',
83 + namespaces={'x': XHTML_NAMESPACE})
85 base_href = b.get('href')
89 - self.make_links_absolute(base_href, resolve_base_href=False)
90 + self.make_links_absolute(base_href, resolve_base_href=False,
91 + handle_failures=handle_failures)
95 @@ -434,6 +464,7 @@ class HtmlMixin(object):
96 base_href, resolve_base_href=resolve_base_href)
97 elif resolve_base_href:
98 self.resolve_base_href()
100 for el, attrib, link, pos in self.iterlinks():
101 new_link = link_repl_func(link.strip())
103 diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
104 index 43dd99d..dd400b7 100644
105 --- a/src/lxml/html/tests/test_rewritelinks.txt
106 +++ b/src/lxml/html/tests/test_rewritelinks.txt
107 @@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
111 +If the document contains invalid links, you may choose to "discard" or "ignore"
112 +them by passing the respective value as the ``handle_failures`` argument::
114 + >>> html = lxml.html.fromstring ('''\
115 + ... <html><body><div>
116 + ... <a href="http://fancybase.com]Buy">test2</a>
117 + ... </div></body></html>''')
119 + >>> html.make_links_absolute(base_url="http://my.little.server/url/",
120 + ... handle_failures="discard")
122 + >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
125 + </div></body></html>
127 Check if we can replace multiple links inside of the same text string::
129 >>> html = lxml.html.fromstring ("""\
130 @@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string::
132 >>> html.make_links_absolute ()
134 - >>> try: _unicode = unicode
135 - ... except NameError: _unicode = str
137 - >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
138 + >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))