git.pld-linux.org Git - packages/python-lxml.git/blame - python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch
- updated to 3.2.5
[packages/python-lxml.git] / python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch
Commit Line Data
6ada93df
MK
1From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001
2From: Stefan Behnel <stefan_ml@behnel.de>
3Date: Fri, 15 Nov 2013 14:49:48 +0100
4Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to
5 allow graceful handling of broken URLs
6
7---
8 CHANGES.txt | 4 +++
9 src/lxml/html/__init__.py | 49 +++++++++++++++++++++++++------
10 src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++---
11 3 files changed, 61 insertions(+), 13 deletions(-)
12
13diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
14index ea88d2b..dd52611 100644
15--- a/src/lxml/html/__init__.py
16+++ b/src/lxml/html/__init__.py
17@@ -294,15 +294,21 @@ class HtmlMixin(object):
18 ## Link functions
19 ########################################
20
21- def make_links_absolute(self, base_url=None, resolve_base_href=True):
22+ def make_links_absolute(self, base_url=None, resolve_base_href=True,
23+ handle_failures=None):
24 """
25 Make all links in the document absolute, given the
26 ``base_url`` for the document (the full URL where the document
27- came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
28+ came from), or if no ``base_url`` is given, then the ``.base_url``
29+ of the document.
30
31 If ``resolve_base_href`` is true, then any ``<base href>``
32 tags in the document are used *and* removed from the document.
33 If it is false then any such tag is ignored.
34+
35+ If ``handle_failures`` is None (default), a failure to process
36+ a URL will abort the processing. If set to 'ignore', errors
37+ are ignored. If set to 'discard', failing URLs will be removed.
38 """
39 if base_url is None:
40 base_url = self.base_url
41@@ -311,24 +317,48 @@ class HtmlMixin(object):
42 "No base_url given, and the document has no base_url")
43 if resolve_base_href:
44 self.resolve_base_href()
45- def link_repl(href):
46- return urljoin(base_url, href)
47+
48+ if handle_failures == 'ignore':
49+ def link_repl(href):
50+ try:
51+ return urljoin(base_url, href)
52+ except ValueError:
53+ return href
54+ elif handle_failures == 'discard':
55+ def link_repl(href):
56+ try:
57+ return urljoin(base_url, href)
58+ except ValueError:
59+ return None
60+ elif handle_failures is None:
61+ def link_repl(href):
62+ return urljoin(base_url, href)
63+ else:
64+ raise ValueError(
65+ "unexpected value for handle_failures: %r" % handle_failures)
66+
67 self.rewrite_links(link_repl)
68
69- def resolve_base_href(self):
70+ def resolve_base_href(self, handle_failures=None):
71 """
72 Find any ``<base href>`` tag in the document, and apply its
73 values to all links found in the document. Also remove the
74 tag once it has been applied.
75+
76+ If ``handle_failures`` is None (default), a failure to process
77+ a URL will abort the processing. If set to 'ignore', errors
78+ are ignored. If set to 'discard', failing URLs will be removed.
79 """
80 base_href = None
81- basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
82+ basetags = self.xpath('//base[@href]|//x:base[@href]',
83+ namespaces={'x': XHTML_NAMESPACE})
84 for b in basetags:
85 base_href = b.get('href')
86 b.drop_tree()
87 if not base_href:
88 return
89- self.make_links_absolute(base_href, resolve_base_href=False)
90+ self.make_links_absolute(base_href, resolve_base_href=False,
91+ handle_failures=handle_failures)
92
93 def iterlinks(self):
94 """
95@@ -434,6 +464,7 @@ class HtmlMixin(object):
96 base_href, resolve_base_href=resolve_base_href)
97 elif resolve_base_href:
98 self.resolve_base_href()
99+
100 for el, attrib, link, pos in self.iterlinks():
101 new_link = link_repl_func(link.strip())
102 if new_link == link:
103diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
104index 43dd99d..dd400b7 100644
105--- a/src/lxml/html/tests/test_rewritelinks.txt
106+++ b/src/lxml/html/tests/test_rewritelinks.txt
107@@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
108 </body>
109 </html>
110
111+If the document contains invalid links, you may choose to "discard" or "ignore"
112+them by passing the respective option into the ``handle_failures`` argument::
113+
114+ >>> html = lxml.html.fromstring ('''\
115+ ... <html><body><div>
116+ ... <a href="http://fancybase.com]Buy">test2</a>
117+ ... </div></body></html>''')
118+
119+ >>> html.make_links_absolute(base_url="http://my.little.server/url/",
120+ ... handle_failures="discard")
121+
122+ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
123+ <html><body><div>
124+ <a>test2</a>
125+ </div></body></html>
126+
127 Check if we can replace multiple links inside of the same text string::
128
129 >>> html = lxml.html.fromstring ("""\
130@@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string::
131
132 >>> html.make_links_absolute ()
133
134- >>> try: _unicode = unicode
135- ... except NameError: _unicode = str
136-
137- >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
138+ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
139 <html>
140 <head>
141 <title>Test</title>
142--
1431.8.4.3
144
This page took 0.071375 seconds and 4 git commands to generate.