]>
Commit | Line | Data |
---|---|---|
6ada93df MK |
1 | From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001 |
2 | From: Stefan Behnel <stefan_ml@behnel.de> | |
3 | Date: Fri, 15 Nov 2013 14:49:48 +0100 | |
4 | Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to | |
5 | allow graceful handling of broken URLs | |
6 | ||
7 | --- | |
8 | CHANGES.txt | 4 +++ | |
9 | src/lxml/html/__init__.py | 49 +++++++++++++++++++++++++------ | |
10 | src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++--- | |
11 | 3 files changed, 61 insertions(+), 13 deletions(-) | |
12 | ||
13 | diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py | |
14 | index ea88d2b..dd52611 100644 | |
15 | --- a/src/lxml/html/__init__.py | |
16 | +++ b/src/lxml/html/__init__.py | |
17 | @@ -294,15 +294,21 @@ class HtmlMixin(object): | |
18 | ## Link functions | |
19 | ######################################## | |
20 | ||
21 | - def make_links_absolute(self, base_url=None, resolve_base_href=True): | |
22 | + def make_links_absolute(self, base_url=None, resolve_base_href=True, | |
23 | + handle_failures=None): | |
24 | """ | |
25 | Make all links in the document absolute, given the | |
26 | ``base_url`` for the document (the full URL where the document | |
27 | - came from), or if no ``base_url`` is given, then the ``.base_url`` of the document. | |
28 | + came from), or if no ``base_url`` is given, then the ``.base_url`` | |
29 | + of the document. | |
30 | ||
31 | If ``resolve_base_href`` is true, then any ``<base href>`` | |
32 | tags in the document are used *and* removed from the document. | |
33 | If it is false then any such tag is ignored. | |
34 | + | |
35 | + If ``handle_failures`` is None (default), a failure to process | |
36 | + a URL will abort the processing. If set to 'ignore', errors | |
37 | + are ignored. If set to 'discard', failing URLs will be removed. | |
38 | """ | |
39 | if base_url is None: | |
40 | base_url = self.base_url | |
41 | @@ -311,24 +317,48 @@ class HtmlMixin(object): | |
42 | "No base_url given, and the document has no base_url") | |
43 | if resolve_base_href: | |
44 | self.resolve_base_href() | |
45 | - def link_repl(href): | |
46 | - return urljoin(base_url, href) | |
47 | + | |
48 | + if handle_failures == 'ignore': | |
49 | + def link_repl(href): | |
50 | + try: | |
51 | + return urljoin(base_url, href) | |
52 | + except ValueError: | |
53 | + return href | |
54 | + elif handle_failures == 'discard': | |
55 | + def link_repl(href): | |
56 | + try: | |
57 | + return urljoin(base_url, href) | |
58 | + except ValueError: | |
59 | + return None | |
60 | + elif handle_failures is None: | |
61 | + def link_repl(href): | |
62 | + return urljoin(base_url, href) | |
63 | + else: | |
64 | + raise ValueError( | |
65 | + "unexpected value for handle_failures: %r" % handle_failures) | |
66 | + | |
67 | self.rewrite_links(link_repl) | |
68 | ||
69 | - def resolve_base_href(self): | |
70 | + def resolve_base_href(self, handle_failures=None): | |
71 | """ | |
72 | Find any ``<base href>`` tag in the document, and apply its | |
73 | values to all links found in the document. Also remove the | |
74 | tag once it has been applied. | |
75 | + | |
76 | + If ``handle_failures`` is None (default), a failure to process | |
77 | + a URL will abort the processing. If set to 'ignore', errors | |
78 | + are ignored. If set to 'discard', failing URLs will be removed. | |
79 | """ | |
80 | base_href = None | |
81 | - basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}) | |
82 | + basetags = self.xpath('//base[@href]|//x:base[@href]', | |
83 | + namespaces={'x': XHTML_NAMESPACE}) | |
84 | for b in basetags: | |
85 | base_href = b.get('href') | |
86 | b.drop_tree() | |
87 | if not base_href: | |
88 | return | |
89 | - self.make_links_absolute(base_href, resolve_base_href=False) | |
90 | + self.make_links_absolute(base_href, resolve_base_href=False, | |
91 | + handle_failures=handle_failures) | |
92 | ||
93 | def iterlinks(self): | |
94 | """ | |
95 | @@ -434,6 +464,7 @@ class HtmlMixin(object): | |
96 | base_href, resolve_base_href=resolve_base_href) | |
97 | elif resolve_base_href: | |
98 | self.resolve_base_href() | |
99 | + | |
100 | for el, attrib, link, pos in self.iterlinks(): | |
101 | new_link = link_repl_func(link.strip()) | |
102 | if new_link == link: | |
103 | diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt | |
104 | index 43dd99d..dd400b7 100644 | |
105 | --- a/src/lxml/html/tests/test_rewritelinks.txt | |
106 | +++ b/src/lxml/html/tests/test_rewritelinks.txt | |
107 | @@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``:: | |
108 | </body> | |
109 | </html> | |
110 | ||
111 | +If the document contains invalid links, you may choose to "discard" or "ignore" | |
112 | +them by passing the respective option into the ``handle_failures`` argument:: | |
113 | + | |
114 | + >>> html = lxml.html.fromstring ('''\ | |
115 | + ... <html><body><div> | |
116 | + ... <a href="http://fancybase.com]Buy">test2</a> | |
117 | + ... </div></body></html>''') | |
118 | + | |
119 | + >>> html.make_links_absolute(base_url="http://my.little.server/url/", | |
120 | + ... handle_failures="discard") | |
121 | + | |
122 | + >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode')) | |
123 | + <html><body><div> | |
124 | + <a>test2</a> | |
125 | + </div></body></html> | |
126 | + | |
127 | Check if we can replace multiple links inside of the same text string:: | |
128 | ||
129 | >>> html = lxml.html.fromstring ("""\ | |
130 | @@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string:: | |
131 | ||
132 | >>> html.make_links_absolute () | |
133 | ||
134 | - >>> try: _unicode = unicode | |
135 | - ... except NameError: _unicode = str | |
136 | - | |
137 | - >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode)) | |
138 | + >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode')) | |
139 | <html> | |
140 | <head> | |
141 | <title>Test</title> | |
142 | -- | |
143 | 1.8.4.3 | |
144 |