git.pld-linux.org Git - packages/python-lxml.git/blob - python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch
Release 2: added the patch for https://bugs.launchpad.net/lxml/+bug/1250557.
[packages/python-lxml.git] / python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch
1 From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001
2 From: Stefan Behnel <stefan_ml@behnel.de>
3 Date: Fri, 15 Nov 2013 14:49:48 +0100
4 Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to
5  allow graceful handling of broken URLs
6
7 ---
8  CHANGES.txt                               |  4 +++
9  src/lxml/html/__init__.py                 | 49 +++++++++++++++++++++++++------
10  src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++---
11  3 files changed, 61 insertions(+), 13 deletions(-)
12  
13 diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
14 index ea88d2b..dd52611 100644
15 --- a/src/lxml/html/__init__.py
16 +++ b/src/lxml/html/__init__.py
17 @@ -294,15 +294,21 @@ class HtmlMixin(object):
18      ## Link functions
19      ########################################
20  
21 -    def make_links_absolute(self, base_url=None, resolve_base_href=True):
22 +    def make_links_absolute(self, base_url=None, resolve_base_href=True,
23 +                            handle_failures=None):
24          """
25          Make all links in the document absolute, given the
26          ``base_url`` for the document (the full URL where the document
27 -        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
28 +        came from), or if no ``base_url`` is given, then the ``.base_url``
29 +        of the document.
30  
31          If ``resolve_base_href`` is true, then any ``<base href>``
32          tags in the document are used *and* removed from the document.
33          If it is false then any such tag is ignored.
34 +
35 +        If ``handle_failures`` is None (default), a failure to process
36 +        a URL will abort the processing.  If set to 'ignore', errors
37 +        are ignored.  If set to 'discard', failing URLs will be removed.
38          """
39          if base_url is None:
40              base_url = self.base_url
41 @@ -311,24 +317,48 @@ class HtmlMixin(object):
42                      "No base_url given, and the document has no base_url")
43          if resolve_base_href:
44              self.resolve_base_href()
45 -        def link_repl(href):
46 -            return urljoin(base_url, href)
47 +
48 +        if handle_failures == 'ignore':
49 +            def link_repl(href):
50 +                try:
51 +                    return urljoin(base_url, href)
52 +                except ValueError:
53 +                    return href
54 +        elif handle_failures == 'discard':
55 +            def link_repl(href):
56 +                try:
57 +                    return urljoin(base_url, href)
58 +                except ValueError:
59 +                    return None
60 +        elif handle_failures is None:
61 +            def link_repl(href):
62 +                return urljoin(base_url, href)
63 +        else:
64 +            raise ValueError(
65 +                "unexpected value for handle_failures: %r" % handle_failures)
66 +
67          self.rewrite_links(link_repl)
68  
69 -    def resolve_base_href(self):
70 +    def resolve_base_href(self, handle_failures=None):
71          """
72          Find any ``<base href>`` tag in the document, and apply its
73          values to all links found in the document.  Also remove the
74          tag once it has been applied.
75 +
76 +        If ``handle_failures`` is None (default), a failure to process
77 +        a URL will abort the processing.  If set to 'ignore', errors
78 +        are ignored.  If set to 'discard', failing URLs will be removed.
79          """
80          base_href = None
81 -        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
82 +        basetags = self.xpath('//base[@href]|//x:base[@href]',
83 +                              namespaces={'x': XHTML_NAMESPACE})
84          for b in basetags:
85              base_href = b.get('href')
86              b.drop_tree()
87          if not base_href:
88              return
89 -        self.make_links_absolute(base_href, resolve_base_href=False)
90 +        self.make_links_absolute(base_href, resolve_base_href=False,
91 +                                 handle_failures=handle_failures)
92  
93      def iterlinks(self):
94          """
95 @@ -434,6 +464,7 @@ class HtmlMixin(object):
96                  base_href, resolve_base_href=resolve_base_href)
97          elif resolve_base_href:
98              self.resolve_base_href()
99 +
100          for el, attrib, link, pos in self.iterlinks():
101              new_link = link_repl_func(link.strip())
102              if new_link == link:
103 diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
104 index 43dd99d..dd400b7 100644
105 --- a/src/lxml/html/tests/test_rewritelinks.txt
106 +++ b/src/lxml/html/tests/test_rewritelinks.txt
107 @@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
108       </body>
109      </html>
110  
111 +If the document contains invalid links, you may choose to "discard" or "ignore"
112 +them by passing the respective option into the ``handle_failures`` argument::
113 +
114 +    >>> html = lxml.html.fromstring ('''\
115 +    ... <html><body><div>
116 +    ...     <a href="http://fancybase.com]Buy">test2</a>
117 +    ... </div></body></html>''')
118 +
119 +    >>> html.make_links_absolute(base_url="http://my.little.server/url/",
120 +    ...                          handle_failures="discard")
121 +
122 +    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
123 +    <html><body><div>
124 +        <a>test2</a>
125 +    </div></body></html>
126 +
127  Check if we can replace multiple links inside of the same text string::
128  
129      >>> html = lxml.html.fromstring ("""\
130 @@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string::
131  
132      >>> html.make_links_absolute ()
133  
134 -    >>> try: _unicode = unicode
135 -    ... except NameError: _unicode = str
136 -
137 -    >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
138 +    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
139      <html>
140        <head>
141          <title>Test</title>
142 -- 
143 1.8.4.3
144
This page took 0.05572 seconds and 3 git commands to generate.