Release 2. Patch for https://bugs.launchpad.net/lxml/+bug/1250557 added.

author Mateusz Korniak <matkor@pld-linux.org>

Mon, 25 Nov 2013 12:27:17 +0000 (13:27 +0100)

committer Mateusz Korniak <matkor@pld-linux.org>

Mon, 25 Nov 2013 12:27:17 +0000 (13:27 +0100)
author Mateusz Korniak <matkor@pld-linux.org>
Mon, 25 Nov 2013 12:27:17 +0000 (13:27 +0100)
committer Mateusz Korniak <matkor@pld-linux.org>
Mon, 25 Nov 2013 12:27:17 +0000 (13:27 +0100)
diff --git a/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch b/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch

new file mode 100644 (file)

index 0000000..0c28250
--- /dev/null
+++ b/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch
@@ -0,0 +1,144 @@
+From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001
+From: Stefan Behnel <stefan_ml@behnel.de>
+Date: Fri, 15 Nov 2013 14:49:48 +0100
+Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to
+ allow graceful handling of broken URLs
+
+---
+ CHANGES.txt                               |  4 +++
+ src/lxml/html/__init__.py                 | 49 +++++++++++++++++++++++++------
+ src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++---
+ 3 files changed, 61 insertions(+), 13 deletions(-)
+ 
+diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
+index ea88d2b..dd52611 100644
+--- a/src/lxml/html/__init__.py
++++ b/src/lxml/html/__init__.py
+@@ -294,15 +294,21 @@ class HtmlMixin(object):
+     ## Link functions
+     ########################################
+ 
+-    def make_links_absolute(self, base_url=None, resolve_base_href=True):
++    def make_links_absolute(self, base_url=None, resolve_base_href=True,
++                            handle_failures=None):
+         """
+         Make all links in the document absolute, given the
+         ``base_url`` for the document (the full URL where the document
+-        came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
++        came from), or if no ``base_url`` is given, then the ``.base_url``
++        of the document.
+ 
+         If ``resolve_base_href`` is true, then any ``<base href>``
+         tags in the document are used *and* removed from the document.
+         If it is false then any such tag is ignored.
++
++        If ``handle_failures`` is None (default), a failure to process
++        a URL will abort the processing.  If set to 'ignore', errors
++        are ignored.  If set to 'discard', failing URLs will be removed.
+         """
+         if base_url is None:
+             base_url = self.base_url
+@@ -311,24 +317,48 @@ class HtmlMixin(object):
+                     "No base_url given, and the document has no base_url")
+         if resolve_base_href:
+             self.resolve_base_href()
+-        def link_repl(href):
+-            return urljoin(base_url, href)
++
++        if handle_failures == 'ignore':
++            def link_repl(href):
++                try:
++                    return urljoin(base_url, href)
++                except ValueError:
++                    return href
++        elif handle_failures == 'discard':
++            def link_repl(href):
++                try:
++                    return urljoin(base_url, href)
++                except ValueError:
++                    return None
++        elif handle_failures is None:
++            def link_repl(href):
++                return urljoin(base_url, href)
++        else:
++            raise ValueError(
++                "unexpected value for handle_failures: %r" % handle_failures)
++
+         self.rewrite_links(link_repl)
+ 
+-    def resolve_base_href(self):
++    def resolve_base_href(self, handle_failures=None):
+         """
+         Find any ``<base href>`` tag in the document, and apply its
+         values to all links found in the document.  Also remove the
+         tag once it has been applied.
++
++        If ``handle_failures`` is None (default), a failure to process
++        a URL will abort the processing.  If set to 'ignore', errors
++        are ignored.  If set to 'discard', failing URLs will be removed.
+         """
+         base_href = None
+-        basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE})
++        basetags = self.xpath('//base[@href]|//x:base[@href]',
++                              namespaces={'x': XHTML_NAMESPACE})
+         for b in basetags:
+             base_href = b.get('href')
+             b.drop_tree()
+         if not base_href:
+             return
+-        self.make_links_absolute(base_href, resolve_base_href=False)
++        self.make_links_absolute(base_href, resolve_base_href=False,
++                                 handle_failures=handle_failures)
+ 
+     def iterlinks(self):
+         """
+@@ -434,6 +464,7 @@ class HtmlMixin(object):
+                 base_href, resolve_base_href=resolve_base_href)
+         elif resolve_base_href:
+             self.resolve_base_href()
++
+         for el, attrib, link, pos in self.iterlinks():
+             new_link = link_repl_func(link.strip())
+             if new_link == link:
+diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
+index 43dd99d..dd400b7 100644
+--- a/src/lxml/html/tests/test_rewritelinks.txt
++++ b/src/lxml/html/tests/test_rewritelinks.txt
+@@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``::
+      </body>
+     </html>
+ 
++If the document contains invalid links, you may choose to "discard" or "ignore"
++them by passing the respective option into the ``handle_failures`` argument::
++
++    >>> html = lxml.html.fromstring ('''\
++    ... <html><body><div>
++    ...     <a href="http://fancybase.com]Buy">test2</a>
++    ... </div></body></html>''')
++
++    >>> html.make_links_absolute(base_url="http://my.little.server/url/",
++    ...                          handle_failures="discard")
++
++    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
++    <html><body><div>
++        <a>test2</a>
++    </div></body></html>
++
+ Check if we can replace multiple links inside of the same text string::
+ 
+     >>> html = lxml.html.fromstring ("""\
+@@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string::
+ 
+     >>> html.make_links_absolute ()
+ 
+-    >>> try: _unicode = unicode
+-    ... except NameError: _unicode = str
+-
+-    >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode))
++    >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
+     <html>
+       <head>
+         <title>Test</title>
+-- 
+1.8.4.3
+
diff --git a/python-lxml.spec b/python-lxml.spec

index 40f96e9670290326de8e278f8fff01094e5c2114..8aee4cac7ab12d4ffddd40d4e99bee6f53d64ee3 100644 (file)
--- a/python-lxml.spec
+++ b/python-lxml.spec
@@ -9,11 +9,12 @@ Summary:      Python 2 binding for the libxml2 and libxslt libraries
  Summary(pl.UTF-8):     Wiązanie Pythona 2 do bibliotek libxml2 i libxslt
  Name:          python-%{module}
  Version:       3.2.4
-Release:       1
+Release:       2
  License:       BSD
  Group:         Libraries/Python
  Source0:       http://lxml.de/files/%{module}-%{version}.tgz
  # Source0-md5: cc363499060f615aca1ec8dcc04df331
+Patch0:                %{name}-add-handle_failures-option-to-make_links_absolute-to.patch
  URL:           http://lxml.de/
  BuildRequires: libxml2-devel >= 1:2.7.8
  BuildRequires: libxslt-devel >= 1.1.26
@@ -60,6 +61,7 @@ Dokumentacja API modułu lxml.
  
  %prep
  %setup -q -n %{module}-%{version}
+%patch0 -p1
  
  %build
  %if %{with python2}
author	Mateusz Korniak <matkor@pld-linux.org>
	Mon, 25 Nov 2013 12:27:17 +0000 (13:27 +0100)
committer	Mateusz Korniak <matkor@pld-linux.org>
	Mon, 25 Nov 2013 12:27:17 +0000 (13:27 +0100)
python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch	[new file with mode: 0644]	patch | blob
python-lxml.spec		patch | blob | blame | history