From 6ada93dfa9eef5fd90211395aef60ba663cc9421 Mon Sep 17 00:00:00 2001 From: Mateusz Korniak Date: Mon, 25 Nov 2013 13:27:17 +0100 Subject: [PATCH 1/1] Release 2. Patch for https://bugs.launchpad.net/lxml/+bug/1250557 added. --- ...res-option-to-make_links_absolute-to.patch | 144 ++++++++++++++++++ python-lxml.spec | 4 +- 2 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch diff --git a/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch b/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch new file mode 100644 index 0000000..0c28250 --- /dev/null +++ b/python-lxml-add-handle_failures-option-to-make_links_absolute-to.patch @@ -0,0 +1,144 @@ +From ab497930d74c7bcf4b725809508a1fefef453faa Mon Sep 17 00:00:00 2001 +From: Stefan Behnel +Date: Fri, 15 Nov 2013 14:49:48 +0100 +Subject: [PATCH] add 'handle_failures' option to make_links_absolute() to + allow graceful handling of broken URLs + +--- + CHANGES.txt | 4 +++ + src/lxml/html/__init__.py | 49 +++++++++++++++++++++++++------ + src/lxml/html/tests/test_rewritelinks.txt | 21 ++++++++++--- + 3 files changed, 61 insertions(+), 13 deletions(-) + +diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py +index ea88d2b..dd52611 100644 +--- a/src/lxml/html/__init__.py ++++ b/src/lxml/html/__init__.py +@@ -294,15 +294,21 @@ class HtmlMixin(object): + ## Link functions + ######################################## + +- def make_links_absolute(self, base_url=None, resolve_base_href=True): ++ def make_links_absolute(self, base_url=None, resolve_base_href=True, ++ handle_failures=None): + """ + Make all links in the document absolute, given the + ``base_url`` for the document (the full URL where the document +- came from), or if no ``base_url`` is given, then the ``.base_url`` of the document. ++ came from), or if no ``base_url`` is given, then the ``.base_url`` ++ of the document. 
+ + If ``resolve_base_href`` is true, then any ``<base href>`` + tags in the document are used *and* removed from the document. + If it is false then any such tag is ignored. ++ ++ If ``handle_failures`` is None (default), a failure to process ++ a URL will abort the processing. If set to 'ignore', errors ++ are ignored. If set to 'discard', failing URLs will be removed. + """ + if base_url is None: + base_url = self.base_url +@@ -311,24 +317,48 @@ class HtmlMixin(object): + "No base_url given, and the document has no base_url") + if resolve_base_href: + self.resolve_base_href() +- def link_repl(href): +- return urljoin(base_url, href) ++ ++ if handle_failures == 'ignore': ++ def link_repl(href): ++ try: ++ return urljoin(base_url, href) ++ except ValueError: ++ return href ++ elif handle_failures == 'discard': ++ def link_repl(href): ++ try: ++ return urljoin(base_url, href) ++ except ValueError: ++ return None ++ elif handle_failures is None: ++ def link_repl(href): ++ return urljoin(base_url, href) ++ else: ++ raise ValueError( ++ "unexpected value for handle_failures: %r" % handle_failures) ++ + self.rewrite_links(link_repl) + +- def resolve_base_href(self): ++ def resolve_base_href(self, handle_failures=None): + """ + Find any ``<base href>`` tag in the document, and apply its + values to all links found in the document. Also remove the + tag once it has been applied. ++ ++ If ``handle_failures`` is None (default), a failure to process ++ a URL will abort the processing. If set to 'ignore', errors ++ are ignored. If set to 'discard', failing URLs will be removed. 
+ """ + base_href = None +- basetags = self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}) ++ basetags = self.xpath('//base[@href]|//x:base[@href]', ++ namespaces={'x': XHTML_NAMESPACE}) + for b in basetags: + base_href = b.get('href') + b.drop_tree() + if not base_href: + return +- self.make_links_absolute(base_href, resolve_base_href=False) ++ self.make_links_absolute(base_href, resolve_base_href=False, ++ handle_failures=handle_failures) + + def iterlinks(self): + """ +@@ -434,6 +464,7 @@ class HtmlMixin(object): + base_href, resolve_base_href=resolve_base_href) + elif resolve_base_href: + self.resolve_base_href() ++ + for el, attrib, link, pos in self.iterlinks(): + new_link = link_repl_func(link.strip()) + if new_link == link: +diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt +index 43dd99d..dd400b7 100644 +--- a/src/lxml/html/tests/test_rewritelinks.txt ++++ b/src/lxml/html/tests/test_rewritelinks.txt +@@ -185,6 +185,22 @@ An application of ``iterlinks()`` is ``make_links_absolute()``:: + + + ++If the document contains invalid links, you may choose to "discard" or "ignore" ++them by passing the respective option into the ``handle_failures`` argument:: ++ ++ >>> html = lxml.html.fromstring ('''\ ++ ...
++ ... test2 ++ ...
''') ++ ++ >>> html.make_links_absolute(base_url="http://my.little.server/url/", ++ ... handle_failures="discard") ++ ++ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode')) ++
++ test2 ++
++ + Check if we can replace multiple links inside of the same text string:: + + >>> html = lxml.html.fromstring ("""\ +@@ -209,10 +225,7 @@ Check if we can replace multiple links inside of the same text string:: + + >>> html.make_links_absolute () + +- >>> try: _unicode = unicode +- ... except NameError: _unicode = str +- +- >>> print(lxml.html.tostring (html, pretty_print = True, encoding=_unicode)) ++ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode')) + + + Test +-- +1.8.4.3 + diff --git a/python-lxml.spec b/python-lxml.spec index 40f96e9..8aee4ca 100644 --- a/python-lxml.spec +++ b/python-lxml.spec @@ -9,11 +9,12 @@ Summary: Python 2 binding for the libxml2 and libxslt libraries Summary(pl.UTF-8): Wiązanie Pythona 2 do bibliotek libxml2 i libxslt Name: python-%{module} Version: 3.2.4 -Release: 1 +Release: 2 License: BSD Group: Libraries/Python Source0: http://lxml.de/files/%{module}-%{version}.tgz # Source0-md5: cc363499060f615aca1ec8dcc04df331 +Patch0: %{name}-add-handle_failures-option-to-make_links_absolute-to.patch URL: http://lxml.de/ BuildRequires: libxml2-devel >= 1:2.7.8 BuildRequires: libxslt-devel >= 1.1.26 @@ -60,6 +61,7 @@ Dokumentacja API modułu lxml. %prep %setup -q -n %{module}-%{version} +%patch0 -p1 %build %if %{with python2} -- 2.43.0