Upstream-Status: Backport --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -70,9 +70,10 @@ _css_import_re = re.compile( # All kinds of schemes besides just javascript: that can cause # execution: -_javascript_scheme_re = re.compile( - r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I) -_substitute_whitespace = re.compile(r'\s+').sub +_is_javascript_scheme = re.compile( + r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):', + re.I).search +_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub # FIXME: should data: be blocked? # FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx @@ -467,7 +468,7 @@ class Cleaner(object): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE new = _substitute_whitespace('', link) - if _javascript_scheme_re.search(new): + if _is_javascript_scheme(new): # FIXME: should this be None to delete? return '' return link --- a/src/lxml/html/tests/test_clean.txt +++ b/src/lxml/html/tests/test_clean.txt @@ -1,3 +1,4 @@ +>>> import re >>> from lxml.html import fromstring, tostring >>> from lxml.html.clean import clean, clean_html, Cleaner >>> from lxml.html import usedoctest @@ -17,6 +18,7 @@ ... ... ... a link +... a control char link ... data ... another link ...

a paragraph

@@ -33,7 +35,7 @@ ... ... ''' ->>> print(doc) +>>> print(re.sub('[\x00-\x07\x0E]', '', doc)) @@ -49,6 +51,7 @@ a link + a control char link data another link

a paragraph

@@ -81,6 +84,7 @@ a link + a control char link data another link

a paragraph

@@ -104,6 +108,7 @@ a link + a control char link data another link

a paragraph

@@ -123,6 +128,7 @@ a link + a control char link data another link

a paragraph

@@ -146,6 +152,7 @@ a link + a control char link data another link

a paragraph