Package lxml :: Package html :: Module clean
[hide private]
[frames | no frames]

Source Code for Module lxml.html.clean

  1  """A cleanup tool for HTML. 
  2   
  3  Removes unwanted tags and content.  See the `Cleaner` class for 
  4  details. 
  5  """ 
  6   
  7  import re 
  8  import copy 
  9  try: 
 10      from urlparse import urlsplit 
 11  except ImportError: 
 12      # Python 3 
 13      from urllib.parse import urlsplit 
 14  from lxml import etree 
 15  from lxml.html import defs 
 16  from lxml.html import fromstring, tostring, XHTML_NAMESPACE 
 17  from lxml.html import xhtml_to_html, _transform_result 
 18   
# Python 2/3 compatibility aliases.  Each ``try`` merely references a
# builtin name; when that raises NameError the ``except`` branch binds a
# substitute so the rest of the module can use the name unconditionally.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr
try:
    unicode
except NameError:
    # Python 3
    unicode = str
try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str
try:
    basestring
except NameError:
    # No ``basestring`` builtin: accept both text and byte strings in
    # ``isinstance(x, basestring)`` checks.
    basestring = (str, bytes)


# Public names of this module.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
 42   
 43  # Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl 
 44  #   Particularly the CSS cleaning; most of the tag cleaning is integrated now 
 45  # I have multiple kinds of schemes searched; but should schemes be 
 46  #   whitelisted instead? 
 47  # max height? 
 48  # remove images?  Also in CSS?  background attribute? 
 49  # Some way to whitelist object, iframe, etc (e.g., if you want to 
 50  #   allow *just* embedded YouTube movies) 
 51  # Log what was deleted and why? 
 52  # style="behavior: ..." might be bad in IE? 
 53  # Should we have something for just <meta http-equiv>?  That's the worst of the 
 54  #   metas. 
 55  # UTF-7 detections?  Example: 
 56  #     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- 
 57  #   you don't always have to have the charset set, if the page has no charset 
 58  #   and there's UTF7-like code in it. 
 59  # Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php 
 60   
 61   
# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript.  The match is non-greedy, so it stops at the first
# closing parenthesis after ``expression(``:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
# (``\s*`` in the pattern also matches newlines between ``@`` and
# ``import``.)
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution.  Case-insensitive; leading whitespace is allowed:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I)
# Bound ``sub`` method of a whitespace-run pattern; call it as
# ``_substitute_whitespace(replacement, text)``.
_substitute_whitespace = re.compile(r'\s+').sub
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
# Matches an ``[if ...]>`` sequence, i.e. the opener of what could be an
# IE conditional comment.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant that has a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects ``<a>`` elements (plain, or in the XHTML namespace via the ``x``
# prefix) whose whitespace-normalized ``href`` is non-empty and does not
# start with ``#``.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
 89   
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes
        stylesheets as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    """

    # Class-level defaults for the options described in the docstring.
    # Note that ``style`` is the only boolean cleaning switch that is off
    # by default.
    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    # ``None`` means "no extra tags" for the three tag lists below.
    remove_tags = None
    allow_tags = None
    kill_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    safe_attrs = defs.safe_attrs
    add_nofollow = False
    # Empty by default, i.e. no host is whitelisted for embedded content.
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])
204 - def __init__(self, **kw):
205 for name, value in kw.items(): 206 if not hasattr(self, name): 207 raise TypeError( 208 "Unknown parameter: %s=%r" % (name, value)) 209 setattr(self, name, value)
210 211 # Used to lookup the primary URL for a given tag that is up for 212 # removal: 213 _tag_link_attrs = dict( 214 script='src', 215 link='href', 216 # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html 217 # From what I can tell, both attributes can contain a link: 218 applet=['code', 'object'], 219 iframe='src', 220 embed='src', 221 layer='src', 222 # FIXME: there doesn't really seem like a general way to figure out what 223 # links an <object> tag uses; links often go in <param> tags with values 224 # that we don't really know. You'd have to have knowledge about specific 225 # kinds of plugins (probably keyed off classid), and match against those. 226 ##object=?, 227 # FIXME: not looking at the action currently, because it is more complex 228 # than than -- if you keep the form, you should keep the form controls. 229 ##form='action', 230 a='href', 231 ) 232
233 - def __call__(self, doc):
234 """ 235 Cleans the document. 236 """ 237 if hasattr(doc, 'getroot'): 238 # ElementTree instance, instead of an element 239 doc = doc.getroot() 240 # convert XHTML to HTML 241 xhtml_to_html(doc) 242 # Normalize a case that IE treats <image> like <img>, and that 243 # can confuse either this step or later steps. 244 for el in doc.iter('image'): 245 el.tag = 'img' 246 if not self.comments: 247 # Of course, if we were going to kill comments anyway, we don't 248 # need to worry about this 249 self.kill_conditional_comments(doc) 250 251 kill_tags = set(self.kill_tags or ()) 252 remove_tags = set(self.remove_tags or ()) 253 allow_tags = set(self.allow_tags or ()) 254 255 if self.scripts: 256 kill_tags.add('script') 257 if self.safe_attrs_only: 258 safe_attrs = set(self.safe_attrs) 259 for el in doc.iter(): 260 attrib = el.attrib 261 for aname in attrib.keys(): 262 if aname not in safe_attrs: 263 del attrib[aname] 264 if self.javascript: 265 if not (self.safe_attrs_only and 266 self.safe_attrs == defs.safe_attrs): 267 # safe_attrs handles events attributes itself 268 for el in doc.iter(): 269 attrib = el.attrib 270 for aname in attrib.keys(): 271 if aname.startswith('on'): 272 del attrib[aname] 273 doc.rewrite_links(self._remove_javascript_link, 274 resolve_base_href=False) 275 if not self.style: 276 # If we're deleting style then we don't have to remove JS links 277 # from styles, otherwise... 278 for el in _find_styled_elements(doc): 279 old = el.get('style') 280 new = _css_javascript_re.sub('', old) 281 new = _css_import_re.sub('', new) 282 if self._has_sneaky_javascript(new): 283 # Something tricky is going on... 
284 del el.attrib['style'] 285 elif new != old: 286 el.set('style', new) 287 for el in list(doc.iter('style')): 288 if el.get('type', '').lower().strip() == 'text/javascript': 289 el.drop_tree() 290 continue 291 old = el.text or '' 292 new = _css_javascript_re.sub('', old) 293 # The imported CSS can do anything; we just can't allow: 294 new = _css_import_re.sub('', old) 295 if self._has_sneaky_javascript(new): 296 # Something tricky is going on... 297 el.text = '/* deleted */' 298 elif new != old: 299 el.text = new 300 if self.comments or self.processing_instructions: 301 # FIXME: why either? I feel like there's some obscure reason 302 # because you can put PIs in comments...? But I've already 303 # forgotten it 304 kill_tags.add(etree.Comment) 305 if self.processing_instructions: 306 kill_tags.add(etree.ProcessingInstruction) 307 if self.style: 308 kill_tags.add('style') 309 etree.strip_attributes(doc, 'style') 310 if self.links: 311 kill_tags.add('link') 312 elif self.style or self.javascript: 313 # We must get rid of included stylesheets if Javascript is not 314 # allowed, as you can put Javascript in them 315 for el in list(doc.iter('link')): 316 if 'stylesheet' in el.get('rel', '').lower(): 317 # Note this kills alternate stylesheets as well 318 if not self.allow_element(el): 319 el.drop_tree() 320 if self.meta: 321 kill_tags.add('meta') 322 if self.page_structure: 323 remove_tags.update(('head', 'html', 'title')) 324 if self.embedded: 325 # FIXME: is <layer> really embedded? 326 # We should get rid of any <param> tags not inside <applet>; 327 # These are not really valid anyway. 
328 for el in list(doc.iter('param')): 329 found_parent = False 330 parent = el.getparent() 331 while parent is not None and parent.tag not in ('applet', 'object'): 332 parent = parent.getparent() 333 if parent is None: 334 el.drop_tree() 335 kill_tags.update(('applet',)) 336 # The alternate contents that are in an iframe are a good fallback: 337 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param')) 338 if self.frames: 339 # FIXME: ideally we should look at the frame links, but 340 # generally frames don't mix properly with an HTML 341 # fragment anyway. 342 kill_tags.update(defs.frame_tags) 343 if self.forms: 344 remove_tags.add('form') 345 kill_tags.update(('button', 'input', 'select', 'textarea')) 346 if self.annoying_tags: 347 remove_tags.update(('blink', 'marquee')) 348 349 _remove = [] 350 _kill = [] 351 for el in doc.iter(): 352 if el.tag in kill_tags: 353 if self.allow_element(el): 354 continue 355 _kill.append(el) 356 elif el.tag in remove_tags: 357 if self.allow_element(el): 358 continue 359 _remove.append(el) 360 361 if _remove and _remove[0] == doc: 362 # We have to drop the parent-most tag, which we can't 363 # do. Instead we'll rewrite it: 364 el = _remove.pop(0) 365 el.tag = 'div' 366 el.attrib.clear() 367 elif _kill and _kill[0] == doc: 368 # We have to drop the parent-most element, which we can't 369 # do. 
Instead we'll clear it: 370 el = _kill.pop(0) 371 if el.tag != 'html': 372 el.tag = 'div' 373 el.clear() 374 375 _kill.reverse() # start with innermost tags 376 for el in _kill: 377 el.drop_tree() 378 for el in _remove: 379 el.drop_tag() 380 381 if self.remove_unknown_tags: 382 if allow_tags: 383 raise ValueError( 384 "It does not make sense to pass in both allow_tags and remove_unknown_tags") 385 allow_tags = set(defs.tags) 386 if allow_tags: 387 bad = [] 388 for el in doc.iter(): 389 if el.tag not in allow_tags: 390 bad.append(el) 391 if bad: 392 if bad[0] is doc: 393 el = bad.pop(0) 394 el.tag = 'div' 395 el.attrib.clear() 396 for el in bad: 397 el.drop_tag() 398 if self.add_nofollow: 399 for el in _find_external_links(doc): 400 if not self.allow_follow(el): 401 rel = el.get('rel') 402 if rel: 403 if ('nofollow' in rel 404 and ' nofollow ' in (' %s ' % rel)): 405 continue 406 rel = '%s nofollow' % rel 407 else: 408 rel = 'nofollow' 409 el.set('rel', rel)
410
    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.

        ``__call__`` consults this for every external link when
        ``add_nofollow`` is set; a true return value leaves that
        anchor's ``rel`` attribute untouched.  This default
        implementation returns False for every anchor.
        """
        return False
416
417 - def allow_element(self, el):
418 if el.tag not in self._tag_link_attrs: 419 return False 420 attr = self._tag_link_attrs[el.tag] 421 if isinstance(attr, (list, tuple)): 422 for one_attr in attr: 423 url = el.get(one_attr) 424 if not url: 425 return False 426 if not self.allow_embedded_url(el, url): 427 return False 428 return True 429 else: 430 url = el.get(attr) 431 if not url: 432 return False 433 return self.allow_embedded_url(el, url)
434
435 - def allow_embedded_url(self, el, url):
436 if (self.whitelist_tags is not None 437 and el.tag not in self.whitelist_tags): 438 return False 439 scheme, netloc, path, query, fragment = urlsplit(url) 440 netloc = netloc.lower().split(':', 1)[0] 441 if scheme not in ('http', 'https'): 442 return False 443 if netloc in self.host_whitelist: 444 return True 445 return False
446
447 - def kill_conditional_comments(self, doc):
448 """ 449 IE conditional comments basically embed HTML that the parser 450 doesn't normally see. We can't allow anything like that, so 451 we'll kill any comments that could be conditional. 452 """ 453 bad = [] 454 self._kill_elements( 455 doc, lambda el: _conditional_comment_re.search(el.text), 456 etree.Comment)
457
458 - def _kill_elements(self, doc, condition, iterate=None):
459 bad = [] 460 for el in doc.iter(iterate): 461 if condition(el): 462 bad.append(el) 463 for el in bad: 464 el.drop_tree()
465 473 474 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub 475
476 - def _has_sneaky_javascript(self, style):
477 """ 478 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 479 can get interpreted, or ``expre/* stuff */ssion(...)``. This 480 checks for attempt to do stuff like this. 481 482 Typically the response will be to kill the entire style; if you 483 have just a bit of Javascript in the style another rule will catch 484 that and remove only the Javascript from the style; this catches 485 more sneaky attempts. 486 """ 487 style = self._substitute_comments('', style) 488 style = style.replace('\\', '') 489 style = _substitute_whitespace('', style) 490 style = style.lower() 491 if 'javascript:' in style: 492 return True 493 if 'expression(' in style: 494 return True 495 return False
496
497 - def clean_html(self, html):
498 result_type = type(html) 499 if isinstance(html, basestring): 500 doc = fromstring(html) 501 else: 502 doc = copy.deepcopy(html) 503 self(doc) 504 return _transform_result(result_type, doc)
505 506 clean = Cleaner() 507 clean_html = clean.clean_html 508 509 ############################################################ 510 ## Autolinking 511 ############################################################ 512 513 _link_regexes = [ 514 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I), 515 # This is conservative, but autolinking can be a bit conservative: 516 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I), 517 ] 518 519 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a'] 520 521 _avoid_hosts = [ 522 re.compile(r'^localhost', re.I), 523 re.compile(r'\bexample\.(?:com|org|net)$', re.I), 524 re.compile(r'^127\.0\.0\.1$'), 525 ] 526 527 _avoid_classes = ['nolink'] 528 573 631 640 641 autolink_html.__doc__ = autolink.__doc__ 642 643 ############################################################ 644 ## Word wrapping 645 ############################################################ 646 647 _avoid_word_break_elements = ['pre', 'textarea', 'code'] 648 _avoid_word_break_classes = ['nobreak'] 649
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>``, ``<pre>`` and ``<code>``, nor elements carrying one
    of the classes in avoid_classes (by default ``nobreak``).  Skipping
    an element also skips everything nested inside it.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    The element is modified in place; nothing is returned.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    #   http://www.cs.tut.fi/~jkorpela/html/nobr.html
    # Honour the caller's avoid_elements rather than the module default.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_names = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_names:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail belongs to this element's content, so it is broken
        # even when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
690
def word_break_html(html, *args, **kw):
    """
    Parse ``html``, apply ``word_break`` to it and return the result.

    Extra positional and keyword arguments are forwarded to
    ``word_break`` unchanged.  The parsed document and the type of the
    input are handed to ``_transform_result`` to build the return value.
    """
    original_type = type(html)
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    return _transform_result(original_type, tree)
696
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with ``break_character`` inserted into long words.

    Words are the whitespace-separated tokens of the original text; only
    those longer than ``max_width`` are rewritten, via ``_insert_break``.
    """
    for token in text.split():
        if len(token) <= max_width:
            continue
        # NOTE: str.replace rewrites every occurrence of the token in the
        # text, including occurrences inside other, longer words.
        text = text.replace(
            token, _insert_break(token, max_width, break_character))
    return text
704 705 _break_prefer_re = re.compile(r'[^a-z]', re.I) 706
707 -def _insert_break(word, width, break_character):
708 orig_word = word 709 result = '' 710 while len(word) > width: 711 start = word[:width] 712 breaks = list(_break_prefer_re.finditer(start)) 713 if breaks: 714 last_break = breaks[-1] 715 # Only walk back up to 10 characters to find a nice break: 716 if last_break.end() > width-10: 717 # FIXME: should the break character be at the end of the 718 # chunk, or the beginning of the next chunk? 719 start = word[:last_break.end()] 720 result += start + break_character 721 word = word[len(start):] 722 result += word 723 return result
724