1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 except ImportError:
12
13 from urllib.parse import urlsplit
14 from lxml import etree
15 from lxml.html import defs
16 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
17 from lxml.html import xhtml_to_html, _transform_result
18
# Compatibility aliases: each probe raises NameError on interpreters that
# lack the builtin, in which case an equivalent name is bound instead.
try:
    unichr
except NameError:
    # No ``unichr`` builtin (Python 3): ``chr`` is bound under that name.
    unichr = chr
try:
    unicode
except NameError:
    # No ``unicode`` builtin (Python 3): ``str`` is bound under that name.
    unicode = str
try:
    bytes
except NameError:
    # No ``bytes`` builtin: ``str`` is bound under that name.
    bytes = str
try:
    basestring
except NameError:
    # No ``basestring`` builtin: a tuple usable with ``isinstance``.
    basestring = (str, bytes)
38
39
40 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
41 'word_break', 'word_break_html']
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Matches a CSS ``expression(...)`` call, non-greedily up to the first
# closing parenthesis; case-insensitive, and ``.`` also matches newlines.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the ``@import`` keyword (whitespace allowed after the ``@``),
# case-insensitively.
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Matches one of the listed URL scheme names followed by a colon, optionally
# preceded by whitespace; case-insensitive.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|data|about|mocha):', re.I)
# Bound ``sub`` of a pattern matching runs of whitespace; call as
# ``_substitute_whitespace(replacement, text)``.
_substitute_whitespace = re.compile(r'\s+').sub

# Matches text of the form ``[if ...]>``.
# NOTE(review): judging by the name this targets IE conditional comments -- confirm.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant carrying a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects ``<a>`` elements (plain or in the XHTML namespace) whose ``href`` is
# non-blank and does not start with ``#``.
_find_external_links = etree.XPath(
    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
    namespaces={'x':XHTML_NAMESPACE})
89
91 """
Instances clean the document of each of the possible offending
93 elements. The cleaning is controlled by attributes; you can
94 override attributes in a subclass, or set them in the constructor.
95
96 ``scripts``:
97 Removes any ``<script>`` tags.
98
99 ``javascript``:
100 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
101 as they could contain Javascript.
102
103 ``comments``:
104 Removes any comments.
105
106 ``style``:
107 Removes any style tags or attributes.
108
109 ``links``:
110 Removes any ``<link>`` tags
111
112 ``meta``:
113 Removes any ``<meta>`` tags
114
115 ``page_structure``:
116 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
117
118 ``processing_instructions``:
119 Removes any processing instructions.
120
121 ``embedded``:
122 Removes any embedded objects (flash, iframes)
123
124 ``frames``:
125 Removes any frame-related tags
126
127 ``forms``:
128 Removes any form tags
129
130 ``annoying_tags``:
131 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
132
133 ``remove_tags``:
134 A list of tags to remove. Only the tags will be removed,
135 their content will get pulled up into the parent tag.
136
137 ``kill_tags``:
138 A list of tags to kill. Killing also removes the tag's content,
139 i.e. the whole subtree, not just the tag itself.
140
141 ``allow_tags``:
142 A list of tags to include (default include all).
143
144 ``remove_unknown_tags``:
145 Remove any tags that aren't standard parts of HTML.
146
147 ``safe_attrs_only``:
148 If true, only include 'safe' attributes (specifically the list
149 from the feedparser HTML sanitisation web site).
150
151 ``safe_attrs``:
152 A set of attribute names to override the default list of attributes
153 considered 'safe' (when safe_attrs_only=True).
154
155 ``add_nofollow``:
156 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
157
158 ``host_whitelist``:
159 A list or set of hosts that you can use for embedded content
160 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
161 You can also implement/override the method
162 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
163 implement more complex rules for what can be embedded.
164 Anything that passes this test will be shown, regardless of
165 the value of (for instance) ``embedded``.
166
167 Note that this parameter might not work as intended if you do not
168 make the links absolute before doing the cleaning.
169
170 Note that you may also need to set ``whitelist_tags``.
171
172 ``whitelist_tags``:
173 A set of tags that can be included with ``host_whitelist``.
174 The default is ``iframe`` and ``embed``; you may wish to
175 include other tags like ``script``, or you may want to
176 implement ``allow_embedded_url`` for more control. Set to None to
177 include all tags.
178
179 This modifies the document *in place*.
180 """
181
182 scripts = True
183 javascript = True
184 comments = True
185 style = False
186 links = True
187 meta = True
188 page_structure = True
189 processing_instructions = True
190 embedded = True
191 frames = True
192 forms = True
193 annoying_tags = True
194 remove_tags = None
195 allow_tags = None
196 kill_tags = None
197 remove_unknown_tags = True
198 safe_attrs_only = True
199 safe_attrs = defs.safe_attrs
200 add_nofollow = False
201 host_whitelist = ()
202 whitelist_tags = set(['iframe', 'embed'])
203
210
211
212
# Maps a tag name to the attribute (or list of attributes) whose value is the
# URL that ``allow_element`` passes to ``allow_embedded_url``.  Tags that are
# not listed here are never allowed through by ``allow_element``.
_tag_link_attrs = dict(
    script='src',
    link='href',
    applet=['code', 'object'],
    iframe='src',
    embed='src',
    layer='src',
    a='href',
    )
232
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` is either an element or an object exposing ``getroot()`` (in
    which case its root element is cleaned).  What gets stripped is
    decided by the instance attributes (``scripts``, ``javascript``,
    ``style``, ``links``, ``embedded``, ...).

    Elements in the "kill" set are dropped together with their content;
    elements in the "remove" set are dropped but their content is pulled
    up into the parent.  ``allow_element`` can veto either for a given
    element.

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.
    """
    # NOTE(review): the ``def`` line is missing from this file; the name
    # ``__call__`` is inferred from the module-level ``clean = Cleaner()``
    # being exported as a callable -- confirm.
    if hasattr(doc, 'getroot'):
        # A tree object rather than an element: work on its root.
        doc = doc.getroot()
    xhtml_to_html(doc)
    # Normalise <image> to <img> so later steps only have to deal with one.
    for el in doc.iter('image'):
        el.tag = 'img'
    if not self.comments:
        # Comments are being kept, so conditional comments have to be
        # handled separately.
        self.kill_conditional_comments(doc)

    kill_tags = set(self.kill_tags or ())
    remove_tags = set(self.remove_tags or ())
    allow_tags = set(self.allow_tags or ())

    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(self.safe_attrs)
        for el in doc.iter():
            attrib = el.attrib
            # Snapshot the names: attributes are deleted during the loop.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not (self.safe_attrs_only and
                self.safe_attrs == defs.safe_attrs):
            # The default safe-attribute pass above already ran in the
            # other case; otherwise strip every ``on*`` attribute here.
            for el in doc.iter():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # Styles are being kept, so scrub script out of them instead.
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Obfuscated script survived: drop the whole attribute.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            for el in list(doc.iter('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                # Chain on ``new`` (not ``old``) so the expression()
                # removal above is not thrown away.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Obfuscated script survived: blank the stylesheet.
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # NOTE(review): comments are also killed when only
        # ``processing_instructions`` is set; the reason is not evident
        # from this file -- confirm before changing.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        etree.strip_attributes(doc, 'style')
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # <link> tags are kept in general, but stylesheet links still go
        # (including alternate stylesheets) unless explicitly allowed.
        for el in list(doc.iter('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                if not self.allow_element(el):
                    el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        # Drop every <param> that has no <applet>/<object> ancestor.
        for el in list(doc.iter('param')):
            parent = el.getparent()
            while parent is not None and parent.tag not in ('applet', 'object'):
                parent = parent.getparent()
            if parent is None:
                el.drop_tree()
        kill_tags.update(('applet',))
        # These are removed rather than killed, so their fallback content
        # stays in the document.
        remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        remove_tags.update(('blink', 'marquee'))

    _remove = []
    _kill = []
    for el in doc.iter():
        if el.tag in kill_tags:
            if self.allow_element(el):
                continue
            _kill.append(el)
        elif el.tag in remove_tags:
            if self.allow_element(el):
                continue
            _remove.append(el)

    if _remove and _remove[0] == doc:
        # The top-most element cannot be dropped; turn it into a bare
        # <div> instead.
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        # The top-most element cannot be dropped; empty it instead.
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    # Document order lists ancestors first; reverse so the innermost
    # elements are dropped first.
    _kill.reverse()
    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        if allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")
        allow_tags = set(defs.tags)
    if allow_tags:
        bad = []
        for el in doc.iter():
            if el.tag not in allow_tags:
                bad.append(el)
        if bad:
            if bad[0] is doc:
                # Same top-most special case as above.
                el = bad.pop(0)
                el.tag = 'div'
                el.attrib.clear()
            for el in bad:
                el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            if not self.allow_follow(el):
                rel = el.get('rel')
                if rel:
                    # Skip anchors already carrying a whole-word
                    # "nofollow" token.
                    if ('nofollow' in rel
                            and ' nofollow ' in (' %s ' % rel)):
                        continue
                    rel = '%s nofollow' % rel
                else:
                    rel = 'nofollow'
                el.set('rel', rel)
410
def allow_follow(self, anchor):
    """
    Override to suppress rel="nofollow" on some anchors.

    When ``add_nofollow`` is set, ``rel="nofollow"`` is added to an
    external anchor only if this returns a false value for it.  This
    default implementation always returns ``False``.
    """
    # NOTE(review): the ``def`` line is missing from this file; the
    # parameter name ``anchor`` is assumed (the visible call site passes
    # the element positionally) -- confirm.
    return False
416
def allow_element(self, el):
    """
    Decide whether ``el`` may stay even though its tag is being stripped.

    Looks up the URL-bearing attribute(s) for ``el.tag`` in
    ``self._tag_link_attrs`` and returns True only when every such
    attribute is present, non-empty and accepted by
    ``self.allow_embedded_url(el, url)``.  Tags without an entry in the
    table are never allowed.
    """
    if el.tag not in self._tag_link_attrs:
        return False
    attr = self._tag_link_attrs[el.tag]
    if isinstance(attr, (list, tuple)):
        # Several URL attributes: all of them must be set and acceptable.
        for one_attr in attr:
            url = el.get(one_attr)
            if not url or not self.allow_embedded_url(el, url):
                return False
        return True
    url = el.get(attr)
    if not url:
        return False
    return self.allow_embedded_url(el, url)
434
def allow_embedded_url(self, el, url):
    """
    Decide whether ``url`` may be embedded through element ``el``.

    Returns True only when all of the following hold:

    * ``self.whitelist_tags`` is None or contains ``el.tag``;
    * the URL scheme is ``http`` or ``https``;
    * the URL's host (lower-cased, without userinfo or port) is in
      ``self.host_whitelist``.
    """
    if (self.whitelist_tags is not None
            and el.tag not in self.whitelist_tags):
        return False
    parts = urlsplit(url)
    if parts.scheme not in ('http', 'https'):
        return False
    # Use the parsed hostname rather than slicing the netloc at the first
    # colon: for "http://good.com:pw@evil.com/" the netloc starts with the
    # userinfo "good.com:pw", while the real host is "evil.com".
    try:
        host = parts.hostname
    except ValueError:
        # Malformed authority (e.g. unbalanced IPv6 brackets).
        return False
    if host is None:
        return False
    return host in self.host_whitelist
446
457
459 bad = []
460 for el in doc.iter(iterate):
461 if condition(el):
462 bad.append(el)
463 for el in bad:
464 el.drop_tree()
465
473
474 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
475
def _has_sneaky_javascript(self, style):
    """
    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``.  This
    checks for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    Returns True when, after removing CSS comments, backslashes and all
    whitespace and lower-casing the result, the text contains
    ``javascript:`` or ``expression(``.
    """
    style = self._substitute_comments('', style)
    style = style.replace('\\', '')
    style = _substitute_whitespace('', style)
    style = style.lower()
    if 'javascript:' in style:
        return True
    if 'expression(' in style:
        return True
    return False
496
505
# Module-level cleaner built with the default settings, plus a shortcut to
# its bound ``clean_html`` method; both names are listed in ``__all__``.
clean = Cleaner()
clean_html = clean.clean_html
508
509
510
511
512
# Default patterns for text that should become a link.  Each pattern defines
# the named groups ``host`` (tested against the avoid-hosts patterns) and
# ``body`` (used as the anchor text); the whole match becomes the ``href``.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
    # For mailto links the ``body`` group excludes the ``mailto:`` prefix.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# Tag names whose content is left alone when autolinking.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns that are never turned into links.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# Elements carrying one of these classes are left alone when autolinking.
_avoid_classes = ['nolink']
528
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (default localhost, 127.0.0.1 and example.com/.org/.net).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.

    The element is modified in place; nothing is returned.
    """
    # NOTE(review): the ``def`` line is missing from this file; parameter
    # names come from the recursive keyword call below, while their order
    # and the module-constant defaults are assumed -- confirm.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        for match_class in avoid_classes:
            if match_class in classes:
                return
    # Iterate over a snapshot: new <a> children are inserted as we go.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if child.tail:
            text, tail_children = _link_text(
                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
            if tail_children:
                child.tail = text
                # Splice the new anchors in right after this child.
                index = el.index(child)
                el[index+1:index+1] = tail_children
    if el.text:
        text, pre_children = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children
573
574 -def _link_text(text, link_regexes, avoid_hosts, factory):
575 leading_text = ''
576 links = []
577 last_pos = 0
578 while 1:
579 best_match, best_pos = None, None
580 for regex in link_regexes:
581 regex_pos = last_pos
582 while 1:
583 match = regex.search(text, pos=regex_pos)
584 if match is None:
585 break
586 host = match.group('host')
587 for host_regex in avoid_hosts:
588 if host_regex.search(host):
589 regex_pos = match.end()
590 break
591 else:
592 break
593 if match is None:
594 continue
595 if best_pos is None or match.start() < best_pos:
596 best_match = match
597 best_pos = match.start()
598 if best_match is None:
599
600 if links:
601 assert not links[-1].tail
602 links[-1].tail = text
603 else:
604 assert not leading_text
605 leading_text = text
606 break
607 link = best_match.group(0)
608 end = best_match.end()
609 if link.endswith('.') or link.endswith(','):
610
611 end -= 1
612 link = link[:-1]
613 prev_text = text[:best_match.start()]
614 if links:
615 assert not links[-1].tail
616 links[-1].tail = prev_text
617 else:
618 assert not leading_text
619 leading_text = prev_text
620 anchor = factory('a')
621 anchor.set('href', link)
622 body = best_match.group('body')
623 if not body:
624 body = link
625 if body.endswith('.') or body.endswith(','):
626 body = body[:-1]
627 anchor.text = body
628 links.append(anchor)
629 text = text[end:]
630 return leading_text, links
631
640
641 autolink_html.__doc__ = autolink.__doc__
642
643
644
645
646
647 _avoid_word_break_elements = ['pre', 'textarea', 'code']
648 _avoid_word_break_classes = ['nobreak']
649
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<pre>``, ``<textarea>`` and ``<code>``, nor elements carrying one
    of the classes in avoid_classes.

    Breaks words by inserting break_character, described here as the
    Zero Width Space character (U+200B).  This generally takes up no
    space in rendering, but does copy as a space, and in monospace
    contexts usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion

    The element is modified in place; nothing is returned.
    """
    # NOTE(review): the ``def`` line is missing from this file.  Parameter
    # names come from the recursive keyword call below; their order, the
    # ``max_width=40`` default and the exact ``break_character`` default
    # are assumed -- confirm.
    #
    # Honour the caller's ``avoid_elements`` rather than the module-level
    # default list, so a custom list takes effect at every level.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        classes = class_name.split()
        if any(avoid in classes for avoid in avoid_classes):
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # The tail is text of *this* element, so it is broken here even
        # when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
690
696
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every whitespace-delimited word longer than
    ``max_width`` passed through ``_insert_break``.

    Whitespace and shorter words are left untouched.  Each word is
    rewritten where it stands, so a long word that also occurs inside
    another token does not disturb that token.
    """
    def break_long_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    # A function replacement is inserted literally, so backslashes in the
    # break character need no escaping.
    return re.sub(r'\S+', break_long_word, text)
704
705 _break_prefer_re = re.compile(r'[^a-z]', re.I)
706
708 orig_word = word
709 result = ''
710 while len(word) > width:
711 start = word[:width]
712 breaks = list(_break_prefer_re.finditer(start))
713 if breaks:
714 last_break = breaks[-1]
715
716 if last_break.end() > width-10:
717
718
719 start = word[:last_break.end()]
720 result += start + break_character
721 word = word[len(start):]
722 result += word
723 return result
724