Package lxml :: Package html
[hide private]
[frames | no frames]

Source Code for Package lxml.html

   1  # Copyright (c) 2004 Ian Bicking. All rights reserved. 
   2  # 
   3  # Redistribution and use in source and binary forms, with or without 
   4  # modification, are permitted provided that the following conditions are 
   5  # met: 
   6  # 
   7  # 1. Redistributions of source code must retain the above copyright 
   8  # notice, this list of conditions and the following disclaimer. 
   9  # 
  10  # 2. Redistributions in binary form must reproduce the above copyright 
  11  # notice, this list of conditions and the following disclaimer in 
  12  # the documentation and/or other materials provided with the 
  13  # distribution. 
  14  # 
  15  # 3. Neither the name of Ian Bicking nor the names of its contributors may 
  16  # be used to endorse or promote products derived from this software 
  17  # without specific prior written permission. 
  18  # 
  19  # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
  20  # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
  21  # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
  22  # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR 
  23  # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 
  24  # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
  25  # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
  26  # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
  27  # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
  28  # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
  29  # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
  30   
  31  """The ``lxml.html`` tool set for HTML handling. 
  32  """ 
  33   
  34  import sys 
  35  import re 
  36  try: 
  37      from urlparse import urljoin 
  38  except ImportError: 
  39      # Python 3 
  40      from urllib.parse import urljoin 
  41  import copy 
  42  from lxml import etree 
  43  from lxml.html import defs 
  44  from lxml.html._setmixin import SetMixin 
  45  try: 
  46      from collections import MutableMapping as DictMixin 
  47  except ImportError: 
  48      # Python < 2.6 
  49      from UserDict import DictMixin 
  50  try: 
  51      set 
  52  except NameError: 
  53      # Python 2.3 
  54      from sets import Set as set 
  55  try: 
  56      bytes 
  57  except NameError: 
  58      # Python < 2.6 
  59      bytes = str 
  60  try: 
  61      unicode 
  62  except NameError: 
  63      # Python 3 
  64      unicode = str 
  65  try: 
  66      basestring 
  67  except NameError: 
  68      # Python 3 
  69      basestring = (str, bytes) 
  70   
def __fix_docstring(s):
    """Strip the foreign string-literal prefix from doctest output lines.

    On Python 3 a leading ``u'`` (after optional indentation) at the start
    of a line becomes ``'``; on Python 2 the same is done for ``b'``.
    Empty or ``None`` input is returned unchanged.
    """
    if not s:
        return s
    import sys
    if sys.version_info[0] >= 3:
        pattern = r"^(\s*)u'"
    else:
        pattern = r"^(\s*)b'"
    # \1 keeps the original indentation, only the prefix letter is dropped.
    return re.compile(pattern, re.M).sub(r"\1'", s)

# Public names of the package.  ('open_in_browser' used to be listed twice;
# the duplicate entry has been dropped.)
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML tag
# and the same tag in the XHTML namespace (bound to the prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches $class_name as one whitespace-separated token of @class.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# url(...) references in CSS; the argument may be double-quoted,
# single-quoted or bare.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# Runs of non-space characters.
_archive_re = re.compile(r'[^ ]+')

def _unquote_match(s, pos):
    """Strip one pair of matching quotes from ``s``.

    Returns ``(unquoted, pos + 1)`` when ``s`` starts and ends with the same
    quote character (``"`` or ``'``), otherwise ``(s, pos)`` unchanged.  The
    position moves by one to account for the removed opening quote.
    """
    first, last = s[:1], s[-1:]
    if first == last and first in ('"', "'"):
        return s[1:-1], pos + 1
    return s, pos
110
def _transform_result(typ, result):
    """Convert the result back into the input type.

    ``typ`` is the type of the original input: for a byte string type the
    result is serialised as UTF-8 bytes, for a unicode string type it is
    serialised to a unicode string, and for anything else ``result`` is
    handed back untouched.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, unicode):
        return tostring(result, encoding=unicode)
    return result
120
def _nons(tag):
    """Return ``tag`` without a leading XHTML namespace.

    String tags of the form ``{http://www.w3.org/1999/xhtml}name`` are
    reduced to ``name``.  Everything else, including non-string tags, is
    returned as is.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
126
127 -class HtmlMixin(object):
128
129 - def base_url(self):
130 """ 131 Returns the base URL, given when the page was parsed. 132 133 Use with ``urlparse.urljoin(el.base_url, href)`` to get 134 absolute URLs. 135 """ 136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__) 138
139 - def forms(self):
140 """ 141 Return a list of all the forms 142 """ 143 return _forms_xpath(self)
144 forms = property(forms, doc=forms.__doc__) 145
146 - def body(self):
147 """ 148 Return the <body> element. Can be called from a child element 149 to get the document's head. 150 """ 151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__) 153
154 - def head(self):
155 """ 156 Returns the <head> element. Can be called from a child 157 element to get the document's head. 158 """ 159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__) 161
162 - def _label__get(self):
163 """ 164 Get or set any <label> element associated with this element. 165 """ 166 id = self.get('id') 167 if not id: 168 return None 169 result = _label_xpath(self, id=id) 170 if not result: 171 return None 172 else: 173 return result[0]
174 - def _label__set(self, label):
175 id = self.get('id') 176 if not id: 177 raise TypeError( 178 "You cannot set a label for an element (%r) that has no id" 179 % self) 180 if _nons(label.tag) != 'label': 181 raise TypeError( 182 "You can only assign label to a label element (not %r)" 183 % label) 184 label.set('for', id)
185 - def _label__del(self):
186 label = self.label 187 if label is not None: 188 del label.attrib['for']
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__) 190
191 - def drop_tree(self):
192 """ 193 Removes this element from the tree, including its children and 194 text. The tail text is joined to the previous element or 195 parent. 196 """ 197 parent = self.getparent() 198 assert parent is not None 199 if self.tail: 200 previous = self.getprevious() 201 if previous is None: 202 parent.text = (parent.text or '') + self.tail 203 else: 204 previous.tail = (previous.tail or '') + self.tail 205 parent.remove(self)
206
207 - def drop_tag(self):
208 """ 209 Remove the tag, but not its children or text. The children and text 210 are merged into the parent. 211 212 Example:: 213 214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>') 215 >>> h.find('.//b').drop_tag() 216 >>> print(tostring(h, encoding=unicode)) 217 <div>Hello World!</div> 218 """ 219 parent = self.getparent() 220 assert parent is not None 221 previous = self.getprevious() 222 if self.text and isinstance(self.tag, basestring): 223 # not a Comment, etc. 224 if previous is None: 225 parent.text = (parent.text or '') + self.text 226 else: 227 previous.tail = (previous.tail or '') + self.text 228 if self.tail: 229 if len(self): 230 last = self[-1] 231 last.tail = (last.tail or '') + self.tail 232 elif previous is None: 233 parent.text = (parent.text or '') + self.tail 234 else: 235 previous.tail = (previous.tail or '') + self.tail 236 index = parent.index(self) 237 parent[index:index+1] = self[:]
238 246
247 - def find_class(self, class_name):
248 """ 249 Find any elements with the given class name. 250 """ 251 return _class_xpath(self, class_name=class_name)
252
253 - def get_element_by_id(self, id, *default):
254 """ 255 Get the first element in a document with the given id. If none is 256 found, return the default argument if provided or raise KeyError 257 otherwise. 258 259 Note that there can be more than one element with the same id, 260 and this isn't uncommon in HTML documents found in the wild. 261 Browsers return only the first match, and this function does 262 the same. 263 """ 264 try: 265 # FIXME: should this check for multiple matches? 266 # browsers just return the first one 267 return _id_xpath(self, id=id)[0] 268 except IndexError: 269 if default: 270 return default[0] 271 else: 272 raise KeyError(id)
273
274 - def text_content(self):
275 """ 276 Return the text content of the tag (and the text in any children). 277 """ 278 return _collect_string_content(self)
279
280 - def cssselect(self, expr, translator='html'):
281 """ 282 Run the CSS expression on this element and its children, 283 returning a list of the results. 284 285 Equivalent to lxml.cssselect.CSSSelect(expr, translator='html')(self) 286 -- note that pre-compiling the expression can provide a substantial 287 speedup. 288 """ 289 # Do the import here to make the dependency optional. 290 from lxml.cssselect import CSSSelector 291 return CSSSelector(expr, translator=translator)(self)
292 293 ######################################## 294 ## Link functions 295 ######################################## 296 327 elif handle_failures == 'discard': 328 def link_repl(href): 329 try: 330 return urljoin(base_url, href) 331 except ValueError: 332 return None
333 elif handle_failures is None: 334 def link_repl(href): 335 return urljoin(base_url, href) 336 else: 337 raise ValueError( 338 "unexpected value for handle_failures: %r" % handle_failures) 339 340 self.rewrite_links(link_repl) 341
342 - def resolve_base_href(self, handle_failures=None):
343 """ 344 Find any ``<base href>`` tag in the document, and apply its 345 values to all links found in the document. Also remove the 346 tag once it has been applied. 347 348 If ``handle_failures`` is None (default), a failure to process 349 a URL will abort the processing. If set to 'ignore', errors 350 are ignored. If set to 'discard', failing URLs will be removed. 351 """ 352 base_href = None 353 basetags = self.xpath('//base[@href]|//x:base[@href]', 354 namespaces={'x': XHTML_NAMESPACE}) 355 for b in basetags: 356 base_href = b.get('href') 357 b.drop_tree() 358 if not base_href: 359 return 360 self.make_links_absolute(base_href, resolve_base_href=False, 361 handle_failures=handle_failures)
362 441 490 491
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        self.name = name
        self.copy = copy
        # borrow the docstring of the wrapped method
        self.__doc__ = getattr(source_class, name).__doc__

    def __call__(self, doc, *args, **kw):
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # an explicit copy= keyword overrides the configured default
            make_a_copy = kw.pop('copy', self.copy)
            if make_a_copy:
                doc = copy.deepcopy(doc)
        method = getattr(doc, self.name)
        result = method(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is not None:
            return result
        # In-place operation: return what we got in, in the caller's type
        return _transform_result(result_type, doc)

# Function forms of the HtmlMixin methods of the same name.  Each accepts an
# element or an HTML string as first argument.  With copy=True an element
# argument is deep-copied before the method runs, unless the caller passes
# an explicit ``copy`` keyword.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)

class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also carries the ``HtmlMixin`` methods."""
536
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also carries the ``HtmlMixin`` methods."""
539
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class that also carries the ``HtmlMixin`` methods."""
542
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also carries the ``HtmlMixin`` methods."""
545 546
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Shared registry of tag name -> Element class; copied per instance when
    # no explicit ``classes`` mapping is passed.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, which broke
            # the (name, value) unpacking below; go through items() instead.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # wildcard: mix into every class known so far
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # mixins come first so they take precedence over the
                # element class in the method resolution order
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the default lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None

################################################################################
# parsing
################################################################################

# Bound ``match`` methods that test whether a string starts (after optional
# whitespace, ignoring case) with ``<html`` or ``<!doctype``.  One variant is
# compiled from a unicode pattern, the other from a bytes pattern, so that
# each input type can be checked with a pattern of the same type.
_looks_like_full_html_unicode = re.compile(
    unicode(r'^\s*<(?:html|!doctype)'), re.I).match
_looks_like_full_html_bytes = re.compile(
    r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match

def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` as a whole document and return its root element.

    ``parser`` defaults to the module's ``html_parser``; extra keyword
    arguments are passed on to ``etree.fromstring()``.  Raises
    ``etree.ParserError`` if parsing yields no root element.
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
605
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (though leading
    whitespace is removed).  If no_leading_text is true, then it will
    be an error if there is leading text, and it will always be a list
    of only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    Raises ``etree.ParserError`` if ``no_leading_text`` is true and the
    fragment starts with non-whitespace text.
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    if isinstance(html, bytes):
        if not _looks_like_full_html_bytes(html):
            # Concatenate rather than use ``bytes % bytes``: %-formatting
            # of bytes is unavailable on Python 3 before 3.5.
            html = ('<html><body>'.encode('ascii') + html +
                    '</body></html>'.encode('ascii'))
    else:
        if not _looks_like_full_html_unicode(html):
            html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [e for e in doc if _nons(e.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    if no_leading_text and body.text and body.text.strip():
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    if body.text and body.text.strip():
        elements.append(body.text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
642
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    # leading text is only acceptable when a wrapper element will hold it
    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not create_parent,
        base_url=base_url, **kw)

    if create_parent:
        if isinstance(create_parent, basestring):
            wrapper_tag = create_parent
        else:
            wrapper_tag = 'div'
        new_root = Element(wrapper_tag)
        if elements:
            leading = elements[0]
            if isinstance(leading, basestring):
                new_root.text = leading
                del elements[0]
            new_root.extend(elements)
        return new_root

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = [_element_name(e) for e in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    only = elements[0]
    if only.tail and only.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % only.tail)
    only.tail = None
    return only
688
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    if isinstance(html, bytes):
        is_full_html = _looks_like_full_html_bytes(html)
    else:
        is_full_html = _looks_like_full_html_unicode(html)
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    if is_full_html:
        return doc

    # otherwise, lets parse it out...
    bodies = doc.findall('body') or doc.findall('{%s}body' % XHTML_NAMESPACE)
    body = None
    if bodies:
        body = bodies[0]
        # Somehow there may be multiple bodies, which is bad, but just
        # smash them into one body
        for other_body in bodies[1:]:
            if other_body.text:
                if len(body):
                    body[-1].tail = (body[-1].tail or '') + other_body.text
                else:
                    body.text = (body.text or '') + other_body.text
            body.extend(other_body)
            # We'll ignore tail
            # I guess we are ignoring attributes too
            other_body.drop_tree()

    heads = doc.findall('head') or doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        for other_head in heads[1:]:
            head.extend(other_head)
            # We don't care about text or tail in a head
            other_head.drop_tree()
        return doc

    if body is None:
        return doc

    no_leading_text = not body.text or not body.text.strip()
    if (len(body) == 1 and no_leading_text
            and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]

    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
755
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    You can override the base URL with the ``base_url`` keyword.  This
    is most useful when parsing from a file-like object.
    """
    if parser is not None:
        active_parser = parser
    else:
        active_parser = html_parser
    tree = etree.parse(filename_or_url, active_parser, base_url=base_url, **kw)
    return tree
768
def _contains_block_level_tag(el):
    """Return True if ``el`` or any node below it has a block-level tag."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False
776
def _element_name(el):
    """Return a short name for ``el`` for use in error messages.

    Comments give ``'comment'``, strings give ``'string'`` and anything
    else gives its tag with any XHTML namespace removed.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
784 785 ################################################################################ 786 # form handling 787 ################################################################################ 788
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        fields = self.fields
        stale_keys = fields.keys()
        for key, new_value in value.items():
            if key in stale_keys:
                stale_keys.remove(key)
            fields[key] = new_value
        # whatever was not mentioned in the new mapping gets cleared
        for key in stale_keys:
            # Case of an unnamed input; these aren't really
            # expressed in form_values() anyway.
            if key is not None:
                fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        """Return a label for this form: its name, ``#id`` or index among the body's forms."""
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        body = self.body
        forms = list(body.iter('form'))
        if not forms:
            forms = list(body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    results.extend([(name, v) for v in value])
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.
        """
        base_url = self.base_url
        action = self.get('action')
        if not base_url or action is None:
            return action
        return urljoin(base_url, action)
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        attributes = self.attrib
        if 'action' in attributes:
            del attributes['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        method = self.get('method', 'GET')
        return method.upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

# Register FormElement as the default class for <form> tags.
HtmlElementClassLookup._default_element_classes['form'] = FormElement

def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        form = doc.forms[0]
        form.inputs['foo'].value = 'bar' # etc
        response = submit_form(form)
        doc = parse(response).getroot()
        doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    # fall back to the document URL when the form has no action
    if form.action:
        url = form.action
    else:
        url = form.base_url
    return open_http(form.method, url, values)
932
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form()``, based on ``urlopen()``.

    For ``'GET'`` the url-encoded ``values`` are appended to ``url`` as a
    query string; for any other method they are sent as the request body.
    Returns the result of ``urlopen()``.  Raises ``ValueError`` if no URL
    is given.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        if not isinstance(data, bytes):
            # On Python 3 urlencode() returns text, but urlopen() only
            # accepts a bytes request body.
            data = data.encode('ASCII')
    return urlopen(url, data)
952
class FieldsDict(DictMixin):
    """
    Dictionary-like view of the values of a form's fields.

    Wraps an inputs accessor (as returned by ``FormElement.inputs``):
    reading and writing an item goes to the ``.value`` of the input(s)
    with that name.  Keys cannot be removed.
    """

    def __init__(self, inputs):
        self.inputs = inputs
    def __getitem__(self, item):
        return self.inputs[item].value
    def __setitem__(self, item, value):
        self.inputs[item].value = value
    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")
    def keys(self):
        return self.inputs.keys()
    def __contains__(self, item):
        return item in self.inputs
    def __iter__(self):
        return iter(self.inputs.keys())
    def __len__(self):
        # Count the keys rather than calling len() on the accessor itself:
        # the accessor is not required to define __len__, and this keeps
        # len() consistent with what __iter__ yields.
        return len(self.inputs.keys())

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.inputs.form._name())
977
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If there are a set of checkboxes
    with the same name, they are returned as a list (a `CheckboxGroup`
    which also allows value setting).  Radio inputs are handled
    similarly.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        return '<%s for form %s>' % (
            self.__class__.__name__,
            self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # the type of the first match decides how a group is wrapped
        input_type = results[0].get('type')
        several = len(results) > 1
        if several and input_type == 'radio':
            group_class = RadioGroup
        elif several and input_type == 'checkbox':
            group_class = CheckboxGroup
        else:
            # I don't like throwing away elements like this
            return results[0]
        group = group_class(results)
        group.name = name
        return group

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        names = set([el.name for el in self])
        # unnamed inputs do not contribute a key
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        all_inputs = self._all_xpath(self.form)
        return iter(all_inputs)
1043
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """


    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')
    def _name__set(self, value):
        self.set('name', value)
    def _name__del(self):
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # only some input classes have a ``type``; show it when it is set
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        else:
            type_part = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
1071
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # XHTML-namespaced textareas serialise their children as XML
        if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
            serialisation_method = 'xml'
        else:
            serialisation_method = 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(
                child, method=serialisation_method, encoding=unicode)
        return content
    def _value__set(self, value):
        del self[:]
        self.text = value
    def _value__del(self):
        self.text = ''
        del self[:]
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

# Register TextareaElement as the default class for <textarea> tags.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement

class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # an option without a value attribute is identified by its text
            value = option.get('value')
            if value is None:
                value = option.text or ''
            if value:
                value = value.strip()
            return value
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        chosen = None
        if value is not None:
            value = value.strip()
            for option in _options_xpath(self):
                opt_value = option.get('value')
                if opt_value is None:
                    opt_value = option.text or ''
                if opt_value:
                    opt_value = opt_value.strip()
                if opt_value == value:
                    chosen = option
                    break
            else:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # unselect everything first, then mark the one matching option
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if self.multiple:
            self.value.clear()
        else:
            self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        options = []
        for option in _options_xpath(self):
            value = option.get('value')
            if value is None:
                value = option.text or ''
            if value:
                value = value.strip()
            options.append(value)
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib
    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1197 1198 HtmlElementClassLookup._default_element_classes['select'] = SelectElement 1199
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like object to select an option, or remove
    from it to unselect the option.
    """

    def __init__(self, select):
        # The ``<select>`` element whose options this object manipulates.
        self.select = select

    @staticmethod
    def _value_of(option):
        """Return the value of an ``<option>``: its ``value`` attribute, or
        its text when the attribute is missing; non-empty values are stripped.
        """
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = option.text or ''
        if opt_value:
            opt_value = opt_value.strip()
        return opt_value

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def __iter__(self):
        """Yield the value of every option that carries ``selected``."""
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._value_of(option)

    def add(self, item):
        """Select the first option whose value equals ``item``.

        Raises ValueError when no option has that value.
        """
        for option in self.options:
            if self._value_of(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """Unselect the first option whose value equals ``item``.

        Raises ValueError when that option is not currently selected or
        when no option has that value.
        """
        for option in self.options:
            if self._value_of(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            # Message fixed ("not option" -> "no option") to match add().
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1265
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` returns the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        # Resolve the radio to check first, so an unknown value raises
        # before any existing ``checked`` attribute is removed.
        chosen = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    chosen = radio
                    break
            if chosen is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            attributes = radio.attrib
            if 'checked' in attributes:
                del attributes['checked']
        if chosen is not None:
            chosen.set('checked', '')

    def _value__del(self):
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        contents = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, contents)
1317
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before clearing, so a rejected assignment does not
        # silently uncheck every box in the group.
        if not hasattr(value, '__iter__'):
            # An empty group has no element to take the name from.
            group_name = self[0].name if self else None
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (group_name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def _value__del(self):
        self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1356
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The sequence of checkbox elements this object reads and modifies.
        self.group = group

    def __iter__(self):
        """Iterate over the ``value`` of every checked checkbox."""
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """Check the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError when no checkbox has that value.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """Uncheck the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError when that checkbox is already unchecked or when
        no checkbox has that value.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def __repr__(self):
        group = self.group
        # The group is a plain list subclass and defines no ``name`` of its
        # own, so fall back to the name of its first checkbox rather than
        # raising AttributeError from repr().
        name = getattr(group, 'name', None)
        if name is None and len(group) > 0:
            name = getattr(group[0], 'name', None)
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            name)
1399
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    The type is available as ``.type`` (lower-cased, defaulting to
    ``'text'``).

    The value can be read and written through ``.value``.

    Checkboxes and radios have ``input.checkable == True`` (it is
    false for every other type) and a boolean attribute ``.checked``.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        # For checkable inputs the truthiness of the value drives the
        # ``checked`` state; only string values are stored as attribute.
        turn_on = bool(value)
        self.checked = turn_on
        if turn_on and isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)

    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

HtmlElementClassLookup._default_element_classes['input'] = InputElement
class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Returns None when
        the label has no (or an empty) ``for`` attribute; otherwise the
        lookup is delegated to ``self.body.get_element_by_id``.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        # NOTE(review): the result for an id that matches no element is
        # whatever get_element_by_id does without a default -- confirm.
        return self.body.get_element_by_id(target_id)

    def _for_element__set(self, other):
        # The link is expressed through the target's id, so it needs one.
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # Unlinking removes the ``for`` attribute that the getter and
        # setter use; the label's own ``id`` must stay untouched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################
def html_to_xhtml(html):
    """Move every tag of an HTML tree into the XHTML namespace.

    Accepts either an element or an object with ``getroot()``.  The tree
    is modified in place; tags that already carry a namespace and
    non-string tags are left alone.
    """
    try:
        html = html.getroot()
    except AttributeError:
        # Already an element, not a tree wrapper.
        pass
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for element in html.iter():
        name = element.tag
        if not isinstance(name, basestring):
            continue
        if name[0] == '{':
            continue
        element.tag = namespace_prefix + name
1535
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree.

    Accepts either an element or an object with ``getroot()``.  The tree
    is modified in place.
    """
    try:
        xhtml = xhtml.getroot()
    except AttributeError:
        # Already an element, not a tree wrapper.
        pass
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    cut = len(namespace_prefix)
    # Only elements inside the XHTML namespace are visited.
    for element in xhtml.iter(namespace_prefix + "*"):
        qualified = element.tag
        element.tag = qualified[cut:]
# Not a general matcher: it only targets the exact form of the tag
# that libxml2 itself serialises.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding=unicode)
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding=unicode, with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding=unicode)
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding=unicode,
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding, with_tail=with_tail,
                                doctype=doctype)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

tostring.__doc__ = __fix_docstring(tostring.__doc__)
def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This
    is mainly meant for debugging.
    """
    import os
    import tempfile
    import webbrowser
    if isinstance(doc, etree._ElementTree):
        tree = doc
    else:
        tree = etree.ElementTree(doc)
    descriptor, path = tempfile.mkstemp(suffix='.html')
    # The file itself is leaked on purpose, but the handle is always closed.
    with os.fdopen(descriptor, 'wb') as out:
        tree.write(out, method="html",
                   encoding=encoding or tree.docinfo.encoding or "UTF-8")
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1652 1653 ################################################################################ 1654 # configure Element class lookup 1655 ################################################################################ 1656
class HTMLParser(etree.HTMLParser):
    """An HTML parser that is configured to return lxml.html Element
    objects.
    """
    def __init__(self, **kwargs):
        """Pass all keyword arguments to the base parser, then install
        the lxml.html element class lookup."""
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1664
class XHTMLParser(etree.XMLParser):
    """An XML parser that is configured to return lxml.html Element
    objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """
    def __init__(self, **kwargs):
        """Pass all keyword arguments to the base parser, then install
        the lxml.html element class lookup."""
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1685
def Element(*args, **kw):
    """Create a new HTML Element.

    This can also be used for XHTML documents.
    """
    # All arguments go straight to the default HTML parser's factory.
    return html_parser.makeelement(*args, **kw)

html_parser = HTMLParser()
xhtml_parser = XHTMLParser()