1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 """The ``lxml.html`` tool set for HTML handling.
32 """
33
34 import sys
35 import re
36 try:
37 from urlparse import urljoin
38 except ImportError:
39
40 from urllib.parse import urljoin
41 import copy
42 from lxml import etree
43 from lxml.html import defs
44 from lxml.html._setmixin import SetMixin
45 try:
46 from collections import MutableMapping as DictMixin
47 except ImportError:
48
49 from UserDict import DictMixin
50 try:
51 set
52 except NameError:
53
54 from sets import Set as set
55 try:
56 bytes
57 except NameError:
58
59 bytes = str
60 try:
61 unicode
62 except NameError:
63
64 unicode = str
65 try:
66 basestring
67 except NameError:
68
69 basestring = (str, bytes)
70
72 if not s:
73 return s
74 import sys
75 if sys.version_info[0] >= 3:
76 sub = re.compile(r"^(\s*)u'", re.M).sub
77 else:
78 sub = re.compile(r"^(\s*)b'", re.M).sub
79 return sub(r"\1'", s)
80
# Names exported by ``from lxml.html import *``.  Each name appears exactly
# once ('open_in_browser' used to be listed twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring',
    'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser',
    'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse',
]
86
87 XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
88
89 _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
90 namespaces={'x':XHTML_NAMESPACE})
91 _options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
92 namespaces={'x':XHTML_NAMESPACE})
93 _forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
94 namespaces={'x':XHTML_NAMESPACE})
95
96 _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
97 _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
98 _collect_string_content = etree.XPath("string()")
99 _iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
100 _iter_css_imports = re.compile(r'@import "(.*?)"').finditer
101 _label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
102 namespaces={'x':XHTML_NAMESPACE})
103 _archive_re = re.compile(r'[^ ]+')
104
106 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
107 return s[1:-1], pos+1
108 else:
109 return s,pos
110
120
126
128
130 """
131 Returns the base URL, given when the page was parsed.
132
133 Use with ``urlparse.urljoin(el.base_url, href)`` to get
134 absolute URLs.
135 """
136 return self.getroottree().docinfo.URL
137 base_url = property(base_url, doc=base_url.__doc__)
138
144 forms = property(forms, doc=forms.__doc__)
145
147 """
148 Return the <body> element. Can be called from a child element
149 to get the document's body.
150 """
151 return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
152 body = property(body, doc=body.__doc__)
153
155 """
156 Returns the <head> element. Can be called from a child
157 element to get the document's head.
158 """
159 return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
160 head = property(head, doc=head.__doc__)
161
163 """
164 Get or set any <label> element associated with this element.
165 """
166 id = self.get('id')
167 if not id:
168 return None
169 result = _label_xpath(self, id=id)
170 if not result:
171 return None
172 else:
173 return result[0]
175 id = self.get('id')
176 if not id:
177 raise TypeError(
178 "You cannot set a label for an element (%r) that has no id"
179 % self)
180 if _nons(label.tag) != 'label':
181 raise TypeError(
182 "You can only assign label to a label element (not %r)"
183 % label)
184 label.set('for', id)
189 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
190
192 """
193 Removes this element from the tree, including its children and
194 text. The tail text is joined to the previous element or
195 parent.
196 """
197 parent = self.getparent()
198 assert parent is not None
199 if self.tail:
200 previous = self.getprevious()
201 if previous is None:
202 parent.text = (parent.text or '') + self.tail
203 else:
204 previous.tail = (previous.tail or '') + self.tail
205 parent.remove(self)
206
208 """
209 Remove the tag, but not its children or text. The children and text
210 are merged into the parent.
211
212 Example::
213
214 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
215 >>> h.find('.//b').drop_tag()
216 >>> print(tostring(h, encoding='unicode'))
217 <div>Hello World!</div>
218 """
219 parent = self.getparent()
220 assert parent is not None
221 previous = self.getprevious()
222 if self.text and isinstance(self.tag, basestring):
223
224 if previous is None:
225 parent.text = (parent.text or '') + self.text
226 else:
227 previous.tail = (previous.tail or '') + self.text
228 if self.tail:
229 if len(self):
230 last = self[-1]
231 last.tail = (last.tail or '') + self.tail
232 elif previous is None:
233 parent.text = (parent.text or '') + self.tail
234 else:
235 previous.tail = (previous.tail or '') + self.tail
236 index = parent.index(self)
237 parent[index:index+1] = self[:]
238
240 """
241 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
242 """
243 rel = rel.lower()
244 return [el for el in _rel_links_xpath(self)
245 if el.get('rel').lower() == rel]
246
248 """
249 Find any elements with the given class name.
250 """
251 return _class_xpath(self, class_name=class_name)
252
254 """
255 Get the first element in a document with the given id. If none is
256 found, return the default argument if provided or raise KeyError
257 otherwise.
258
259 Note that there can be more than one element with the same id,
260 and this isn't uncommon in HTML documents found in the wild.
261 Browsers return only the first match, and this function does
262 the same.
263 """
264 try:
265
266
267 return _id_xpath(self, id=id)[0]
268 except IndexError:
269 if default:
270 return default[0]
271 else:
272 raise KeyError(id)
273
def text_content(self):
    """
    Return the text of this element together with the text of all of
    its descendants, as computed by the XPath ``string()`` function.
    """
    collected = _collect_string_content(self)
    return collected
279
def cssselect(self, expr, translator='html'):
    """
    Evaluate the CSS selector ``expr`` against this element and its
    children and return the list of results.

    This is shorthand for
    ``lxml.cssselect.CSSSelector(expr, translator=translator)(self)``.
    A new selector object is built on every call, so pre-compiling a
    ``CSSSelector`` gives a substantial speedup when it is reused.
    """
    # Function-level import: lxml.cssselect is only loaded when this
    # method is actually called.
    from lxml.cssselect import CSSSelector
    selector = CSSSelector(expr, translator=translator)
    return selector(self)
292
293
294
295
296
def make_links_absolute(self, base_url=None, resolve_base_href=True,
                        handle_failures=None):
    """
    Make all links in the document absolute, given the
    ``base_url`` for the document (the full URL where the document
    came from), or if no ``base_url`` is given, then the ``.base_url``
    of the document.

    If ``resolve_base_href`` is true, then any ``<base href>``
    tags in the document are used *and* removed from the document.
    If it is false then any such tag is ignored.

    If ``handle_failures`` is None (default), a failure to process
    a URL will abort the processing.  If set to 'ignore', errors
    are ignored.  If set to 'discard', failing URLs will be removed.
    The same policy is applied while resolving ``<base href>`` tags.

    Raises TypeError if neither ``base_url`` nor the document's own
    ``.base_url`` is available, and ValueError for an unknown
    ``handle_failures`` value.  Both are raised before the document
    is modified.
    """
    if base_url is None:
        base_url = self.base_url
        if base_url is None:
            raise TypeError(
                "No base_url given, and the document has no base_url")

    # Choose the replacement function (and thereby validate
    # ``handle_failures``) before touching the document, so that a bad
    # argument cannot leave the tree half-processed.
    if handle_failures is None:
        def link_repl(href):
            return urljoin(base_url, href)
    elif handle_failures == 'ignore':
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # keep the link as it is
                return href
    elif handle_failures == 'discard':
        def link_repl(href):
            try:
                return urljoin(base_url, href)
            except ValueError:
                # None tells rewrite_links() to remove the link
                return None
    else:
        raise ValueError(
            "unexpected value for handle_failures: %r" % handle_failures)

    if resolve_base_href:
        self.resolve_base_href(handle_failures=handle_failures)

    # <base href> tags have been dealt with above, or are to be ignored
    # on request.  rewrite_links() would resolve them by default, so it
    # must be told not to.
    self.rewrite_links(link_repl, resolve_base_href=False)
341
343 """
344 Find any ``<base href>`` tag in the document, and apply its
345 values to all links found in the document. Also remove the
346 tag once it has been applied.
347
348 If ``handle_failures`` is None (default), a failure to process
349 a URL will abort the processing. If set to 'ignore', errors
350 are ignored. If set to 'discard', failing URLs will be removed.
351 """
352 base_href = None
353 basetags = self.xpath('//base[@href]|//x:base[@href]',
354 namespaces={'x': XHTML_NAMESPACE})
355 for b in basetags:
356 base_href = b.get('href')
357 b.drop_tree()
358 if not base_href:
359 return
360 self.make_links_absolute(base_href, resolve_base_href=False,
361 handle_failures=handle_failures)
362
364 """
365 Yield (element, attribute, link, pos), where attribute may be None
366 (indicating the link is in the text). ``pos`` is the position
367 where the link occurs; often 0, but sometimes something else in
368 the case of links in stylesheets or style tags.
369
370 Note: <base href> is *not* taken into account in any way. The
371 link you get is exactly the link in the document.
372
373 Note: multiple links inside of a single text string or
374 attribute value are returned in reversed order. This makes it
375 possible to replace or delete them from the text string value
376 based on their reported text positions. Otherwise, a
377 modification at one text position can change the positions of
378 links reported later on.
379 """
380 link_attrs = defs.link_attrs
381 for el in self.iter(etree.Element):
382 attribs = el.attrib
383 tag = _nons(el.tag)
384 if tag == 'object':
385 codebase = None
386
387
388 if 'codebase' in attribs:
389 codebase = el.get('codebase')
390 yield (el, 'codebase', codebase, 0)
391 for attrib in ('classid', 'data'):
392 if attrib in attribs:
393 value = el.get(attrib)
394 if codebase is not None:
395 value = urljoin(codebase, value)
396 yield (el, attrib, value, 0)
397 if 'archive' in attribs:
398 for match in _archive_re.finditer(el.get('archive')):
399 value = match.group(0)
400 if codebase is not None:
401 value = urljoin(codebase, value)
402 yield (el, 'archive', value, match.start())
403 else:
404 for attrib in link_attrs:
405 if attrib in attribs:
406 yield (el, attrib, attribs[attrib], 0)
407 if tag == 'meta':
408 http_equiv = attribs.get('http-equiv', '').lower()
409 if http_equiv == 'refresh':
410 content = attribs.get('content', '')
411 i = content.find(';')
412 url = content[i+1:] if i >= 0 else content
413 if url[:4].lower() == 'url=':
414 url = url[4:]
415
416
417
418 if url:
419 url, pos = _unquote_match(url, i + 5)
420 yield (el, 'content', url, pos)
421 elif tag == 'param':
422 valuetype = el.get('valuetype') or ''
423 if valuetype.lower() == 'ref':
424
425
426
427
428
429
430 yield (el, 'value', el.get('value'), 0)
431 elif tag == 'style' and el.text:
432 urls = [
433
434 _unquote_match(match.group(1), match.start(1))[::-1]
435 for match in _iter_css_urls(el.text)
436 ] + [
437 (match.start(1), match.group(1))
438 for match in _iter_css_imports(el.text)
439 ]
440 if urls:
441
442
443
444 urls.sort(reverse=True)
445 for start, url in urls:
446 yield (el, None, url, start)
447 if 'style' in attribs:
448 urls = list(_iter_css_urls(attribs['style']))
449 if urls:
450
451 for match in urls[::-1]:
452 url, start = _unquote_match(match.group(1), match.start(1))
453 yield (el, 'style', url, start)
454
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called (with surrounding
    whitespace stripped from the link), and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.  Any further links that
    were found in that same attribute or text are then skipped,
    since the content they refer to no longer exists.
    """
    if base_href is not None:
        # Make the links absolute first so that link_repl_func only
        # ever sees values already joined with base_href.
        self.make_links_absolute(
            base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()

    # (element, attribute) pairs whose content was discarded.  One
    # attribute or text can hold several links ('archive', CSS); once
    # it is gone the remaining positions are meaningless.
    removed = set()

    for el, attrib, link, pos in self.iterlinks():
        if (el, attrib) in removed:
            continue
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or the element's text content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            removed.add((el, attrib))
            continue

        if attrib is None:
            el.text = el.text[:pos] + new_link + el.text[pos+len(link):]
        else:
            cur = el.get(attrib)
            if not pos and len(cur) == len(link):
                # the link is the whole attribute value
                new = new_link
            else:
                new = cur[:pos] + new_link + cur[pos+len(link):]
            el.set(attrib, new)
503
504
506 """
507 An object that represents a method on an element as a function;
508 the function takes either an element or an HTML string. It
509 returns whatever the function normally returns, or if the function
510 works in-place (and so returns None) it returns a serialized form
511 of the resulting document.
512 """
518 result_type = type(doc)
519 if isinstance(doc, basestring):
520 if 'copy' in kw:
521 raise TypeError(
522 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
523 doc = fromstring(doc, **kw)
524 else:
525 if 'copy' in kw:
526 make_a_copy = kw.pop('copy')
527 else:
528 make_a_copy = self.copy
529 if make_a_copy:
530 doc = copy.deepcopy(doc)
531 meth = getattr(doc, self.name)
532 result = meth(*args, **kw)
533
534 if result is None:
535
536 return _transform_result(result_type, doc)
537 else:
538 return result
539
540 find_rel_links = _MethodFunc('find_rel_links', copy=False)
541 find_class = _MethodFunc('find_class', copy=False)
542 make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
543 resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
544 iterlinks = _MethodFunc('iterlinks', copy=False)
545 rewrite_links = _MethodFunc('rewrite_links', copy=True)
546
549
552
555
558
559
561 """A lookup scheme for HTML Element classes.
562
563 To create a lookup instance with different Element classes, pass a tag
564 name mapping of Element classes in the ``classes`` keyword argument and/or
565 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
566 The special key '*' denotes a Mixin class that should be mixed into all
567 Element classes.
568 """
569 _default_element_classes = {}
570
571 - def __init__(self, classes=None, mixins=None):
588
589 - def lookup(self, node_type, document, namespace, name):
600
601
602
603
604
605 _looks_like_full_html_unicode = re.compile(
606 unicode(r'^\s*<(?:html|!doctype)'), re.I).match
607 _looks_like_full_html_bytes = re.compile(
608 r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
609
622
625 """
626 Parses several HTML elements, returning a list of elements.
627
628 The first item in the list may be a string (though leading
629 whitespace is removed). If no_leading_text is true, then it will
630 be an error if there is leading text, and it will always be a list
631 of only elements.
632
633 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
634 """
635 if parser is None:
636 parser = html_parser
637
638 if isinstance(html, bytes):
639 if not _looks_like_full_html_bytes(html):
640
641 html = ('<html><body>'.encode('ascii') + html +
642 '</body></html>'.encode('ascii'))
643 else:
644 if not _looks_like_full_html_unicode(html):
645 html = '<html><body>%s</body></html>' % html
646 doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
647 assert _nons(doc.tag) == 'html'
648 bodies = [e for e in doc if _nons(e.tag) == 'body']
649 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
650 body = bodies[0]
651 elements = []
652 if no_leading_text and body.text and body.text.strip():
653 raise etree.ParserError(
654 "There is leading text: %r" % body.text)
655 if body.text and body.text.strip():
656 elements.append(body.text)
657 elements.extend(body)
658
659
660 return elements
661
664 """
665 Parses a single HTML element; it is an error if there is more than
666 one element, or if anything but whitespace precedes or follows the
667 element.
668
669 If ``create_parent`` is true (or is a tag name) then a parent node
670 will be created to encapsulate the HTML in a single element. In this
671 case, leading or trailing text is also allowed, as are multiple elements
672 as result of the parsing.
673
674 Passing a ``base_url`` will set the document's ``base_url`` attribute
675 (and the tree's docinfo.URL).
676 """
677 if parser is None:
678 parser = html_parser
679
680 accept_leading_text = bool(create_parent)
681
682 elements = fragments_fromstring(
683 html, parser=parser, no_leading_text=not accept_leading_text,
684 base_url=base_url, **kw)
685
686 if create_parent:
687 if not isinstance(create_parent, basestring):
688 create_parent = 'div'
689 new_root = Element(create_parent)
690 if elements:
691 if isinstance(elements[0], basestring):
692 new_root.text = elements[0]
693 del elements[0]
694 new_root.extend(elements)
695 return new_root
696
697 if not elements:
698 raise etree.ParserError('No elements found')
699 if len(elements) > 1:
700 raise etree.ParserError(
701 "Multiple elements found (%s)"
702 % ', '.join([_element_name(e) for e in elements]))
703 el = elements[0]
704 if el.tail and el.tail.strip():
705 raise etree.ParserError(
706 "Element followed by text: %r" % el.tail)
707 el.tail = None
708 return el
709
710 -def fromstring(html, base_url=None, parser=None, **kw):
776
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse an HTML document from a filename, a URL or a file-like
    object and return the resulting document tree.

    The return value is a tree, not an element; call
    ``parse(...).getroot()`` to obtain the document root.

    ``base_url`` overrides the base URL of the document, which is
    most useful when parsing from a file-like object.  When no
    ``parser`` is given, the module-level ``html_parser`` is used.
    Additional keyword arguments are passed on to ``etree.parse()``.
    """
    chosen_parser = html_parser if parser is None else parser
    return etree.parse(filename_or_url, chosen_parser,
                       base_url=base_url, **kw)
789
797
799 if isinstance(el, etree.CommentBase):
800 return 'comment'
801 elif isinstance(el, basestring):
802 return 'string'
803 else:
804 return _nons(el.tag)
805
806
807
808
809
914
915 HtmlElementClassLookup._default_element_classes['form'] = FormElement
916
953
955 if not url:
956 raise ValueError("cannot submit, no URL provided")
957
958 try:
959 from urllib import urlencode, urlopen
960 except ImportError:
961 from urllib.request import urlopen
962 from urllib.parse import urlencode
963 if method == 'GET':
964 if '?' in url:
965 url += '&'
966 else:
967 url += '?'
968 url += urlencode(values)
969 data = None
970 else:
971 data = urlencode(values)
972 return urlopen(url, data)
973
975
983 raise KeyError(
984 "You cannot remove keys from ElementDict")
988 return item in self.inputs
993
995 return '<%s for form %s>' % (
996 self.__class__.__name__,
997 self.inputs.form._name())
998
1064
1092
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  The name is available as ``.name`` and
    the content can be read and replaced through ``.value``.
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Elements in the XHTML namespace are serialised as XML, all
        # others as HTML.
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        if self.tag.startswith(xhtml_prefix):
            method = 'xml'
        else:
            method = 'html'
        # The value is the leading text followed by the markup of every
        # child element.
        pieces = [self.text or '']
        for child in self:
            pieces.append(
                etree.tostring(child, method=method, encoding='unicode'))
        return ''.join(pieces)

    def _value__set(self, value):
        # Drop all child elements, then store the new value as text.
        del self[:]
        self.text = value

    def _value__del(self):
        # Deleting the value empties the element: no text, no children.
        self.text = ''
        del self[:]

    value = property(_value__get, _value__set, _value__del,
                     doc=_value__get.__doc__)
1120
1121 HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
1122
1124 """
1125 ``<select>`` element. You can get the name with ``.name``.
1126
1127 ``.value`` will be the value of the selected option, unless this
1128 is a multi-select element (``<select multiple>``), in which case
1129 it will be a set-like object. In either case ``.value_options``
1130 gives the possible values.
1131
1132 The boolean attribute ``.multiple`` shows if this is a
1133 multi-select.
1134 """
1135
1137 """
1138 Get/set the value of this select (the selected option).
1139
1140 If this is a multi-select, this is a set-like object that
1141 represents all the selected options.
1142 """
1143 if self.multiple:
1144 return MultipleSelectOptions(self)
1145 for el in _options_xpath(self):
1146 if el.get('selected') is not None:
1147 value = el.get('value')
1148 if value is None:
1149 value = el.text or ''
1150 if value:
1151 value = value.strip()
1152 return value
1153 return None
1154
1156 if self.multiple:
1157 if isinstance(value, basestring):
1158 raise TypeError(
1159 "You must pass in a sequence")
1160 self.value.clear()
1161 self.value.update(value)
1162 return
1163 if value is not None:
1164 value = value.strip()
1165 for el in _options_xpath(self):
1166 opt_value = el.get('value')
1167 if opt_value is None:
1168 opt_value = el.text or ''
1169 if opt_value:
1170 opt_value = opt_value.strip()
1171 if opt_value == value:
1172 checked_option = el
1173 break
1174 else:
1175 raise ValueError(
1176 "There is no option with the value of %r" % value)
1177 for el in _options_xpath(self):
1178 if 'selected' in el.attrib:
1179 del el.attrib['selected']
1180 if value is not None:
1181 checked_option.set('selected', '')
1182
1189
1190 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1191
1206 value_options = property(value_options, doc=value_options.__doc__)
1207
1209 """
1210 Boolean attribute: is there a ``multiple`` attribute on this element.
1211 """
1212 return 'multiple' in self.attrib
1214 if value:
1215 self.set('multiple', '')
1216 elif 'multiple' in self.attrib:
1217 del self.attrib['multiple']
1218 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
1219
1220 HtmlElementClassLookup._default_element_classes['select'] = SelectElement
1221
1223 """
1224 Represents all the selected options in a ``<select multiple>`` element.
1225
1226 You can add to this set-like object to select an option, or remove
1227 to unselect the option.
1228 """
1229
1231 self.select = select
1232
1234 """
1235 Iterator of all the ``<option>`` elements.
1236 """
1237 return iter(_options_xpath(self.select))
1238 options = property(options)
1239
1241 for option in self.options:
1242 if 'selected' in option.attrib:
1243 opt_value = option.get('value')
1244 if opt_value is None:
1245 opt_value = option.text or ''
1246 if opt_value:
1247 opt_value = opt_value.strip()
1248 yield opt_value
1249
def add(self, item):
    """
    Select the first option whose value equals ``item``.

    An option's value is its ``value`` attribute or, if it has none,
    its text; non-empty values are compared with surrounding
    whitespace stripped.  Raises ValueError if no option matches.
    """
    for candidate in self.options:
        current = candidate.get('value')
        if current is None:
            current = candidate.text or ''
        current = current.strip() if current else current
        if current != item:
            continue
        candidate.set('selected', '')
        return
    raise ValueError(
        "There is no option with the value %r" % item)
1263
1265 for option in self.options:
1266 opt_value = option.get('value')
1267 if opt_value is None:
1268 opt_value = option.text or ''
1269 if opt_value:
1270 opt_value = opt_value.strip()
1271 if opt_value == item:
1272 if 'selected' in option.attrib:
1273 del option.attrib['selected']
1274 else:
1275 raise ValueError(
1276 "The option %r is not currently selected" % item)
1277 break
1278 else:
1279 raise ValueError(
1280 "There is not option with the value %r" % item)
1281
1283 return '<%s {%s} for select name=%r>' % (
1284 self.__class__.__name__,
1285 ', '.join([repr(v) for v in self]),
1286 self.select.name)
1287
1289 """
1290 This object represents several ``<input type=radio>`` elements
1291 that have the same name.
1292
1293 You can use this like a list, but also use the property
1294 ``.value`` to check/uncheck inputs. Also you can use
1295 ``.value_options`` to get the possible values.
1296 """
1297
1299 """
1300 Get/set the value, which checks the radio with that value (and
1301 unchecks any other value).
1302 """
1303 for el in self:
1304 if 'checked' in el.attrib:
1305 return el.get('value')
1306 return None
1307
1309 if value is not None:
1310 for el in self:
1311 if el.get('value') == value:
1312 checked_option = el
1313 break
1314 else:
1315 raise ValueError(
1316 "There is no radio input with the value %r" % value)
1317 for el in self:
1318 if 'checked' in el.attrib:
1319 del el.attrib['checked']
1320 if value is not None:
1321 checked_option.set('checked', '')
1322
1325
1326 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1327
1329 """
1330 Returns a list of all the possible values.
1331 """
1332 return [el.get('value') for el in self]
1333 value_options = property(value_options, doc=value_options.__doc__)
1334
1336 return '%s(%s)' % (
1337 self.__class__.__name__,
1338 list.__repr__(self))
1339
1341 """
1342 Represents a group of checkboxes (``<input type=checkbox>``) that
1343 have the same name.
1344
1345 In addition to using this like a list, the ``.value`` attribute
1346 returns a set-like object that you can add to or remove from to
1347 check and uncheck checkboxes. You can also use ``.value_options``
1348 to get the possible values.
1349 """
1350
1352 """
1353 Return a set-like object that can be modified to check or
1354 uncheck individual checkboxes according to their value.
1355 """
1356 return CheckboxValues(self)
1366 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1367
1369 """
1370 Returns a list of all the possible values.
1371 """
1372 return [el.get('value') for el in self]
1373 value_options = property(value_options, doc=value_options.__doc__)
1374
1376 return '%s(%s)' % (
1377 self.__class__.__name__, list.__repr__(self))
1378
1380
1381 """
1382 Represents the values of the checked checkboxes in a group of
1383 checkboxes with the same name.
1384 """
1385
1388
1390 return iter([
1391 el.get('value')
1392 for el in self.group
1393 if 'checked' in el.attrib])
1394
def add(self, value):
    """
    Check the first checkbox of the group whose ``value`` attribute
    equals ``value``.  Raises KeyError if the group has no such
    checkbox.
    """
    for checkbox in self.group:
        if checkbox.get('value') == value:
            checkbox.set('checked', '')
            return
    raise KeyError("No checkbox with value %r" % value)
1402
1404 for el in self.group:
1405 if el.get('value') == value:
1406 if 'checked' in el.attrib:
1407 del el.attrib['checked']
1408 else:
1409 raise KeyError(
1410 "The checkbox with value %r was already unchecked" % value)
1411 break
1412 else:
1413 raise KeyError(
1414 "No checkbox with value %r" % value)
1415
1417 return '<%s {%s} for checkboxes name=%r>' % (
1418 self.__class__.__name__,
1419 ', '.join([repr(v) for v in self]),
1420 self.group.name)
1421
1505
1506 HtmlElementClassLookup._default_element_classes['input'] = InputElement
1507
1509 """
1510 Represents a ``<label>`` element.
1511
1512 Label elements are linked to other elements with their ``for``
1513 attribute. You can access this element with ``label.for_element``.
1514 """
1515
1517 """
1518 Get/set the element this label points to. Return None if it
1519 can't be found.
1520 """
1521 id = self.get('for')
1522 if not id:
1523 return None
1524 return self.body.get_element_by_id(id)
1526 id = other.get('id')
1527 if not id:
1528 raise TypeError(
1529 "Element %r has no id attribute" % other)
1530 self.set('for', id)
1534 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1535 doc=_for_element__get.__doc__)
1536
1537 HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1538
1539
1540
1541
1542
1556
1558 """Convert all tags in an XHTML tree to HTML by removing their
1559 XHTML namespace.
1560 """
1561 try:
1562 xhtml = xhtml.getroot()
1563 except AttributeError:
1564 pass
1565 prefix = "{%s}" % XHTML_NAMESPACE
1566 prefix_len = len(prefix)
1567 for el in xhtml.iter(prefix + "*"):
1568 el.tag = el.tag[prefix_len:]
1569
1570
1571
1572 __str_replace_meta_content_type = re.compile(
1573 r'<meta http-equiv="Content-Type"[^>]*>').sub
1574 __bytes_replace_meta_content_type = re.compile(
1575 r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
1576
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html", with_tail=True, doctype=None):
    """Return an HTML string representation of the document.

    Note: with ``method='html'``, any
    ``<meta http-equiv="Content-Type" ...>`` tag found in the
    serialised output is removed unless ``include_meta_content_type``
    is true.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).  Note that you can pass the name ``'unicode'`` as
    ``encoding`` argument to serialise to a Unicode string.

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.

    To leave out the tail text of the top-level element that is being
    serialised, pass ``with_tail=False``.

    The ``doctype`` option allows passing in a plain string that will
    be serialised before the XML tree.  Note that passing in non
    well-formed content here will make the XML output non well-formed.
    Also, an existing doctype in the document tree will not be removed
    when serialising an ElementTree instance.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding='unicode')
        u'Helloworld!'

        >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
        >>> html.tostring(root[0], method='text', encoding='unicode')
        u'Helloworld!TAIL'

        >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
        u'Helloworld!'

        >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
        >>> html.tostring(doc, method='html', encoding='unicode')
        u'<html><body><p>Hello<br>world!</p></body></html>'

        >>> print(html.tostring(doc, method='html', encoding='unicode',
        ...          doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
        ...                  ' "http://www.w3.org/TR/html4/strict.dtd">'))
        <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
        <html><body><p>Hello<br>world!</p></body></html>
    """
    serialised = etree.tostring(
        doc, method=method, pretty_print=pretty_print, encoding=encoding,
        with_tail=with_tail, doctype=doctype)
    # The meta tag is only stripped from 'html' output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Use the substitution that matches the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)
1649
1650 tostring.__doc__ = __fix_docstring(tostring.__doc__)
1651
1653 """
1654 Open the HTML document in a web browser, saving it to a temporary
1655 file to open it. Note that this does not delete the file after
1656 use. This is mainly meant for debugging.
1657 """
1658 import os
1659 import webbrowser
1660 import tempfile
1661 if not isinstance(doc, etree._ElementTree):
1662 doc = etree.ElementTree(doc)
1663 handle, fn = tempfile.mkstemp(suffix='.html')
1664 f = os.fdopen(handle, 'wb')
1665 try:
1666 doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
1667 finally:
1668
1669 f.close()
1670 url = 'file://' + fn.replace(os.path.sep, '/')
1671 print(url)
1672 webbrowser.open(url)
1673
1674
1675
1676
1677
1679 """An HTML parser that is configured to return lxml.html Element
1680 objects.
1681 """
1685
1687 """An XML parser that is configured to return lxml.html Element
1688 objects.
1689
1690 Note that this parser is not really XHTML aware unless you let it
1691 load a DTD that declares the HTML entities. To do this, make sure
1692 you have the XHTML DTDs installed in your catalogs, and create the
1693 parser like this::
1694
1695 >>> parser = XHTMLParser(load_dtd=True)
1696
1697 If you additionally want to validate the document, use this::
1698
1699 >>> parser = XHTMLParser(dtd_validation=True)
1700
1701 For catalog support, see http://www.xmlsoft.org/catalog.html.
1702 """
1706
1708 """Create a new HTML Element.
1709
1710 This can also be used for XHTML documents.
1711 """
1712 v = html_parser.makeelement(*args, **kw)
1713 return v
1714
1715 html_parser = HTMLParser()
1716 xhtml_parser = XHTMLParser()
1717