1
2
3 """
4 HTML parser test cases for etree
5 """
6
7 import unittest
8 import tempfile, os, os.path, sys
9
# Make sure the directory containing this file is on sys.path (inserted at
# the front when missing) -- presumably so that the `common_imports` module
# imported right below can be found; confirm against the test layout.
10 this_dir = os.path.dirname(__file__)
11 if this_dir not in sys.path:
12 sys.path.insert(0, this_dir)
13
14 from common_imports import etree, StringIO, BytesIO, fileInTestDir, _bytes, _str
15 from common_imports import SillyFileLike, HelperTestCase, write_to_file, next
16
# Compatibility alias: when the bare name `unicode` is not defined (the lookup
# raises NameError), bind it to `str` so later code can use a single name.
17 try:
18 unicode
19 except NameError:
20 unicode = str
21
# NOTE(review): the statement that opens the enclosing scope is not visible in
# this excerpt; the docstring and fixture values below presumably belong to
# the test case class, since later fragments read `self.etree`.
23 """HTML parser test cases
24 """
# Rebind the imported etree module under the same name; the fragments below
# reach it as `self.etree`.
25 etree = etree
26
# Well-formed sample document written on a single line.
27 html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
# The same markup spread over several lines.
28 html_str_pretty = _bytes("""\
29 <html>
30 <head><title>test</title></head>
31 <body><h1>page title</h1></body>
32 </html>
33 """)
# Malformed sample: <title> and <head> are never closed, <h1> is closed with
# </h3>, and </p> has no matching opening tag.
34 broken_html_str = _bytes("<html><head><title>test"
35 "<body><h1>page title</h3></p></html>")
# Sample containing a non-ASCII character; the value returned by _bytes() is
# decoded as UTF-8, so this fixture is decoded text rather than raw bytes.
36 uhtml_str = _bytes(
37 "<html><head><title>test á</title></head>"
38 "<body><h1>page á title</h1></body></html>").decode('utf8')
39
43
48
56
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# Bail out early when the interpreter cannot represent code points up to
# 0x10FFFF (1114111) as single characters.
58 if sys.maxunicode < 1114111:
59 return
# Parse a document whose <p> holds the single character U+26007. The escape
# is written with a doubled backslash so it stays literal text until the
# 'unicode_escape' decode turns it into the real character.
60 element = self.etree.HTML(_bytes(
61 '<html><body><p>\\U00026007</p></body></html>'
62 ).decode('unicode_escape'))
63 p_text = element.findtext('.//p')
# The text must come back as exactly one character, equal to U+26007.
64 self.assertEqual(1, len(p_text))
65 self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
66 p_text)
67
72
79
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# Elements are created through the HTML parser's own element factory.
81 parser = self.etree.HTMLParser()
82 Element = parser.makeelement
83
# A tag made only of an empty namespace part ('{}') must raise ValueError,
# both when creating an element and when assigning to an existing one's .tag.
84 el = Element('name')
85 self.assertRaises(ValueError, Element, '{}')
86 self.assertRaises(ValueError, setattr, el, 'tag', '{}')
87
# The same holds for a namespace part with no local name after it.
88 self.assertRaises(ValueError, Element, '{test}')
89 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
90
104
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# Elements are created through the HTML parser's own element factory.
106 parser = self.etree.HTMLParser()
107 Element = parser.makeelement
108
# Single or double quotes anywhere in a tag name (with or without a
# '{test}' namespace prefix) must be rejected with ValueError.
109 self.assertRaises(ValueError, Element, 'p"name')
110 self.assertRaises(ValueError, Element, "na'me")
111 self.assertRaises(ValueError, Element, '{test}"name')
112 self.assertRaises(ValueError, Element, "{test}name'")
113
# Assigning such a name to .tag must fail as well, and the failed
# assignments must leave the original tag untouched.
114 el = Element('name')
115 self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
116 self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
117 self.assertEqual(el.tag, "name")
118
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# Elements are created through the HTML parser's own element factory.
120 parser = self.etree.HTMLParser()
121 Element = parser.makeelement
122
# Leading, trailing or embedded spaces in a tag name must raise ValueError,
# also when the space follows a '{test}' namespace prefix.
123 self.assertRaises(ValueError, Element, ' name ')
124 self.assertRaises(ValueError, Element, 'na me')
125 self.assertRaises(ValueError, Element, '{test} name')
126
# A rejected assignment to .tag must leave the existing tag unchanged.
127 el = Element('name')
128 self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
129 self.assertEqual(el.tag, "name")
130
140
152
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# The parent comes from the HTML parser's element factory; children are
# requested through etree.SubElement.
154 parser = self.etree.HTMLParser()
155 Element = parser.makeelement
156 SubElement = self.etree.SubElement
157
# Quotes in the child's tag name (with or without a '{test}' namespace
# prefix) must make SubElement raise ValueError.
158 el = Element('name')
159 self.assertRaises(ValueError, SubElement, el, "name'")
160 self.assertRaises(ValueError, SubElement, el, 'na"me')
161 self.assertRaises(ValueError, SubElement, el, "{test}na'me")
162 self.assertRaises(ValueError, SubElement, el, '{test}"name')
163
173
180
190
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# Build a document whose <meta> header announces UTF-8 while the bytes are
# really encoded as iso-8859-1; the body text contains non-ASCII characters.
192 text = _str('Søk på nettet')
193 wrong_head = _str('''
194 <head>
195 <meta http-equiv="Content-Type"
196 content="text/html; charset=UTF-8" />
197 </head>''')
198 html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
199 text)
200 ).encode('iso-8859-1')
201
# Parsing the bytes without passing any parser is expected to raise ParseError.
202 self.assertRaises(self.etree.ParseError,
203 self.etree.parse,
204 BytesIO(html_latin1))
205
# With an HTMLParser told the real encoding, the paragraph text must come
# back equal to the original text.
206 tree = self.etree.parse(
207 BytesIO(html_latin1),
208 self.etree.HTMLParser(encoding="iso-8859-1"))
209 p = tree.find("//p")
210 self.assertEqual(p.text, text)
211
216
218
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# The content of <style> must be available as the .text of the first child
# of the first child of the parsed root element.
219 html = _bytes('<html><head><style>foo</style></head></html>')
220 element = self.etree.HTML(html)
221 self.assertEqual(element[0][0].text, "foo")
222
226
239
247
248
249
250
251
252
253
254
261
276
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# NOTE(review): BytesIO is the helper imported from common_imports and is
# given a text literal here -- confirm that helper accepts str input.
278 iterparse = self.etree.iterparse
279 f = BytesIO(
280 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
281
# In HTML mode the iterator exposes no root before iteration has started.
282 iterator = iterparse(f, html=True)
283 self.assertEqual(None, iterator.root)
284
# Exhaust the iterator; afterwards the root must be set.
285 events = list(iterator)
286 root = iterator.root
287 self.assertTrue(root is not None)
# With no `events` argument only 'end' events are expected, innermost
# elements first and the root element last.
288 self.assertEqual(
289 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
290 ('end', root[1]), ('end', root)],
291 events)
292
# NOTE(review): the `def` line of this test is not visible in this excerpt.
294 iterparse = self.etree.iterparse
295 f = BytesIO(
296 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
297
# In HTML mode the iterator exposes no root before iteration has started.
298 iterator = iterparse(f, html=True)
299 self.assertEqual(None, iterator.root)
300
# First event: end of <title>; the root is still unset at this point.
301 event, element = next(iterator)
302 self.assertEqual('end', event)
303 self.assertEqual('title', element.tag)
304 self.assertEqual(None, iterator.root)
305 del element
306
# Second event: end of <head>; the root is still unset.
307 event, element = next(iterator)
308 self.assertEqual('end', event)
309 self.assertEqual('head', element.tag)
310 self.assertEqual(None, iterator.root)
# Drop the element and the iterator without consuming the remaining events.
311 del element
312 del iterator
313
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# Input without <html>/<body> tags, with a stray '>' inside an unclosed
# <title> and a closing </div> that was never opened.
315 iterparse = self.etree.iterparse
316 f = BytesIO('<head><title>TEST></head><p>P<br></div>')
317
# In HTML mode the iterator exposes no root before iteration has started.
318 iterator = iterparse(f, html=True)
319 self.assertEqual(None, iterator.root)
320
321 events = list(iterator)
322 root = iterator.root
323 self.assertTrue(root is not None)
# The resulting tree is expected to be html > (head, body > p > br).
324 self.assertEqual('html', root.tag)
325 self.assertEqual('head', root[0].tag)
326 self.assertEqual('body', root[1].tag)
327 self.assertEqual('p', root[1][0].tag)
328 self.assertEqual('br', root[1][0][0].tag)
# Only 'end' events are expected, innermost elements first, the root last.
329 self.assertEqual(
330 [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]),
331 ('end', root[1][0]), ('end', root[1]), ('end', root)],
332 events)
333
339
# NOTE(review): the `def` line of this test is not visible in this excerpt.
# Iterate over the "shakespeare.html" fixture, located via fileInTestDir().
341 iterparse = self.etree.iterparse
342 iterator = iterparse(fileInTestDir("shakespeare.html"),
343 html=True)
344
# No root before iteration; a root once the iterator is exhausted.
345 self.assertEqual(None, iterator.root)
346 events = list(iterator)
347 root = iterator.root
348 self.assertTrue(root is not None)
# The fixture is expected to yield exactly 249 events, all of type 'end'.
349 self.assertEqual(249, len(events))
350 self.assertEqual(
351 [],
352 [ event for (event, element) in events if event != 'end' ])
353
# NOTE(review): the `def` line of this test is not visible in this excerpt.
355 iterparse = self.etree.iterparse
356 f = BytesIO(
357 '<html><head><title>TITLE</title><body><p>P</p></body></html>')
358
# Request only 'start' events; the root is still unset before iteration.
359 iterator = iterparse(f, html=True, events=('start',))
360 self.assertEqual(None, iterator.root)
361
362 events = list(iterator)
363 root = iterator.root
364 self.assertTrue(root is not None)
# 'start' events are expected with the root first and each parent before
# its children.
365 self.assertEqual(
366 [('start', root), ('start', root[0]), ('start', root[0][0]),
367 ('start', root[1]), ('start', root[1][0])],
368 events)
369
# NOTE(review): the beginning of this definition is not visible in this
# excerpt -- neither the creation of `events`, nor the `Target` class header,
# nor the handler that records the ("start", ...) entries asserted below.
# Record every closing tag reported to the target.
377 def end(self, tag):
378 events.append(("end", tag))
# Value that parser.close() is expected to hand back (asserted below).
379 def close(self):
380 return "DONE"
381
# Feed a minimal document to a parser that reports to a Target instance.
382 parser = self.etree.HTMLParser(target=Target())
383
384 parser.feed("<html><body></body></html>")
385 done = parser.close()
386
# The target's close() result must be passed through, and the recorded
# events must show properly nested start/end pairs for html and body.
387 self.assertEqual("DONE", done)
388 self.assertEqual([
389 ("start", "html"), ("start", "body"),
390 ("end", "body"), ("end", "html")], events)
391
# NOTE(review): the beginning of this definition is not visible in this
# excerpt -- neither the creation of `events`, nor the `Target` class header,
# nor the handler that records the ("start", ...) entries asserted below.
# Record every closing tag reported to the target.
399 def end(self, tag):
400 events.append(("end", tag))
# Record whatever arguments the parser passes for the document type.
401 def doctype(self, *args):
402 events.append(("doctype", args))
# Value that parser.close() is expected to hand back (asserted below).
403 def close(self):
404 return "DONE"
405
# Feed a document that starts with an empty <!DOCTYPE> declaration.
406 parser = self.etree.HTMLParser(target=Target())
407 parser.feed("<!DOCTYPE><html><body></body></html>")
408 done = parser.close()
409
# The empty declaration is expected to be reported as three None values,
# ahead of the element events.
410 self.assertEqual("DONE", done)
411 self.assertEqual([
412 ("doctype", (None, None, None)),
413 ("start", "html"), ("start", "body"),
414 ("end", "body"), ("end", "html")], events)
415
# NOTE(review): the beginning of this definition is not visible in this
# excerpt -- neither the creation of `events`, nor the `Target` class header,
# nor the handler that records the ("start", ...) entries asserted below.
# Record every closing tag reported to the target.
423 def end(self, tag):
424 events.append(("end", tag))
# Record whatever arguments the parser passes for the document type.
425 def doctype(self, *args):
426 events.append(("doctype", args))
# Value that parser.close() is expected to hand back (asserted below).
427 def close(self):
428 return "DONE"
429
# Feed a document that starts with the short <!DOCTYPE html> declaration.
430 parser = self.etree.HTMLParser(target=Target())
431 parser.feed("<!DOCTYPE html><html><body></body></html>")
432 done = parser.close()
433
# Only the first doctype argument is expected to be set ("html"); the other
# two are expected to be None.
434 self.assertEqual("DONE", done)
435 self.assertEqual([
436 ("doctype", ("html", None, None)),
437 ("start", "html"), ("start", "body"),
438 ("end", "body"), ("end", "html")], events)
439
# NOTE(review): the beginning of this definition is not visible in this
# excerpt -- neither the creation of `events`, nor the `Target` class header,
# nor the handler that records the ("start", ...) entries asserted below.
# Record every closing tag reported to the target.
447 def end(self, tag):
448 events.append(("end", tag))
# Record whatever arguments the parser passes for the document type.
449 def doctype(self, *args):
450 events.append(("doctype", args))
# Value that parser.close() is expected to hand back (asserted below).
451 def close(self):
452 return "DONE"
453
# Feed a document whose DOCTYPE carries both a PUBLIC identifier and a
# system identifier.
454 parser = self.etree.HTMLParser(target=Target())
455 parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
456 '<html><body></body></html>')
457 done = parser.close()
458
# The doctype arguments are expected in the order: name, public identifier,
# system identifier.
459 self.assertEqual("DONE", done)
460 self.assertEqual([
461 ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
462 ("start", "html"), ("start", "body"),
463 ("end", "body"), ("end", "html")], events)
464
465
# NOTE(review): the `def` line of this function is not visible in this excerpt.
# Build and return a TestSuite holding the tests of HtmlParserTestCase.
# NOTE(review): unittest.makeSuite() is deprecated since Python 3.11 and
# removed in 3.13; unittest.defaultTestLoader.loadTestsFromTestCase() is the
# documented replacement -- consider switching once the full function is in view.
467 suite = unittest.TestSuite()
468 suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
469 return suite
470
# Running this file directly does not execute any tests; it only prints a
# hint that names test.py together with this file's path.
471 if __name__ == '__main__':
472 print('to test use test.py %s' % __file__)
473