aboutsummaryrefslogtreecommitdiffstats
path: root/lib/bs4/testing.py
diff options
context:
space:
mode:
Diffstat (limited to 'lib/bs4/testing.py')
-rw-r--r--lib/bs4/testing.py129
1 files changed, 112 insertions, 17 deletions
diff --git a/lib/bs4/testing.py b/lib/bs4/testing.py
index fd4495ac5..3a2f260e2 100644
--- a/lib/bs4/testing.py
+++ b/lib/bs4/testing.py
@@ -1,5 +1,8 @@
"""Helper classes for tests."""
+__license__ = "MIT"
+
+import pickle
import copy
import functools
import unittest
@@ -43,6 +46,16 @@ class SoupTest(unittest.TestCase):
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
+ def assertConnectedness(self, element):
+ """Ensure that next_element and previous_element are properly
+ set for all descendants of the given element.
+ """
+ earlier = None
+ for e in element.descendants:
+ if earlier:
+ self.assertEqual(e, earlier.next_element)
+ self.assertEqual(earlier, e.previous_element)
+ earlier = e
class HTMLTreeBuilderSmokeTest(object):
@@ -54,6 +67,15 @@ class HTMLTreeBuilderSmokeTest(object):
markup in these tests, there's not much room for interpretation.
"""
+ def test_pickle_and_unpickle_identity(self):
+ # Pickling a tree, then unpickling it, yields a tree identical
+ # to the original.
+ tree = self.soup("<a><b>foo</a>")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertEqual(loaded.__class__, BeautifulSoup)
+ self.assertEqual(loaded.decode(), tree.decode())
+
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -114,6 +136,11 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
+ def test_processing_instruction(self):
+ markup = b"""<?PITarget PIContent?>"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.encode("utf8"))
+
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
@@ -155,6 +182,23 @@ class HTMLTreeBuilderSmokeTest(object):
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
+ def test_double_head(self):
+ html = '''<!DOCTYPE html>
+<html>
+<head>
+<title>Ordinary HEAD element test</title>
+</head>
+<script type="text/javascript">
+alert("Help!");
+</script>
+<body>
+Hello, world!
+</body>
+</html>
+'''
+ soup = self.soup(html)
+ self.assertEqual("text/javascript", soup.find('script')['type'])
+
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
@@ -221,18 +265,26 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
+ def test_multivalued_attribute_on_html(self):
+ # html5lib uses a different API to set the attributes ot the
+ # <html> tag. This has caused problems with multivalued
+ # attributes.
+ markup = '<html class="a b"></html>'
+ soup = self.soup(markup)
+ self.assertEqual(["a", "b"], soup.html['class'])
+
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
- expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+ expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
- expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+ expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@@ -243,7 +295,7 @@ class HTMLTreeBuilderSmokeTest(object):
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
- expect = u"\N{REPLACEMENT CHARACTER}"
+ expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
@@ -253,6 +305,35 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
+ self.assertConnectedness(soup)
+
+ def test_head_tag_between_head_and_body(self):
+ "Prevent recurrence of a bug in the html5lib treebuilder."
+ content = """<html><head></head>
+ <link></link>
+ <body>foo</body>
+</html>
+"""
+ soup = self.soup(content)
+ self.assertNotEqual(None, soup.html.body)
+ self.assertConnectedness(soup)
+
+ def test_multiple_copies_of_a_tag(self):
+ "Prevent recurrence of a bug in the html5lib treebuilder."
+ content = """<!DOCTYPE html>
+<html>
+ <body>
+ <article id="a" >
+ <div><a href="1"></div>
+ <footer>
+ <a href="2"></a>
+ </footer>
+ </article>
+ </body>
+</html>
+"""
+ soup = self.soup(content)
+ self.assertConnectedness(soup.article)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
@@ -285,9 +366,9 @@ class HTMLTreeBuilderSmokeTest(object):
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
- markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+ markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup)
- self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+ self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
@@ -327,7 +408,7 @@ class HTMLTreeBuilderSmokeTest(object):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
- expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
@@ -337,15 +418,15 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
- u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+ "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
- self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+ self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
- expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+ expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
@@ -354,7 +435,7 @@ class HTMLTreeBuilderSmokeTest(object):
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
- unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+ unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
@@ -463,11 +544,25 @@ class HTMLTreeBuilderSmokeTest(object):
class XMLTreeBuilderSmokeTest(object):
+ def test_pickle_and_unpickle_identity(self):
+ # Pickling a tree, then unpickling it, yields a tree identical
+ # to the original.
+ tree = self.soup("<a><b>foo</a>")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertEqual(loaded.__class__, BeautifulSoup)
+ self.assertEqual(loaded.decode(), tree.decode())
+
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
+ def test_xml_declaration(self):
+ markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.encode("utf8"))
+
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
@@ -485,7 +580,7 @@ class XMLTreeBuilderSmokeTest(object):
<script type="text/javascript">
</script>
"""
- soup = BeautifulSoup(doc, "xml")
+ soup = BeautifulSoup(doc, "lxml-xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
@@ -493,15 +588,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self):
- markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+ markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup)
- self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+ self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
- unicode(soup.rss), markup)
+ str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
@@ -532,17 +627,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.p), markup)
+ self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(str(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""