diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2014-12-19 08:14:13 -0800 | 
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2014-12-19 08:14:36 -0800 | 
| commit | b28c97c9b8af266d4f12deb5febcf28807d9f5c6 (patch) | |
| tree | 264557669682ec42dc1d8c48fe9de65e57ace733 | |
| parent | b5f809582e073a3b4cb31a167e03f18145a04249 (diff) | |
Added a few more doctests for HTML normalization (#245).
| -rw-r--r-- | test/normalize.py | 22 | 
1 file changed, 21 insertions, 1 deletion
| diff --git a/test/normalize.py b/test/normalize.py index 4b922e6..5b4803b 100644 --- a/test/normalize.py +++ b/test/normalize.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*-  from HTMLParser import HTMLParser, HTMLParseError  from htmlentitydefs import name2codepoint  import sys @@ -118,14 +119,33 @@ def normalize_html(html):      Multiple inner whitespaces are collapsed to a single space (except      in pre tags): +        >>> normalize_html("<p>a  \t b</p>") +        u'<p>a b</p>' +          >>> normalize_html("<p>a  \t\nb</p>")          u'<p>a b</p>'      * Outer whitespace (outside block-level tags) is removed. + +        >>> normalize_html("<p>a  b</p>  ") +        u'<p>a b</p>' +      * Self-closing tags are converted to open tags. + +        >>> normalize_html("<br />") +        u'<br>' +      * Attributes are sorted and lowercased. + +        >>> normalize_html('<a title="bar" HREF="foo">x</a>') +        u'<a href="foo" title="bar">x</a>' +      * References are converted to unicode, except that '<', '>', '&', and -      '&' are rendered using entities. +      '"' are rendered using entities. + +        >>> normalize_html("&forall;&amp;&gt;&lt;&quot;") +        u'\u2200&amp;&gt;&lt;&quot;' +      """      html_chunk_re = re.compile("(\<!\[CDATA\[.*?\]\]\>|\<[^>]*\>|[^<]+)")      try: | 
