Source code for bag.html
"""Deal with HTML markup."""
import re
from html.entities import name2codepoint
def encode_xml_char_refs(s):
    """Encode *s* to ASCII, escaping every non-ASCII character.

    Characters that cannot be represented in ASCII are replaced with XML
    numeric character references (for example ``&#233;``).  The result is a
    ``bytes`` object, as returned by ``str.encode``.
    """
    # Technique taken from:
    # http://mail.python.org/pipermail/python-list/2007-January/424262.html
    return s.encode(encoding="ascii", errors="xmlcharrefreplace")
# Matches "&name;", "&#123;" and "&#x1F;" style references.
#   group 1: "#" for a numeric reference, "" for a named one
#   group 2: an optional leading "x"/"X" (the hex marker for numeric
#            references; for named entities it is simply the first letter)
#   group 3: the remaining word characters
entity_re = re.compile(r"&(#?)(x?)(\w+);", flags=re.IGNORECASE)


def _substitute_entity(match):
    """Return the replacement text for a single ``entity_re`` match.

    Numeric references are decoded in base 10, or base 16 when prefixed with
    ``x``/``X``.  Named references are looked up in ``name2codepoint``.
    Anything that cannot be decoded (unknown name, digits that do not parse,
    code point outside the Unicode range) is returned unchanged.
    """
    marker, x_prefix, rest = match.groups()

    if marker == "#":
        # The pattern is case-insensitive, so the hex marker may be "x" or "X".
        radix = 16 if x_prefix else 10
        try:
            return chr(int(rest, radix))
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference: leave the text as it was
            # instead of aborting the whole substitution.
            return match.group()

    # For a named entity, group 2 may have swallowed a leading "x"/"X"
    # (e.g. "&xi;"), so the full name is groups 2 and 3 joined together.
    codepoint = name2codepoint.get(x_prefix + rest)
    if codepoint is None:
        return match.group()
    return chr(codepoint)


def decode_entities(txt):
    """Return *txt* with HTML character and entity references decoded.

    References that cannot be decoded are left in place verbatim.
    """
    return entity_re.sub(_substitute_entity, txt)
# A line-break tag in any common spelling: <br>, <br/>, <br />, <BR>.
_BR_TAG_RE = re.compile(r"<br\s*/?>", flags=re.IGNORECASE)

# Runs of spaces, carriage returns, newlines and tabs; each run becomes a
# single space.  No upper bound, so arbitrarily long runs collapse fully.
_RAW_WHITESPACE_RE = re.compile(r"[ \r\n\t]+")


def html_to_unicode(html):
    """Convert a fragment of simple HTML into plain text.

    * ``<br>`` tags (any case, with or without a self-closing slash) become
      newline characters.
    * Character and entity references are decoded with ``decode_entities``.
    * Runs of spaces, tabs, carriage returns and newlines collapse into a
      single space.
    * Leading and trailing whitespace is stripped from the result.

    The text is split on ``<br>`` tags *before* entities are decoded, so an
    escaped ``&lt;br&gt;`` stays as the literal text ``<br>`` instead of
    being mistaken for a line break.
    """
    lines = [
        _RAW_WHITESPACE_RE.sub(" ", decode_entities(segment))
        for segment in _BR_TAG_RE.split(html)
    ]
    return "\n".join(lines).strip()