Source code for bag.html

"""Deal with HTML markup."""

import re
from html.entities import name2codepoint


def encode_xml_char_refs(s):
    """Encode *s* as ASCII bytes, escaping everything else numerically.

    Characters that cannot be represented in ASCII are written as XML
    character references (for example ``&#233;``) by the
    ``xmlcharrefreplace`` codec error handler.
    """
    # Technique taken from:
    # http://mail.python.org/pipermail/python-list/2007-January/424262.html
    ascii_bytes = s.encode(encoding="ascii", errors="xmlcharrefreplace")
    return ascii_bytes
# Matches ``&name;``, ``&#123;`` and ``&#x7B;``.
# Groups: (1) "#" when the reference is numeric, (2) an optional leading
# "x" or "X" (the pattern is case-insensitive), (3) the remaining word
# characters.  Note that group 2 also captures the first letter of *named*
# entities that happen to start with "x", such as ``&xi;``.
entity_re = re.compile(r"&(#?)(x?)(\w+);", flags=re.IGNORECASE)


def _substitute_entity(match):
    """Return the replacement text for a single ``entity_re`` match.

    Numeric references (decimal, or hexadecimal with an ``x``/``X`` marker)
    are turned into the corresponding character.  Named references are
    looked up in ``html.entities.name2codepoint``.  Anything that cannot be
    decoded -- an unknown name, a malformed number or a code point outside
    the range ``chr()`` accepts -- is returned unchanged.
    """
    hash_mark, hex_mark, body = match.groups()

    if hash_mark:
        try:
            if hex_mark:
                # The explicit "0x" prefix makes int() reject a body that
                # carries a second prefix of its own (e.g. "&#x0x41;").
                return chr(int("0x" + body, 16))
            return chr(int(body))
        except (ValueError, OverflowError):
            # Not a valid number / not a valid code point: leave the
            # original text alone instead of aborting the whole decode.
            return match.group()

    # For named entities the regex may have split a leading "x"/"X" into
    # group 2; glue it back so names such as "xi" and "Xi" are found.
    code_point = name2codepoint.get(hex_mark + body)
    if code_point is None:
        return match.group()
    return chr(code_point)


def decode_entities(txt):
    """Return *txt* with HTML character and entity references decoded.

    References that cannot be decoded are left in the text as they are.
    """
    return entity_re.sub(_substitute_entity, txt)
# Any common spelling of the line-break tag: <br>, <br/>, <br />, <BR>, ...
_BR_TAG_RE = re.compile(r"<br\s*/?>", flags=re.IGNORECASE)
# A run of two or more spaces, with no upper bound on its length.
_SPACE_RUN_RE = re.compile(r" {2,}")


def html_to_unicode(html):
    """Convert a fragment of simple HTML into plain text.

    * ``<br>`` tags (in any of the spellings matched by ``_BR_TAG_RE``)
      become newlines.
    * Entity references are decoded with ``decode_entities``.
    * Carriage returns, newlines and tabs become spaces, and runs of
      spaces are collapsed to a single space.
    * Leading and trailing whitespace is stripped from the result.

    Line-break tags are located in the *raw* markup, before entities are
    decoded, so escaped text such as ``&lt;br&gt;`` stays literal text
    instead of being mistaken for a tag.
    """
    pieces = []
    for fragment in _BR_TAG_RE.split(html):
        text = decode_entities(fragment)
        # "\r\n" is handled first so that it yields one space, not two.
        for whitespace in ("\r\n", "\r", "\n", "\t"):
            text = text.replace(whitespace, " ")
        pieces.append(text)

    joined = "\n".join(pieces).strip()
    return _SPACE_RUN_RE.sub(" ", joined)