# Source code for bag.html

"""Deal with HTML markup."""

import re
from html.entities import name2codepoint


def encode_xml_char_refs(s):
    """Encode *s* to ASCII, escaping every non-ASCII character.

    Characters that cannot be represented in ASCII are replaced by decimal
    XML character references such as ``&#233;``.

    :param s: the text to encode.
    :returns: a ``bytes`` object containing only ASCII characters.
    """
    # http://mail.python.org/pipermail/python-list/2007-January/424262.html
    return s.encode("ascii", "xmlcharrefreplace")


# Matches "&name;", "&#123;" and "&#x7B;" style references.  Group 1 is the
# "#" of a numeric reference, group 2 an optional "x"/"X", group 3 the rest.
entity_re = re.compile(r"&(#?)(x?)(\w+);", flags=re.IGNORECASE)


def _substitute_entity(match):
    """Return the replacement text for a single ``entity_re`` match.

    Numeric references (decimal or hexadecimal) become the character with
    that code point; named references are looked up in
    ``html.entities.name2codepoint``.  Anything that cannot be decoded
    (unknown name, malformed number, code point out of range) is returned
    unchanged so that one bad reference never aborts the whole substitution.
    """
    hex_marker = match.group(2)
    ent = match.group(3)
    if match.group(1) == "#":  # decode by number
        try:
            # The pattern is case-insensitive, so the marker may be "x" or "X".
            return chr(int(ent, 16 if hex_marker else 10))
        except (ValueError, OverflowError):
            # Not a number in that base, or not a valid code point.
            return match.group()
    # For a named entity the optional "x" group may have swallowed the first
    # letter of the name (e.g. "&xi;"), so put it back before the lookup.
    cp = name2codepoint.get(hex_marker + ent)  # decode by name
    if cp is None:
        return match.group()
    return chr(cp)


def decode_entities(txt):
    """Replace the character references found in *txt* by their characters.

    :param txt: text possibly containing ``&name;``, ``&#NNN;`` or
        ``&#xHHH;`` references.
    :returns: a new string with every decodable reference replaced;
        references that cannot be decoded are left as they were.
    """
    return entity_re.sub(_substitute_entity, txt)


# Any spelling of the line-break tag: "<br>", "<br/>", "<br />", "<BR>", ...
_BR_TAG_RE = re.compile(r"<br\s*/?>", flags=re.IGNORECASE)
# Two or more consecutive spaces, with no upper bound on the run length.
_SPACE_RUN_RE = re.compile(r" {2,}")


def html_to_unicode(html):
    """Convert a fragment of simple HTML to plain text.

    The steps, in order:

    1. character references are decoded with ``decode_entities``;
    2. carriage returns, newlines and tabs are turned into spaces;
    3. ``<br>`` tags (any spelling) are turned into newlines;
    4. leading and trailing whitespace is stripped;
    5. runs of spaces are collapsed into a single space.

    Other tags are left untouched.

    :param html: the markup to convert.
    :returns: the resulting text.
    """
    text = decode_entities(html)
    # Source line breaks carry no meaning in HTML, so flatten them before
    # the <br> tags introduce the line breaks that do.
    for whitespace in ("\r\n", "\r", "\n", "\t"):
        text = text.replace(whitespace, " ")
    text = _BR_TAG_RE.sub("\n", text).strip()
    return _SPACE_RUN_RE.sub(" ", text)