Source code for bag.text

"""Functions to manipulate strings."""

import codecs
import random
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Callable, Generator, List, Optional, Tuple  # noqa


def parse_iso_date(txt: str) -> datetime:
    """Parse a datetime in ISO format, ignoring anything after the seconds.

    Only the first 19 characters (``YYYY-MM-DD HH:MM:SS``) are considered,
    so fractional seconds and timezone suffixes are discarded. The date and
    time may be separated by either a space or the ISO 8601 ``T``.

    Raises ``ValueError`` if the text does not match that layout.
    """
    # Normalise the ISO 8601 "T" separator so both spellings are accepted.
    head = txt[:19].replace("T", " ")
    return datetime.strptime(head, "%Y-%m-%d %H:%M:%S")


def shorten(txt: str, length: int = 10, ellipsis: str = "…") -> str:
    """Truncate ``txt``, adding ``ellipsis`` to end, with total ``length``.

    ``txt`` is returned unchanged when it already fits. Otherwise the
    result is at most ``length`` characters long, ``ellipsis`` included.
    When ``length`` leaves no room for any character of ``txt``, the
    result is the ellipsis itself, cut down to ``length`` characters.
    """
    if len(txt) <= length:
        return txt
    keep = length - len(ellipsis)
    if keep <= 0:
        # A negative slice end would count from the right and return a
        # string longer than ``length``; clamp instead.
        return ellipsis[: max(length, 0)]
    return txt[:keep] + ellipsis


def shorten_proper(
    name: str, length: int = 11, ellipsis: str = "…", min: Optional[int] = None
) -> str:
    """Shorten a proper name for displaying.

    Keeps as many leading whole words of ``name`` (split on single spaces)
    as fit within ``length`` characters. If that result is empty or shorter
    than ``min`` characters, falls back to ``shorten()``, which cuts the
    name mid-word and appends ``ellipsis``.

    ``min`` defaults to half of ``length``. An explicit ``min=0`` is
    honoured, meaning any non-empty run of whole words is acceptable.
    """
    # ``min`` shadows the builtin, but the name is part of the public
    # signature, so it is kept. Test against None so that 0 is not replaced.
    if min is None:
        min = int(length / 2.0)
    kept = []  # type: List[str]
    used = -1  # starts at -1 because the first word has no leading space
    for word in name.split(" "):
        used += len(word) + 1
        if used > length:
            break
        kept.append(word)
    short = " ".join(kept)
    if short and len(short) >= min:
        return short
    return shorten(name, length=length, ellipsis=ellipsis)


def uncommafy(txt: str, sep: str = ",") -> Generator[str, None, None]:
    """Lazily yield the non-blank fields of a delimited string.

    ``txt`` is split on ``sep`` (a comma by default); each piece has its
    surrounding whitespace removed, and pieces that end up empty are
    skipped.
    """
    trimmed = (piece.strip() for piece in txt.split(sep))
    yield from (piece for piece in trimmed if piece)


def random_string(
    length: int,
    chars: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
) -> str:
    """Build a string of ``length`` characters drawn at random from ``chars``.

    Each character is picked independently with ``random.choice``, so this
    is not suitable for security-sensitive tokens.
    """
    picks = [random.choice(chars) for _ in range(length)]
    return "".join(picks)


# Replacement tables used by simplify_chars(). Each entry is a pair
# ``(replacement, characters)``: every character of the second string is
# replaced with the first string. Entries are applied in order.
latin1_map = (
    ('"', "“”"),
    ("-", "\u2013\u2014\u2022"),
    ("'", "\u2018\u2019"),
    # An empty replacement means these characters are simply deleted.
    ("", "\ufffd\u2122\u2020"),
    ("...", "\u2026"),
    ("i", "\u012b"),
    ("ã", "\u0101"),
    ("r", "\u0159"),
    ("Z", "\u017d"),
    ("z", "\u017e"),
    ("EUR", "\u20ac"),
)  # chars that ISO-8859-1 does not support

# Table for plain ASCII output: accented letters and ligatures first, then
# everything in latin1_map. Because the ASCII entries come first, "\u0101"
# becomes "a" here before the latin1 entry mapping it to "ã" is reached.
ascii_map = (
    ("a", "áàâãäå\u0101"),
    ("e", "éèêẽë"),
    ("i", "íìîĩï"),
    ("o", "óòôõöø"),
    ("u", "úùûũü"),
    ("A", "ÁÀÂÃÄÅ"),
    ("E", "ÉÈÊẼË"),
    ("I", "ÍÌÎĨÏ"),
    ("O", "ÓÒÔÕÖØ"),
    ("U", "ÚÙÛŨÜ"),
    ("n", "ñ"),
    ("c", "ç"),
    ("N", "Ñ"),
    ("C", "Ç"),
    ("d", "Þ"),
    ("ss", "ß"),
    ("ae", "æ"),
    ("oe", "œ"),
) + latin1_map


def simplify_chars(txt, encoding="ascii", byts=False, amap=None):
    """Replace characters of ``txt`` that ``encoding`` cannot represent.

    A map is used to "simplify" some characters (for instance "é" to "e")
    instead of just removing them. ``amap`` is a sequence of
    ``(replacement, characters)`` pairs; when it is not given, a built-in
    map is chosen for ASCII or ISO-8859-1 (latin1), and no replacement is
    done for any other encoding.

    If ``byts`` is true, return a bytestring encoded with ``encoding``;
    characters that still cannot be encoded are dropped at that point.
    If ``byts`` is false, the result is a ``str`` and characters that have
    no entry in the map are left in place.
    """
    if not amap:
        # Accept the usual spellings: "ASCII", "latin-1", "ISO-8859-1", ...
        codec = encoding.lower().replace("-", "").replace("_", "")
        if codec in ("ascii", "usascii"):
            amap = ascii_map
        elif codec in ("latin1", "iso88591"):
            amap = latin1_map
        else:
            # No table for this encoding; without this fallback the loop
            # below would try to iterate over None and raise TypeError.
            amap = ()
    for plain, funny in amap:
        for f in funny:
            txt = txt.replace(f, plain)
    return txt.encode(encoding, "ignore") if byts else txt


def to_filename(
    txt: str,
    for_web: bool = False,
    badchars: str = "",
    maxlength: int = 0,
    encoding: str = "latin1",
) -> str:
    """Massage ``txt`` until it is a good filename.

    Steps, in order:

    - characters are simplified for ``encoding`` with ``simplify_chars()``
      and surrounding whitespace is trimmed;
    - a fixed set of problematic characters, plus any in ``badchars``,
      is removed;
    - if ``maxlength`` is non-zero, the text is cut to that many
      characters and trimmed again;
    - if ``for_web`` is true, spaces become dashes and every run of
      consecutive dashes is collapsed into a single dash.
    """
    txt = simplify_chars(txt, encoding=encoding).strip()
    illegal = "\\/\t:?'\"<>|#$%&*[]•" + badchars
    for c in illegal:
        txt = txt.replace(c, "")
    if maxlength:
        txt = txt[:maxlength].strip()
    if for_web:
        # A regex collapses runs of any length; chained replace("--", "-")
        # calls leave "--" behind for runs of five or more dashes.
        txt = re.sub(r"-{2,}", "-", txt.replace(" ", "-"))
    return txt


def slugify(
    txt: str,
    exists: Callable[[str], bool] = lambda x: False,
    badchars: str = "",
    maxlength: int = 16,
    chars: str = "abcdefghijklmnopqrstuvwxyz23456789",
    min_suffix_length: int = 1,
    max_suffix_length: int = 4,
) -> str:
    """Derive from ``txt`` a slug for which ``exists`` returns false.

    ``exists`` is a callback that receives a candidate slug and tells
    whether it is already taken (typically by looking in a database).

    The base slug comes from ``to_filename()`` in web mode, truncated so
    that a dash plus ``max_suffix_length`` characters still fit within
    ``maxlength``. While the candidate is taken, a random suffix made of
    ``chars`` is appended to the base. The suffix starts with
    ``min_suffix_length`` characters and gets one character longer on each
    attempt until it reaches ``max_suffix_length``, which keeps the number
    of attempts low.
    """
    room_for_base = maxlength - max_suffix_length - 1
    base = to_filename(
        txt, for_web=True, badchars=badchars, maxlength=room_for_base
    )
    candidate = base
    suffix_length = min_suffix_length
    while exists(candidate):
        candidate = "-".join((base, random_string(suffix_length, chars=chars)))
        if suffix_length != max_suffix_length:
            suffix_length += 1
    return candidate


def break_lines_near(
    text: str,
    length: int,
    leeway: int = 4,
    whitespace: str = " \r\n\t",
    end_line_break: str = "…",
    start_line_break: str = "…",
) -> List[str]:
    """Return a list of *text* broken in lines of max *length*.

    - ``leeway``: how far to search for whitespace
    - ``whitespace``: characters considered whitespace
    - ``end_line_break``: character to add to the end of broken words
    - ``start_line_break``: character to add to the start of broken words

    For each line, a whitespace character is looked for at index
    ``length`` and up to ``leeway`` positions before it; the line is broken
    there and that whitespace character is dropped. If none is found, the
    word is cut so the line, with ``end_line_break`` appended, is exactly
    ``length`` long, and the next line starts with ``start_line_break``.

    An empty ``text`` gives an empty list. Raises ``ValueError`` when a
    word must be cut but ``length`` is too small to fit the break markers
    plus at least one character of text.
    """
    if not text:
        return []
    # Kept as asserts so callers still see AssertionError for bad arguments.
    assert leeway >= 0
    assert length > 0
    result = []  # type: List[str]
    remaining = text
    # Never search left of index 0: negative indices would wrap around to
    # the END of the string and could produce over-long lines or loop forever.
    lowest = max(length - leeway, 0)
    while remaining:
        if len(remaining) <= length:
            result.append(remaining)
            break
        for pos in range(length, lowest - 1, -1):
            if remaining[pos] in whitespace:
                result.append(remaining[:pos])
                remaining = remaining[pos + 1 :]
                break
        else:
            # No whitespace nearby: cut inside the word.
            cut = length - len(end_line_break)
            if cut <= len(start_line_break):
                # The rest would not get any shorter, so the loop would
                # never finish.
                raise ValueError(
                    "length is too small to break a word using the "
                    "given line break markers"
                )
            result.append(remaining[:cut] + end_line_break)
            remaining = start_line_break + remaining[cut:]
    return result


def find_new_title(dir: str, filename: str) -> str:
    """Return a path inside ``dir`` that does not exist yet.

    Starts with ``filename`` itself. While the candidate path exists, the
    file title (the name without its extension) gets a counter: a title
    with no counter has "(001)" appended, and a title that already ends in
    a parenthesised number of up to five digits has that number
    incremented and written with at least three digits.

    For instance, if "Image (01).jpg" exists in "somedir" and
    "Image (002).jpg" does not, returns "somedir/Image (002).jpg".
    """
    counter_rx = re.compile(r"\((\d{1,5})\)$")
    candidate = os.path.join(dir, filename)
    while os.path.exists(candidate):
        title, extension = os.path.splitext(os.path.basename(candidate))
        found = counter_rx.search(title)
        if found is None:
            counter = "(001)"
        else:
            counter = "(%03d)" % (int(found.group(1)) + 1)
            # Drop the old counter, opening parenthesis included.
            title = title[: found.start()]
        candidate = os.path.join(dir, title + counter + extension)
    return candidate


def keep_digits(txt: str) -> str:
    """Return ``txt`` with every character that is not a digit removed."""
    return "".join(char for char in txt if char.isdigit())


def resist_bad_encoding(txt, possible_encodings=("utf8", "iso-8859-1")):
    """Decode bytes whose encoding is unknown, without ever erroring out.

    Use this to try to avoid errors from text whose encoding is unknown,
    when erroring out would be worse than possibly displaying garbage.

    Anything that is not a bytestring (text that is already ``str``,
    ``None``, ...) is returned unchanged. For bytes, each of
    ``possible_encodings`` is tried in order and the first one that decodes
    the whole input cleanly wins. If none does, the undecodable bytes are
    ignored and the longest result is returned.

    Maybe we should use the chardet library instead...
    """
    if not isinstance(txt, (bytes, bytearray)):
        # Already text (or not text at all): there is nothing to decode.
        return txt
    # A clean, strict decode is the best evidence available. Comparing the
    # lengths of lossy decodes alone would always prefer a single-byte
    # encoding over valid multi-byte UTF-8 and return mojibake.
    for enc in possible_encodings:
        try:
            return txt.decode(enc)
        except UnicodeDecodeError:
            continue
    best = ""
    for enc in possible_encodings:
        attempt = txt.decode(enc, "ignore")
        if len(attempt) > len(best):
            best = attempt
    return best


def capitalize(txt: str) -> str:
    """Trim ``txt`` and upper-case its first character only.

    The remaining characters are left exactly as they are. ``None`` is
    returned unchanged, and so is any non-string whose ``repr`` is
    "<colander.null>"; other values are converted with ``str()`` first.

    This function can be used as a colander preparer.
    """
    if txt is None:
        return txt
    if not isinstance(txt, str) and repr(txt) == "<colander.null>":
        return txt
    trimmed = str(txt).strip()
    # Slicing keeps this safe for the empty string.
    return trimmed[:1].upper() + trimmed[1:]


def strip_preparer(value):
    """Colander preparer: trim whitespace around *value* if it is a string.

    Values of any other type are passed through untouched.
    """
    return value.strip() if isinstance(value, str) else value


def strip_lower_preparer(value):
    """Colander preparer: trim and lower-case *value* if it is a string.

    Values of any other type are passed through untouched.
    """
    if not isinstance(value, str):
        return value
    trimmed = value.strip()
    return trimmed.lower()


def content_of(paths, encoding="utf-8", sep="\n"):
    """Read the file(s) in ``paths`` and return their text joined by ``sep``.

    ``paths`` may be a single ``str`` or ``pathlib.Path``, or an iterable
    of paths, which makes it easy to read one or many files. Every file is
    decoded with ``encoding``.
    """

    def _slurp(path):
        with codecs.open(path, encoding=encoding) as stream:
            return stream.read()

    # Wrap a lone path so that a single file and many files share one code path.
    if isinstance(paths, Path):
        paths = [str(paths)]
    elif isinstance(paths, str):
        paths = [paths]
    return sep.join([_slurp(path) for path in paths])


def pluralize(singular: Optional[str]) -> str:
    """Return the English plural of a lowercase singular word.

    Irregular words are looked up in a table; everything else follows a few
    suffix rules. An empty or ``None`` argument gives an empty string.

    Based on ActiveState recipe http://code.activestate.com/recipes/413172/

    >>> pluralize('')
    ''
    >>> pluralize('goose')
    'geese'
    >>> pluralize('dolly')
    'dollies'
    >>> pluralize('genius')
    'genii'
    >>> pluralize('jones')
    'joneses'
    >>> pluralize('pass')
    'passes'
    >>> pluralize('zero')
    'zeros'
    >>> pluralize('casino')
    'casinos'
    >>> pluralize('hero')
    'heroes'
    >>> pluralize('church')
    'churches'
    >>> pluralize('x')
    'xs'
    >>> pluralize('car')
    'cars'

    """
    irregular_plurals = {
        "appendix": "appendices",
        "barracks": "barracks",
        "cactus": "cacti",
        "child": "children",
        "criterion": "criteria",
        "deer": "deer",
        "echo": "echoes",
        "elf": "elves",
        "embargo": "embargoes",
        "focus": "foci",
        "fungus": "fungi",
        "goose": "geese",
        "hero": "heroes",
        "hoof": "hooves",
        "index": "indices",
        "knife": "knives",
        "leaf": "leaves",
        "life": "lives",
        "man": "men",
        "mouse": "mice",
        "nucleus": "nuclei",
        "person": "people",
        "phenomenon": "phenomena",
        "potato": "potatoes",
        "self": "selves",
        "syllabus": "syllabi",
        "tomato": "tomatoes",
        "torpedo": "torpedoes",
        "veto": "vetoes",
        "woman": "women",
    }
    vowels = frozenset("aeiou")

    if not singular:
        return ""
    known = irregular_plurals.get(singular)
    if known:
        return known
    if len(singular) == 1:
        # One-letter words have no second-to-last letter to inspect.
        return singular + "s"

    last, before_last = singular[-1], singular[-2]
    if last == "y" and before_last not in vowels:
        return singular[:-1] + "ies"
    if last == "s":
        if before_last not in vowels:
            return singular + "es"
        if singular.endswith("ius"):
            return singular[:-2] + "i"
        return singular[:-1] + "ses"
    if singular.endswith(("ch", "sh")):
        return singular + "es"
    return singular + "s"