"""Functions to manipulate strings."""
import codecs
import random
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Callable, Generator, List, Optional, Tuple # noqa
def parse_iso_date(txt: str) -> datetime:
    """Parse a datetime in ISO format (``YYYY-MM-DD HH:MM:SS``)."""
    head = txt[:19]  # drop any fractional seconds / timezone suffix
    return datetime.strptime(head, "%Y-%m-%d %H:%M:%S")
def shorten(txt: str, length: int = 10, ellipsis: str = "…") -> str:
    """Truncate ``txt``, adding ``ellipsis`` to end, with total ``length``."""
    if len(txt) <= length:
        return txt
    # Reserve room for the ellipsis so the result is exactly ``length`` chars.
    return txt[: length - len(ellipsis)] + ellipsis
def shorten_proper(
    name: str, length: int = 11, ellipsis: str = "…", min: Optional[int] = None
) -> str:
    """Shorten a proper name for displaying.

    Keep whole words from the start of ``name`` while they fit in
    ``length``.  If the kept part is shorter than ``min`` characters
    (default: half of ``length``), fall back to a plain :func:`shorten`
    of the entire name.

    .. note:: the ``min`` parameter shadows the builtin; the name is kept
       for backward compatibility with existing keyword callers.
    """
    # Fixed annotation: the default is None, so the type is Optional[int].
    floor = min or length // 2
    kept = []  # type: List[str]
    used = -1  # -1 compensates for the separator counted before the first word
    for word in name.split(" "):
        used += len(word) + 1
        if used > length:
            break
        kept.append(word)
    short = " ".join(kept)
    if short and len(short) >= floor:
        return short
    return shorten(name, length=length, ellipsis=ellipsis)
def uncommafy(txt: str, sep: str = ",") -> Generator[str, None, None]:
    """Generate the elements of a ``sep``-separated string.

    Each element is stripped of surrounding whitespace; empty elements
    are never yielded.
    """
    stripped = (piece.strip() for piece in txt.split(sep))
    yield from (piece for piece in stripped if piece)
def random_string(
    length: int,
    chars: str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789",
) -> str:
    """Return a random string of some `length`, sampled from ``chars``.

    Not cryptographically secure (uses :mod:`random`, not :mod:`secrets`).
    """
    picks = [random.choice(chars) for _ in range(length)]
    return "".join(picks)
# Replacement table for characters that ISO-8859-1 (latin-1) cannot
# represent.  Each entry is ``(replacement, characters_to_replace)``;
# every character in the second element is replaced by the first.
latin1_map = (
    ('"', "“”"),
    ("-", "\u2013\u2014\u2022"),  # en dash, em dash, bullet
    ("'", "\u2018\u2019"),  # curly single quotes
    ("", "\ufffd\u2122\u2020"),  # replacement char, ™, † — dropped entirely
    ("...", "\u2026"),  # horizontal ellipsis
    ("i", "\u012b"),  # ī
    ("ã", "\u0101"),  # ā
    ("r", "\u0159"),  # ř
    ("Z", "\u017d"),  # Ž
    ("z", "\u017e"),  # ž
    ("EUR", "\u20ac"),  # €
)  # chars that ISO-8859-1 does not support
# Accent-stripping table for ASCII output: each entry is
# ``(replacement, characters_to_replace)``.  Extends ``latin1_map`` so
# ASCII output also covers the characters that latin-1 lacks.
ascii_map = (
    ("a", "áàâãäå\u0101"),
    ("e", "éèêẽë"),
    ("i", "íìîĩï"),
    ("o", "óòôõöø"),
    ("u", "úùûũü"),
    ("A", "ÁÀÂÃÄÅ"),
    ("E", "ÉÈÊẼË"),
    ("I", "ÍÌÎĨÏ"),
    ("O", "ÓÒÔÕÖØ"),
    ("U", "ÚÙÛŨÜ"),
    ("n", "ñ"),
    ("c", "ç"),
    ("N", "Ñ"),
    ("C", "Ç"),
    ("d", "Þ"),  # NOTE(review): thorn is usually transliterated "th" — kept as-is
    ("ss", "ß"),
    ("ae", "æ"),
    ("oe", "œ"),
) + latin1_map
[docs]def simplify_chars(txt, encoding="ascii", byts=False, amap=None):
"""Remove from ``txt`` all characters not supported by ``encoding``...
but using a map to "simplify" some characters instead of
just removing them.
If ``byts`` is true, return a bytestring.
"""
if not amap:
if encoding == "ascii":
amap = ascii_map
elif encoding.replace("-", "") in ("latin1", "iso88591"):
amap = latin1_map
for plain, funny in amap:
for f in funny:
txt = txt.replace(f, plain)
return txt.encode(encoding, "ignore") if byts else txt
def to_filename(
    txt: str,
    for_web: bool = False,
    badchars: str = "",
    maxlength: int = 0,
    encoding="latin1",
) -> str:
    """Massage ``txt`` until it is a good filename.

    Simplifies characters unsupported by ``encoding``, strips a fixed set
    of illegal filename characters plus ``badchars``, optionally truncates
    to ``maxlength``, and — when ``for_web`` — turns spaces into single
    dashes (suitable for URL slugs).

    Bug fix: the original collapsed dash runs with two chained
    ``replace("--", "-")`` calls, which left multiple dashes for runs of
    five or more spaces; a regex now collapses any run in one pass.
    """
    txt = simplify_chars(txt, encoding=encoding).strip()
    illegal = "\\/\t:?'\"<>|#$%&*[]•" + badchars
    for c in illegal:
        txt = txt.replace(c, "")
    if maxlength:
        txt = txt[:maxlength].strip()
    if for_web:
        txt = re.sub(r"-{2,}", "-", txt.replace(" ", "-"))
    return txt
def slugify(
    txt: str,
    exists: Callable[[str], bool] = lambda x: False,
    badchars: str = "",
    maxlength: int = 16,
    chars: str = "abcdefghijklmnopqrstuvwxyz23456789",
    min_suffix_length: int = 1,
    max_suffix_length: int = 4,
) -> str:
    """Return a slug that does not yet exist, based on ``txt``.

    You may provide ``exists``, a callback that takes a generated slug and
    checks the database to see if it already exists.  Each collision makes
    the next random suffix one character longer (up to
    ``max_suffix_length``) to keep the number of attempts at a minimum.
    """
    # Leave room for "-" plus the longest possible random suffix.
    base = to_filename(
        txt,
        for_web=True,
        badchars=badchars,
        maxlength=maxlength - max_suffix_length - 1,
    )
    candidate = base
    suffix_len = min_suffix_length
    while exists(candidate):
        candidate = base + "-" + random_string(suffix_len, chars=chars)
        if suffix_len != max_suffix_length:
            suffix_len += 1
    return candidate
def break_lines_near(
    text: str,
    length: int,
    leeway: int = 4,
    whitespace: str = " \r\n\t",
    end_line_break: str = "…",
    start_line_break: str = "…",
) -> List[str]:
    """Return a list of *text* broken in lines of max *length*.

    - ``leeway``: how far back from ``length`` to search for whitespace
    - ``whitespace``: characters considered whitespace
    - ``end_line_break``: character to add to the end of broken words
    - ``start_line_break``: character to add to the start of broken words

    Bug fix: when ``leeway >= length`` the whitespace probe went negative
    and silently indexed from the end of the buffer; the search range is
    now clamped at position 0.
    """
    if not text:
        return []
    assert leeway >= 0
    assert length > 0
    result = []  # type: List[str]
    text_buffer = text
    while len(text_buffer) != 0:
        if len(text_buffer) <= length:
            # Remainder fits on one line: emit it and stop.
            result.append(text_buffer)
            text_buffer = ""
            continue
        done = False
        # Look backwards from ``length`` for a whitespace to break on,
        # never probing before index 0 (the clamp is the fix).
        for pos in range(length, max(length - (leeway + 1), -1), -1):
            if text_buffer[pos] in whitespace:
                result.append(text_buffer[0:pos])
                text_buffer = text_buffer[pos + 1 :]
                done = True
                break
        if not done:
            # No whitespace nearby: hard-break inside the word, marking
            # the break at the end of this line and the start of the next.
            result.append(
                text_buffer[: length - len(end_line_break)] + end_line_break
            )
            text_buffer = (
                start_line_break + text_buffer[length - len(end_line_break) :]
            )
    return result
def find_new_title(dir: str, filename: str) -> str:
    """Return a path that does not exist yet, in ``dir``.

    If ``filename`` exists in ``dir``, add or bump a numeric "(NNN)"
    suffix on the file title until a free name is found.  For instance,
    if "Image.jpg" exists in "somedir", returns "somedir/Image(001).jpg";
    if "Image (01).jpg" exists, returns "somedir/Image (002).jpg".
    """
    counter_rx = re.compile(r"\((\d{1,5})\)$")
    candidate = os.path.join(dir, filename)
    while os.path.exists(candidate):
        title, ext = os.path.splitext(os.path.basename(candidate))
        match = counter_rx.search(title)
        if match is None:
            suffix = "(001)"
        else:
            suffix = "(%03d)" % (int(match.group(1)) + 1)
            # Drop the old "(NNN)" (including the opening parenthesis).
            title = title[: match.start(1) - 1]
        candidate = os.path.join(dir, title + suffix + ext)
    return candidate
def keep_digits(txt: str) -> str:
    """Discard from ``txt`` all non-numeric characters."""
    return "".join(ch for ch in txt if ch.isdigit())
def resist_bad_encoding(txt, possible_encodings=("utf8", "iso-8859-1")):
    """Decode ``txt`` while trying to avoid errors when its encoding is unknown,
    when erroring out would be worse than possibly displaying garbage.

    ``str`` input is returned untouched; ``bytes`` input is decoded with
    each of ``possible_encodings`` (errors ignored) and the longest
    result — i.e. the one that dropped the fewest bytes — wins.

    Bug fix: the original (a Python-2 leftover) returned bytes untouched
    and then called ``.decode`` on ``str``, which always raised
    ``AttributeError``; the type check is now inverted.

    Maybe we should use the chardet library instead...
    """
    if not isinstance(txt, bytes):
        return txt  # already text (or not decodable at all)
    best = ""
    for enc in possible_encodings:
        attempt = txt.decode(enc, "ignore")
        if len(attempt) > len(best):
            best = attempt
    return best
def capitalize(txt: str) -> str:
    """Trim, then turn only the first character into upper case.

    This function can be used as a colander preparer: ``None`` and
    ``colander.null`` values pass through untouched.
    """
    is_colander_null = not isinstance(txt, str) and repr(txt) == "<colander.null>"
    if txt is None or is_colander_null:
        return txt
    trimmed = str(txt).strip()
    if not trimmed:
        return trimmed
    # Unlike str.capitalize(), the rest of the string is left untouched.
    return trimmed[0].upper() + trimmed[1:]
def strip_preparer(value):
    """Colander preparer that trims whitespace around argument *value*.

    Non-string values (e.g. ``colander.null``) pass through untouched.
    """
    return value.strip() if isinstance(value, str) else value
def strip_lower_preparer(value):
    """Colander preparer that trims whitespace and converts to lowercase.

    Non-string values (e.g. ``colander.null``) pass through untouched.
    """
    return value.strip().lower() if isinstance(value, str) else value
def content_of(paths, encoding="utf-8", sep="\n"):
    """Read, join and return the contents of ``paths``.

    Makes it easy to read one or many files: ``paths`` may be a single
    path (``str`` or ``Path``) or a sequence of path strings.
    """
    if isinstance(paths, (str, Path)):
        paths = [str(paths)]
    chunks = []
    for path in paths:
        with codecs.open(path, encoding=encoding) as stream:
            chunks.append(stream.read())
    return sep.join(chunks)
def pluralize(singular: Optional[str]) -> str:
    """Return plural form of given lowercase singular word (English only).

    Based on ActiveState recipe http://code.activestate.com/recipes/413172/

    >>> pluralize('')
    ''
    >>> pluralize('goose')
    'geese'
    >>> pluralize('dolly')
    'dollies'
    >>> pluralize('genius')
    'genii'
    >>> pluralize('jones')
    'joneses'
    >>> pluralize('pass')
    'passes'
    >>> pluralize('zero')
    'zeros'
    >>> pluralize('casino')
    'casinos'
    >>> pluralize('hero')
    'heroes'
    >>> pluralize('church')
    'churches'
    >>> pluralize('x')
    'xs'
    >>> pluralize('car')
    'cars'
    """
    if not singular:
        return ""
    # Irregular words that follow no suffix rule.
    irregular = {
        "appendix": "appendices",
        "barracks": "barracks",
        "cactus": "cacti",
        "child": "children",
        "criterion": "criteria",
        "deer": "deer",
        "echo": "echoes",
        "elf": "elves",
        "embargo": "embargoes",
        "focus": "foci",
        "fungus": "fungi",
        "goose": "geese",
        "hero": "heroes",
        "hoof": "hooves",
        "index": "indices",
        "knife": "knives",
        "leaf": "leaves",
        "life": "lives",
        "man": "men",
        "mouse": "mice",
        "nucleus": "nuclei",
        "person": "people",
        "phenomenon": "phenomena",
        "potato": "potatoes",
        "self": "selves",
        "syllabus": "syllabi",
        "tomato": "tomatoes",
        "torpedo": "torpedoes",
        "veto": "vetoes",
        "woman": "women",
    }
    known = irregular.get(singular)
    if known:
        return known
    vowels = frozenset("aeiou")
    last = singular[-1]
    # Explicit length guards replace the original try/except IndexError;
    # single-character words always fall through to a plain "s".
    if last == "y" and len(singular) > 1 and singular[-2] not in vowels:
        return singular[:-1] + "ies"  # consonant + y -> ies
    if last == "s":
        if len(singular) < 2:
            return singular + "s"
        if singular[-2] in vowels:
            if singular[-3:] == "ius":
                return singular[:-2] + "i"  # genius -> genii
            return singular[:-1] + "ses"  # jones -> joneses
        return singular + "es"  # pass -> passes
    if singular[-2:] in ("ch", "sh"):
        return singular + "es"
    return singular + "s"