"""
Fork of python-slugify.
https://pypi.org/project/python-slugify/1.2.2/
"""
import re
import unicodedata
import sys
from html.entities import name2codepoint
# Aliases for the builtin text type and ``chr``.  They look like Python 2/3
# compatibility shims (presumably inherited from the upstream package); here
# they are simply the builtins.
_unicode = str
_unicode_type = str
unichr = chr
# Names exported by ``from <module> import *``.
__all__ = ['slugify', 'smart_truncate']
# Named HTML character references such as ``&amp;``; group 1 is the entity
# name, built from every key in ``html.entities.name2codepoint``.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
# Decimal character references such as ``&#38;``; group 1 is the digits.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal character references such as ``&#x26;``; group 1 is the hex digits.
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
# One or more consecutive single quotes.
QUOTE_PATTERN = re.compile(r'[\']+')
# NOTE: despite their names, the two patterns below match runs of characters
# that are NOT allowed in a slug (anything except dash, digits and letters).
ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+')
ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+')
# Two or more consecutive dashes.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma that sits directly between two digits (e.g. the one in ``1,000``).
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
# Separator used internally while building a slug.
DEFAULT_SEPARATOR = '-'
def _trunc_op(string, max_length, trunc_loc, hash_len=None, head='~', tail='~'):
"""
max_length
string = 'DarnOvercastSculptureTipperBlazerConcaveUnsuitedDerangedHexagonRockband'
max_length = 16
trunc_loc = 0.5
_trunc_op(string, max_length, trunc_loc)
"""
total_len = len(string)
mid_pos = int(total_len * trunc_loc)
num_remove = max(total_len - max_length, 1)
import ubelt as ub
import math
if hash_len is None:
recommend = min(max(4, int(math.ceil(math.log(num_remove)))), 32)
hash_len = min(max_length, min(num_remove, recommend))
num_insert = hash_len + len(head) + len(tail)
actual_remove = num_remove + num_insert
low_pos = max(0, (mid_pos - (actual_remove) // 2))
high_pos = min(total_len, (mid_pos + (actual_remove) // 2))
if low_pos <= 0:
n_extra = actual_remove - (high_pos - low_pos)
high_pos += n_extra
if high_pos >= total_len:
n_extra = actual_remove - (high_pos - low_pos)
low_pos -= n_extra
really_removed = (high_pos - low_pos)
high_pos += (really_removed - actual_remove)
begin = string[:low_pos]
mid = string[low_pos:high_pos]
end = string[high_pos:]
mid = ub.hash_data(string)[0:hash_len]
trunc_text = ''.join([begin, head, mid, tail, end])
return trunc_text
# NOTE: a stray "[docs]" link marker from the rendered documentation was removed here.
def smart_truncate(string, max_length=0, word_boundary=False, separator=' ', save_order=False, trunc_loc=0.5, hash_len=None, head='~', tail='~'):
    """
    Truncate a string to a maximum length, replacing the truncated part with
    a short hash wrapped in ``head`` / ``tail`` markers.

    Args:
        string (str): string for modification. Leading and trailing
            ``separator`` characters are always stripped first.
        max_length (int): output string length. A falsy value (0 / None)
            disables truncation.
        word_boundary (bool): word-boundary truncation is currently disabled
            in this fork. The flag only changes one detail: when it is True
            and ``separator`` does not occur in ``string``, the truncated
            result is returned without a final ``strip(separator)``.
        separator (str): separator between words
        save_order (bool): accepted for API compatibility; currently unused.
        trunc_loc (float):
            fraction of location where to remove the text
        hash_len (int | None): number of hash characters to insert; if None
            a length is chosen automatically.
        head (str): marker inserted before the hash
        tail (str): marker inserted after the hash

    Returns:
        str: ``string`` unchanged (apart from stripping) when it already
        fits in ``max_length``, otherwise the truncated text.

    Example:
        >>> from kwutil.slugify_ext import smart_truncate
        >>> print(smart_truncate('f' * 100, max_length=11, trunc_loc=0.0))
        ~944b6~ffff
        >>> print(smart_truncate('f' * 100, max_length=11, trunc_loc=0.5))
        ff~944b6~ff
        >>> print(smart_truncate('f' * 100, max_length=11, trunc_loc=1.0))
        ffff~944b6~
        >>> print(smart_truncate('abcdefghij', max_length=10))
        abcdefghij
    """
    string = string.strip(separator)

    # Nothing to do when truncation is disabled or the text already fits.
    # A string of exactly max_length characters fits and must not be altered.
    if not max_length or len(string) <= max_length:
        return string

    truncated = _trunc_op(string, max_length, trunc_loc, hash_len=hash_len, head=head, tail=tail)

    # NOTE: the word-by-word truncation of the upstream package is disabled
    # here; every over-long string goes through _trunc_op.
    if word_boundary and separator not in string:
        return truncated
    return truncated.strip(separator)
# NOTE: a stray "[docs]" link marker from the rendered documentation was removed here.
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0, word_boundary=False,
            separator=DEFAULT_SEPARATOR, save_order=False, stopwords=(), regex_pattern=None, lowercase=True,
            replacements=(), trunc_loc=1.0):
    """
    Make a slug from the given text.

    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): forwarded to :func:`smart_truncate`
    :param save_order (bool): forwarded to :func:`smart_truncate`
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern matching the characters that are
        NOT allowed; every match is replaced by a dash
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']].
        Applied twice: to the raw input and again to the finished slug.
    :param trunc_loc (float): forwarded to :func:`smart_truncate`; fraction of
        the text around which characters are removed when truncating
    :return (str):

    # Example:
    #     >>> from kwutil.slugify_ext import slugify  # NOQA
    #     >>> import ubelt as ub
    #     >>> text = ub.cmd('diceware -n 12')['out'].strip()
    #     >>> print('text = {!r}'.format(text))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=1.0)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.8)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.5)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.2)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.0)
    #     >>> print('slug = {!r}'.format(slug))
    """
    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # ensure text is unicode
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode (transliterate to ASCII); this is optional and only
    # warns when neither transliteration package is installed
    try:
        try:
            import text_unidecode as unidecode
        except ImportError:
            import unidecode
    except ImportError:
        import warnings
        warnings.warn('Warning text_unidecode, used by kwutil.slugify_ext is not available')
    else:
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: unichr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text)
        except (ValueError, OverflowError):
            # A reference outside the valid code point range: best effort,
            # leave every reference undecoded.
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text)
        except (ValueError, OverflowError):
            # Same best-effort handling as for decimal references.
            pass

    # translate
    text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if lowercase:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN
    else:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        if lowercase:
            stopwords_lower = {s.lower() for s in stopwords}
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
        else:
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order, trunc_loc=trunc_loc)

    # the slug is built with the default separator; swap it only at the end
    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text