# Source code for kwutil.slugify_ext

"""
Fork of python-slugify.

https://pypi.org/project/python-slugify/1.2.2/
"""
import re
import unicodedata
import sys
from html.entities import name2codepoint
# Aliases for the builtin text type and ``chr``; the rest of the module uses
# these names when coercing input to text and decoding character references.
_unicode = str
_unicode_type = str
unichr = chr

__all__ = ['slugify', 'smart_truncate']


# Named HTML entity reference such as ``&amp;``; group 1 is the entity name,
# restricted to the names known to ``html.entities.name2codepoint``.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
# Decimal character reference such as ``&#38;``; group 1 is the number.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal character reference such as ``&#x26;``; group 1 is the hex digits.
HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
# One or more consecutive single quotes.
QUOTE_PATTERN = re.compile(r'[\']+')
# Runs of characters that are NOT allowed in a lowercase slug.
ALLOWED_CHARS_PATTERN = re.compile(r'[^-a-z0-9]+')
# Runs of characters that are NOT allowed in a case-preserving slug.
ALLOWED_CHARS_PATTERN_WITH_UPPERCASE = re.compile(r'[^-a-zA-Z0-9]+')
# Two or more consecutive dashes.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma sitting directly between two digits (e.g. the comma in ``1,000``).
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
# Separator used internally while the slug is being built.
DEFAULT_SEPARATOR = '-'


def _trunc_op(string, max_length, trunc_loc, hash_len=None, head='~', tail='~'):
    """
    Replace a window of ``string`` with ``head + <hash prefix> + tail``.

    The window is positioned around ``int(len(string) * trunc_loc)`` and is
    sized to remove the excess characters plus enough room for the inserted
    marker. The hash is computed over the whole input ``string`` (not only
    the removed window).

    Args:
        string (str): text to shorten.
        max_length (int): desired length of the result.
        trunc_loc (float): fraction of the string length at which the removed
            window is centred (0.0 removes from the start, 1.0 from the end).
        hash_len (int | None): number of leading hash characters to insert.
            If None, ``ceil(log(num_remove))`` clamped to the range [4, 32]
            is used, further limited by ``max_length`` and by the number of
            characters that need to be removed.
        head (str): text inserted immediately before the hash.
        tail (str): text inserted immediately after the hash.

    Returns:
        str: the shortened text.

    Example usage (not run as a doctest)::

        string = 'DarnOvercastSculptureTipperBlazerConcaveUnsuitedDerangedHexagonRockband'
        max_length = 16
        trunc_loc = 0.5
        _trunc_op(string, max_length, trunc_loc)
    """
    import ubelt as ub
    import math

    total_len = len(string)
    mid_pos = int(total_len * trunc_loc)

    # Always plan to remove at least one character; this also keeps the
    # logarithm below well defined.
    num_remove = max(total_len - max_length, 1)
    if hash_len is None:
        recommend = min(max(4, int(math.ceil(math.log(num_remove)))), 32)
        hash_len = min(max_length, num_remove, recommend)

    num_insert = hash_len + len(head) + len(tail)

    # Characters that must go: the excess plus room for the inserted marker.
    actual_remove = num_remove + num_insert

    digest = ub.hash_data(string)[0:hash_len]

    if actual_remove >= total_len:
        # The window covers the entire string, so nothing of the original
        # text survives. Returning early avoids computing a negative
        # ``low_pos`` below, which would otherwise be interpreted as a
        # from-the-end slice index and keep almost the whole string.
        return ''.join([head, digest, tail])

    low_pos = max(0, mid_pos - actual_remove // 2)
    high_pos = min(total_len, mid_pos + actual_remove // 2)
    if low_pos <= 0:
        # Window was clipped at the start: extend it to the right instead.
        high_pos += actual_remove - (high_pos - low_pos)
    if high_pos >= total_len:
        # Window was clipped at the end: extend it to the left instead.
        low_pos -= actual_remove - (high_pos - low_pos)

    really_removed = high_pos - low_pos
    # NOTE(review): when ``actual_remove`` is odd and the window touches
    # neither end of the string, ``really_removed`` is one short of
    # ``actual_remove`` and this adjustment moves ``high_pos`` back by one
    # more, so the result is two characters longer than ``max_length``
    # (e.g. 12 characters for max_length=10 at trunc_loc=0.5). Left unchanged
    # because flipping the sign alters existing outputs; confirm intent.
    high_pos += really_removed - actual_remove

    begin = string[:low_pos]
    end = string[high_pos:]

    return ''.join([begin, head, digest, tail, end])


def smart_truncate(string, max_length=0, word_boundary=False, separator=' ',
                   save_order=False, trunc_loc=0.5, hash_len=None, head='~',
                   tail='~'):
    """
    Truncate a string to a maximum length, replacing the truncated part with
    a marker containing a hash of the full (separator-stripped) string.

    Args:
        string (str): string for modification
        max_length (int): output string length. A falsy value (the default,
            0) disables truncation.
        word_boundary (bool): word-wise truncation is not currently
            implemented. The flag only has an effect when ``separator`` does
            not occur in ``string``: in that case the truncated text is
            returned without stripping ``separator`` characters from its
            ends.
        save_order (bool): accepted for API compatibility; currently unused.
        separator (str): separator between words. Its characters are stripped
            from both ends of the input and (normally) of the output.
        trunc_loc (float): fraction of location where to remove the text
        hash_len (int | None): number of hash characters to insert. If None,
            the truncation helper picks a length.
        head (str): text placed immediately before the hash.
        tail (str): text placed immediately after the hash.

    Returns:
        str

    Example:
        >>> from kwutil.slugify_ext import smart_truncate
        >>> print(smart_truncate('f' * 100, max_length=10, trunc_loc=0.0))
        >>> print(smart_truncate('f' * 100, max_length=10, trunc_loc=0.5))
        >>> print(smart_truncate('f' * 100, max_length=10, trunc_loc=1.0))
        ~944b6~fff
        ff~944b6~fff
        fff~944b6~

    Example:
        >>> from kwutil.slugify_ext import smart_truncate
        >>> # Strings that already fit are returned unchanged
        >>> assert smart_truncate('f' * 10, max_length=10) == 'f' * 10
    """
    string = string.strip(separator)

    # Nothing to do when truncation is disabled or the string already fits.
    # ``<=`` (rather than ``<``) matters: a string that is exactly
    # ``max_length`` long must not be mangled by the hash truncation.
    if not max_length or len(string) <= max_length:
        return string

    truncated = _trunc_op(string, max_length, trunc_loc, hash_len=hash_len,
                          head=head, tail=tail)

    # Preserved quirk: with ``word_boundary`` set and no separator present in
    # the input, the truncated text is returned without the final strip.
    if word_boundary and separator not in string:
        return truncated
    return truncated.strip(separator)
def slugify(text, entities=True, decimal=True, hexadecimal=True, max_length=0,
            word_boundary=False, separator=DEFAULT_SEPARATOR, save_order=False,
            stopwords=(), regex_pattern=None, lowercase=True, replacements=(),
            trunc_loc=1.0):
    """
    Make a slug from the given text.

    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length; values <= 0 disable
        truncation
    :param word_boundary (bool): forwarded to smart_truncate
    :param save_order (bool): forwarded to smart_truncate
    :param separator (str): separator between words; the slug is built with
        '-' and converted to this separator as the last step
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for allowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g.
        [['|', 'or'], ['%', 'percent']]; applied both before and after the
        slug is built
    :param trunc_loc (float): forwarded to smart_truncate; fraction of
        location where to remove the text
    :return (str):

    # Example:
    #     >>> from kwutil.slugify_ext import slugify  # NOQA
    #     >>> import ubelt as ub
    #     >>> text = ub.cmd('diceware -n 12')['out'].strip()
    #     >>> print('text = {!r}'.format(text))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=1.0)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.8)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.5)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.2)
    #     >>> print('slug = {!r}'.format(slug))
    #     >>> slug = slugify(text, max_length=10, lowercase=0, trunc_loc=0.0)
    #     >>> print('slug = {!r}'.format(slug))
    """
    # ensure text is unicode. This happens before the replacement rules so
    # that bytes input can be combined with str replacement rules.
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # decode unicode (transliterate), using whichever backend is installed
    try:
        try:
            import text_unidecode as unidecode
        except ImportError:
            import unidecode
    except ImportError:
        import warnings
        warnings.warn('Warning text_unidecode, used by kwutil.slugify_ext is not available')
    else:
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, _unicode_type):
        text = _unicode(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(
            lambda m: unichr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        try:
            text = DECIMAL_PATTERN.sub(lambda m: unichr(int(m.group(1))), text)
        except (ValueError, OverflowError):
            # A reference outside the valid code point range makes the
            # conversion raise; the text is then left with its decimal
            # references undecoded (best effort).
            pass

    # hexadecimal character reference
    if hexadecimal:
        try:
            text = HEX_PATTERN.sub(lambda m: unichr(int(m.group(1), 16)), text)
        except (ValueError, OverflowError):
            # Same best-effort policy as for decimal references.
            pass

    # translate. Characters that are still outside the allowed set after
    # normalization are turned into separators by the pattern pass below.
    text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if lowercase:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN
    else:
        pattern = regex_pattern or ALLOWED_CHARS_PATTERN_WITH_UPPERCASE
    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        if lowercase:
            # a set gives constant-time membership tests
            stopwords_lower = {s.lower() for s in stopwords}
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
        else:
            words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    if replacements:
        for old, new in replacements:
            text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order, trunc_loc=trunc_loc)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text