Source code for transliterate.utils

import logging
import unicodedata
import re

try:
    from collections import Counter
except ImportError:
    from transliterate.backports.collections import Counter

from transliterate.discover import autodiscover
from transliterate.base import registry
from transliterate.exceptions import (
    LanguageCodeError, LanguagePackNotFound, LanguageDetectionError
)
from transliterate.conf import get_setting

# Module-level logger. Named after the module's dotted import path (rather
# than its filesystem path) so it takes part in the normal ``logging``
# hierarchy and can be configured by package name.
logger = logging.getLogger(__name__)

# Placeholder translation marker: returns the given string unchanged.
_ = lambda s: s

__title__ = 'transliterate.utils'
__author__ = 'Artur Barseghyan'
__copyright__ = '2013-2016 Artur Barseghyan'
__license__ = 'GPL 2.0/LGPL 2.1'
# Every name listed here must be defined in this module, otherwise
# ``from transliterate.utils import *`` fails with ``AttributeError``.
__all__ = (
    'translit', 'get_available_language_codes', 'suggest', 'detect_language',
    'slugify',
)

def ensure_autodiscover():
    """
    Run ``autodiscover`` when the language pack registry is still empty.

    Does nothing if the registry already holds at least one entry.
    """
    if registry._registry:
        # Something is registered already; nothing to discover.
        return
    autodiscover()

def translit(value, language_code=None, reversed=False, strict=False):
    """
    Transliterates the text for the language given.

    Language code is optional in case of reversed translations (from some
    script to latin); it is then detected from ``value``.

    :param str value: Text to transliterate.
    :param str language_code: Code of the language pack to use. May be
        omitted only when ``reversed`` is True.
    :param bool reversed: If set to True, reversed translation is made.
    :param bool strict: If given, all that are not found in the
        transliteration pack, are simply stripped out.
    :return str:
    :raises LanguageCodeError: If ``language_code`` is None while
        ``reversed`` is False.
    :raises LanguageDetectionError: If ``language_code`` is None and the
        language can't be detected from ``value``.
    :raises LanguagePackNotFound: If no language pack is registered for the
        language code.
    """
    ensure_autodiscover()

    if language_code is None and reversed is False:
        raise LanguageCodeError(
            _("``language_code`` is optional with ``reversed`` set to True "
              "only.")
        )

    if language_code is None:
        # Detection must not fail silently here: without a language code
        # there is no language pack to pick.
        language_code = detect_language(value, fail_silently=False)

    cls = registry.get(language_code)

    if cls is None:
        # Interpolate *after* the translation marker so that the constant
        # message template (not the formatted text) is what gets looked up.
        raise LanguagePackNotFound(
            _("Language pack for code %s is not found.") % language_code
        )

    language_pack = cls()
    return language_pack.translit(value, reversed=reversed, strict=strict)
def suggest(value, language_code=None, reversed=False, limit=None):
    """
    Suggest possible variants.

    Language code is optional in case of reversed translations; it is then
    detected from ``value``, the same way ``translit`` does it.

    :param str value: Text to suggest variants for.
    :param str language_code: Code of the language pack to use. May be
        omitted only when ``reversed`` is True.
    :param bool reversed: If set to True, reversed translation is made.
    :param int limit: Limit number of suggested variants.
    :return list:
    :raises LanguageCodeError: If ``language_code`` is None while
        ``reversed`` is False.
    :raises LanguagePackNotFound: If no language pack is registered for the
        (given or detected) language code.
    """
    ensure_autodiscover()

    if language_code is None and reversed is False:
        raise LanguageCodeError(
            _("``language_code`` is optional with ``reversed`` set to True "
              "only.")
        )

    if language_code is None:
        # Honour the "optional with reversed=True" contract stated above.
        # Detection is silent on purpose: when nothing is detected the code
        # stays None and the lookup below raises LanguagePackNotFound, which
        # is the exception this call path has always produced.
        language_code = detect_language(value, fail_silently=True)

    cls = registry.get(language_code)

    if cls is None:
        # Interpolate *after* the translation marker so that the constant
        # message template (not the formatted text) is what gets looked up.
        raise LanguagePackNotFound(
            _("Language pack for code %s is not found.") % language_code
        )

    language_pack = cls()
    return language_pack.suggest(value, reversed=reversed, limit=limit)
def get_available_language_codes():
    """
    Return the language codes of all registered language packs.

    Runs autodiscovery first if the registry is still empty.

    :return list:
    """
    ensure_autodiscover()
    return list(registry._registry.keys())
def get_available_language_packs():
    """
    Return all registered language packs.

    Runs autodiscovery first if the registry is still empty.

    :return list:
    """
    ensure_autodiscover()
    return list(registry._registry.values())
def get_language_pack(language_code):
    """
    Look up the registered language pack for the language code given.

    Runs autodiscovery first if the registry is still empty.

    :param str language_code:
    :return transliterate.base.TranslitLanguagePack: Returns None on failure.
    """
    ensure_autodiscover()
    return registry._registry.get(language_code)


def strip_numbers(text):
    """
    Strips numbers from unicode string.

    :param unicode text:
    :return unicode: ``text`` without the characters for which
        ``isdigit()`` is true.
    """
    return ''.join(char for char in text if not char.isdigit())


def extract_most_common_words(text, num_words=None):
    """
    Extracts most common words.

    Digits are stripped first, the text is then split on single spaces and
    only words longer than one character are counted.

    :param unicode text:
    :param int num_words: Maximum number of words to return. When None, the
        ``LANGUAGE_DETECTION_MAX_NUM_KEYWORDS`` setting is used.
    :return list: ``(word, count)`` tuples, most frequent first.
    """
    if num_words is None:
        num_words = get_setting('LANGUAGE_DETECTION_MAX_NUM_KEYWORDS')

    candidates = strip_numbers(text).split(' ')
    frequencies = Counter(token for token in candidates if len(token) > 1)
    return frequencies.most_common(num_words)
def detect_language(text, num_words=None, fail_silently=True,
                    heavy_check=False):
    """
    Detects the language from the text given, based on the characters that
    the active (detectable) language packs report as theirs.

    The most common words of ``text`` are taken; for every letter of those
    words, each detectable language pack that contains the letter gets one
    vote. The language code with the most votes wins.

    :param unicode text: Input string.
    :param int num_words: Number of words to base decision on. When None,
        the ``LANGUAGE_DETECTION_MAX_NUM_KEYWORDS`` setting is used.
    :param bool fail_silently: If True, None is returned when no language
        could be detected; otherwise ``LanguageDetectionError`` is raised.
    :param bool heavy_check: Intended to enable language specific heavy
        checks (the ``detect`` method of each language pack) when simple
        checks don't give any results. NOTE: this function does not
        currently read this argument.
    :return str: Language code, or None if nothing was detected and
        ``fail_silently`` is True.
    :raises LanguageDetectionError: If nothing was detected and
        ``fail_silently`` is False.
    """
    ensure_autodiscover()

    if num_words is None:
        num_words = get_setting('LANGUAGE_DETECTION_MAX_NUM_KEYWORDS')

    most_common_words = extract_most_common_words(text, num_words=num_words)

    counter = Counter()
    available_language_packs = get_available_language_packs()

    # Word frequencies are not used for voting; only the letters are.
    # (``_`` is the translation marker in this module, so it must not be
    # used as the throwaway name here.)
    for word, _count in most_common_words:
        for letter in word:
            for language_pack in available_language_packs:
                if language_pack.detectable and language_pack.contains(letter):
                    counter[language_pack.language_code] += 1

    try:
        return counter.most_common(1)[0][0]
    except IndexError as err:
        # No language pack claimed any letter, so there is no winner.
        if get_setting('DEBUG'):
            logger.debug(str(err))

    if not fail_silently:
        raise LanguageDetectionError(
            _("""Can't detect language for the text "%s" given.""") % text
        )

    return None
def slugify(text, language_code=None):
    """
    Slugifies the given text.

    If no ``language_code`` is given, auto-detects the language code from
    text given. The text is transliterated (reversed), reduced to ASCII,
    stripped of everything but word characters, whitespace and hyphens,
    lower-cased, and runs of whitespace/hyphens are collapsed into a single
    hyphen.

    :param unicode text:
    :param str language_code: Language code of ``text``; detected from
        ``text`` when omitted.
    :return str: The slug, or None if no ``language_code`` was given and
        none could be detected.
    """
    if not language_code:
        language_code = detect_language(text)

    if not language_code:
        # Detection fails silently by default; without a language there is
        # nothing to transliterate with, so no slug is produced.
        return None

    transliterated_text = translit(text, language_code, reversed=True)

    # Decompose accented characters, then drop whatever is not ASCII.
    slug = (
        unicodedata.normalize('NFKD', transliterated_text)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    # Raw strings: ``\w`` and ``\s`` are regex escapes, not string escapes.
    slug = re.sub(r'[^\w\s-]', '', slug).strip().lower()
    return re.sub(r'[-\s]+', '-', slug)