from collections import Counter
import logging
import re
import unicodedata
from .base import registry
from .conf import get_setting
from .discover import autodiscover
from .exceptions import (
LanguageCodeError,
LanguageDetectionError,
LanguagePackNotFound,
)
# Module-level logger, named after this module's import path.
LOGGER = logging.getLogger(__name__)
# Package metadata.
__title__ = 'transliterate.utils'
__author__ = 'Artur Barseghyan'
__copyright__ = '2013-2018 Artur Barseghyan'
__license__ = 'GPL 2.0/LGPL 2.1'
# Names exported by a star-import of this module. Helpers defined in this
# module but not listed here are not exported that way.
__all__ = (
'detect_language',
'get_available_language_codes',
'get_available_language_packs',
'get_translit_function',
'slugify',
'suggest',
'translit',
)
def _(val):
    """Return ``val`` unchanged.

    Stand-in carrying the conventional gettext-style name; no translation
    is performed.

    :param val: Value to pass through.
    :return: The very same object that was passed in.
    """
    return val
def ensure_autodiscover():
    """Run ``autodiscover`` when the registry holds no entries yet."""
    if registry.registry:
        # Something is registered already; nothing to discover.
        return
    autodiscover()
def get_translit_function(language_code):
    """Return translit function for the language given.

    :param str language_code: Code the language pack is registered under.
    :return callable: The ``translit`` attribute of a newly created instance
        of the registered language pack.
    :raises LanguagePackNotFound: If the registry has no pack for
        ``language_code``.
    """
    ensure_autodiscover()

    cls = registry.get(language_code)
    if cls is None:
        # Interpolate after the ``_`` call so that the wrapper receives the
        # message template rather than an already formatted string.
        raise LanguagePackNotFound(
            _("Language pack for code %s is not found.") % language_code
        )

    language_pack = cls()
    return language_pack.translit
def translit(value, language_code=None, reversed=False, strict=False):
    """Transliterate the text for the language given.

    Language code is optional in case of reversed translations (from some
    script to latin); it is then detected from ``value``.

    :param str value: Text to transliterate.
    :param str language_code: Code of the language pack to use.
    :param bool reversed: If set to True, reversed translation is made.
    :param bool strict: Passed through to the language pack's ``translit``
        method. If given, all that are not found in the transliteration
        pack, are simply stripped out.
    :return str:
    :raises LanguageCodeError: If ``language_code`` is omitted while
        ``reversed`` is not set.
    :raises LanguageDetectionError: If ``language_code`` is omitted and the
        language of ``value`` cannot be detected.
    :raises LanguagePackNotFound: If no pack is registered for the code.
    """
    ensure_autodiscover()

    if language_code is None:
        # Truthiness test instead of ``is False`` so that falsy non-bool
        # values (``0``, ``None``) cannot slip past this guard.
        if not reversed:
            raise LanguageCodeError(
                _("``language_code`` is optional with ``reversed`` set to "
                  "True only.")
            )
        language_code = detect_language(value, fail_silently=False)

    cls = registry.get(language_code)
    if cls is None:
        # Interpolate after the ``_`` call so that the wrapper receives the
        # message template rather than an already formatted string.
        raise LanguagePackNotFound(
            _("Language pack for code %s is not found.") % language_code
        )

    language_pack = cls()
    return language_pack.translit(value, reversed=reversed, strict=strict)
def suggest(value, language_code=None, reversed=False, limit=None):
    """Suggest possible variants.

    Language code is optional in case of reversed suggestions; it is then
    detected from ``value``, the same way ``translit`` does it.

    :param str value: Text to produce suggestions for.
    :param str language_code: Code of the language pack to use.
    :param bool reversed: If set to True, reversed translation is made.
    :param int limit: Limit number of suggested variants.
    :return list:
    :raises LanguageCodeError: If ``language_code`` is omitted while
        ``reversed`` is not set.
    :raises LanguageDetectionError: If ``language_code`` is omitted and the
        language of ``value`` cannot be detected.
    :raises LanguagePackNotFound: If no pack is registered for the code.
    """
    ensure_autodiscover()

    if language_code is None:
        # Truthiness test instead of ``is False`` so that falsy non-bool
        # values (``0``, ``None``) cannot slip past this guard.
        if not reversed:
            raise LanguageCodeError(
                _("``language_code`` is optional with ``reversed`` set to "
                  "True only.")
            )
        # Without this step a missing code was looked up in the registry
        # as ``None`` and always ended in LanguagePackNotFound.
        language_code = detect_language(value, fail_silently=False)

    cls = registry.get(language_code)
    if cls is None:
        # Interpolate after the ``_`` call so that the wrapper receives the
        # message template rather than an already formatted string.
        raise LanguagePackNotFound(
            _("Language pack for code %s is not found.") % language_code
        )

    language_pack = cls()
    return language_pack.suggest(value, reversed=reversed, limit=limit)
def get_available_language_codes():
    """Return the keys of the language pack registry as a list.

    Autodiscovery is triggered first if the registry is still empty.

    :return list:
    """
    ensure_autodiscover()
    registered = registry.registry
    return list(registered.keys())
def get_available_language_packs():
    """Return the values of the language pack registry as a list.

    Autodiscovery is triggered first if the registry is still empty.

    :return list:
    """
    ensure_autodiscover()
    registered = registry.registry
    return list(registered.values())
def get_language_pack(language_code):
    """Look up the registry entry stored under the language code given.

    :param str language_code:
    :return transliterate.base.TranslitLanguagePack: ``None`` when nothing
        is registered under ``language_code``.
    """
    ensure_autodiscover()
    registered = registry.registry
    found = registered.get(language_code, None)
    return found
# Strips numbers from unicode string.
def strip_numbers(text):
    """Return ``text`` without the characters ``str.isdigit`` accepts.

    :param str text:
    :return str:
    """
    kept = (char for char in text if not char.isdigit())
    return ''.join(kept)
def extract_most_common_words(text, num_words=None):
    """Extract most common words.

    Digits are stripped from ``text`` first. The remainder is split on any
    run of whitespace (spaces, tabs, newlines) and words of a single
    character are ignored.

    :param unicode text:
    :param int num_words: Maximum number of entries to return. Defaults to
        the ``LANGUAGE_DETECTION_MAX_NUM_KEYWORDS`` setting.
    :return list: ``(word, occurrences)`` tuples, most frequent first.
    """
    if num_words is None:
        num_words = get_setting('LANGUAGE_DETECTION_MAX_NUM_KEYWORDS')

    text = strip_numbers(text)

    # ``split()`` without a separator also breaks on tabs and newlines;
    # splitting on a literal space glued such words together.
    counter = Counter(word for word in text.split() if len(word) > 1)
    return counter.most_common(num_words)
def detect_language(text, num_words=None, fail_silently=True,
                    heavy_check=False):
    """Detect the language from the text given.

    Every letter of the most common words of ``text`` is offered to each
    detectable language pack. A pack scores one point per letter it
    contains and the code of the highest scoring pack is returned.

    :param unicode text: Input string.
    :param int num_words: Number of words to base decision on. Defaults to
        the ``LANGUAGE_DETECTION_MAX_NUM_KEYWORDS`` setting.
    :param bool fail_silently: If False, an exception is raised when no
        language pack matched; otherwise ``None`` is returned.
    :param bool heavy_check: Accepted as part of the signature, but not
        consulted by this function.
    :return str: Language code, or ``None`` if nothing matched and
        ``fail_silently`` is set.
    :raises LanguageDetectionError: If nothing matched and
        ``fail_silently`` is False.
    """
    ensure_autodiscover()

    if num_words is None:
        num_words = get_setting('LANGUAGE_DETECTION_MAX_NUM_KEYWORDS')

    most_common_words = extract_most_common_words(text, num_words=num_words)

    # Filter once up front; packs that are not detectable never score.
    detectable_packs = [
        language_pack
        for language_pack in get_available_language_packs()
        if language_pack.detectable
    ]

    counter = Counter()
    for word, _occurrences in most_common_words:
        for letter in word:
            for language_pack in detectable_packs:
                if language_pack.contains(letter):
                    counter[language_pack.language_code] += 1

    try:
        return counter.most_common(1)[0][0]
    except IndexError as err:
        # Empty counter: no pack recognised a single letter.
        if get_setting('DEBUG'):
            LOGGER.debug(str(err))

    if not fail_silently:
        raise LanguageDetectionError(
            _("""Can't detect language for the text "%s" given.""") % text
        )
    return None
def slugify(text, language_code=None):
    """Build a lowercase ASCII slug from the text given.

    The text is transliterated in reverse with the language pack for
    ``language_code``, decomposed with NFKD and reduced to ASCII. Characters
    other than word characters, whitespace and hyphens are then dropped,
    and runs of whitespace and hyphens collapse into a single hyphen.

    :param str text:
    :param str language_code: Detected from ``text`` when not given.
    :return str: The slug, or ``None`` when no language code is given and
        none could be detected.
    """
    code = language_code or detect_language(text)
    if not code:
        return None

    latin = translit(text, code, reversed=True)
    ascii_bytes = unicodedata.normalize('NFKD', latin).encode('ascii', 'ignore')
    cleaned = re.sub(r'[^\w\s-]', '', ascii_bytes.decode('ascii'))
    return re.sub(r'[-\s]+', '-', cleaned.strip().lower())