# -*- coding: utf-8 -*-
import re
import unicodedata
import six
from .exceptions import (
ImproperlyConfigured,
InvalidRegistryItemType
)
# Module metadata, exposed as conventional dunder attributes.
__title__ = 'transliterate.base'
__author__ = 'Artur Barseghyan'
__copyright__ = '2013-2018 Artur Barseghyan'
__license__ = 'GPL 2.0/LGPL 2.1'
# Public API of this module: the only names exported by a star-import.
__all__ = (
'registry',
'TranslitLanguagePack',
)
class TranslitLanguagePack(object):
    """Base language pack.

    The attributes below shall be defined in every language pack.

    ``language_code``: Language code (obligatory). Example value: 'hy', 'ru'.

    ``language_name``: Language name (obligatory). Example value: 'Armenian',
    'Russian'.

    ``character_ranges``: Character ranges that are specific to the language.
    When making a pack, check `this
    <http://en.wikipedia.org/wiki/List_of_Unicode_characters>`_ page for
    the ranges.

    ``mapping``: Mapping (obligatory). A tuple, consisting of two strings
    (source and target). Example value: (u'abc', u'աբց').

    ``reversed_specific_mapping``: Specific mapping (one direction only) used
    when transliterating from target script to source script (reversed
    transliteration).

    ``pre_processor_mapping``: Pre processor mapping (optional). A dictionary
    mapping for letters that can't be represented by a single latin letter.

    ``reversed_specific_pre_processor_mapping``: Pre processor mapping (
    optional). A dictionary mapping for letters that can't be represented
    by a single latin letter (reversed transliteration).

    ``characters``: Characters kept by strict mode after a forward
    transliteration (optional). Everything else is stripped.

    ``reversed_characters``: Characters kept by strict mode after a reversed
    transliteration (optional). Everything else is stripped.

    :example:

    >>> class ArmenianLanguagePack(TranslitLanguagePack):
    >>>     language_code = "hy"
    >>>     language_name = "Armenian"
    >>>     character_ranges = ((0x0530, 0x058F), (0xFB10, 0xFB1F))
    >>>     mapping = (
    >>>         u"abgdezilxkhmjnpsvtrcq&ofABGDEZILXKHMJNPSVTRCQOF", # Source script
    >>>         u"աբգդեզիլխկհմյնպսվտրցքևօֆԱԲԳԴԵԶԻԼԽԿՀՄՅՆՊՍՎՏՐՑՔՕՖ", # Target script
    >>>     )
    >>>     reversed_specific_mapping = (
    >>>         u"ռՌ",
    >>>         u"rR"
    >>>     )
    >>>     pre_processor_mapping = {
    >>>         # lowercase
    >>>         u"e'": u"է",
    >>>         u"y": u"ը",
    >>>         u"th": u"թ",
    >>>         u"jh": u"ժ",
    >>>         u"ts": u"ծ",
    >>>         u"dz": u"ձ",
    >>>         u"gh": u"ղ",
    >>>         u"tch": u"ճ",
    >>>         u"sh": u"շ",
    >>>         u"vo": u"ո",
    >>>         u"ch": u"չ",
    >>>         u"dj": u"ջ",
    >>>         u"ph": u"փ",
    >>>         u"u": u"ու",
    >>>
    >>>         # uppercase
    >>>         u"E'": u"Է",
    >>>         u"Y": u"Ը",
    >>>         u"Th": u"Թ",
    >>>         u"Jh": u"Ժ",
    >>>         u"Ts": u"Ծ",
    >>>         u"Dz": u"Ձ",
    >>>         u"Gh": u"Ղ",
    >>>         u"Tch": u"Ճ",
    >>>         u"Sh": u"Շ",
    >>>         u"Vo": u"Ո",
    >>>         u"Ch": u"Չ",
    >>>         u"Dj": u"Ջ",
    >>>         u"Ph": u"Փ",
    >>>         u"U": u"Ու"
    >>>     }
    >>>     reversed_specific_pre_processor_mapping = {
    >>>         u"ու": u"u",
    >>>         u"Ու": u"U"
    >>>     }

    Note, that in Python 3 you won't be using u prefix before the strings.
    """

    language_code = None
    language_name = None
    character_ranges = None
    mapping = None
    reversed_specific_mapping = None
    # The ``*_keys`` attributes are rebound (never mutated) per instance in
    # ``__init__``, so sharing the empty class-level lists is harmless.
    reversed_pre_processor_mapping_keys = []
    reversed_specific_pre_processor_mapping = None
    reversed_specific_pre_processor_mapping_keys = []
    pre_processor_mapping = None
    pre_processor_mapping_keys = []
    # NOTE(review): ``detectable`` is not read anywhere in this class;
    # presumably it is consumed by detection helpers elsewhere - confirm.
    detectable = False
    characters = None
    reversed_characters = None

    def __init__(self):
        """Validate the pack definition and build the translation tables.

        :raises ImproperlyConfigured: If ``language_code`` or
            ``language_name`` is ``None`` or ``mapping`` is empty.
        """
        # Explicit checks rather than ``assert``: assertions are removed when
        # Python runs with ``-O``, which would silently skip the validation.
        if (self.language_code is None
                or self.language_name is None
                or not self.mapping):
            raise ImproperlyConfigured(
                "You should define ``language_code``, ``language_name`` and "
                "``mapping`` properties in your subclassed "
                "``TranslitLanguagePack`` class."
            )

        super(TranslitLanguagePack, self).__init__()

        # Code point -> code point table built from the two mapping strings,
        # in the form ``str.translate`` expects.
        self.translation_table = {
            ord(source): ord(target)
            for source, target in zip(*self.mapping)
        }

        # Same table with direction flipped, for reversed transliteration.
        self.reversed_translation_table = {
            target: source
            for source, target in self.translation_table.items()
        }

        # If any pre-processor rules are defined, reverse them for later use.
        if self.pre_processor_mapping:
            self.reversed_pre_processor_mapping = {
                target: source
                for source, target in self.pre_processor_mapping.items()
            }
            self.pre_processor_mapping_keys = \
                self.pre_processor_mapping.keys()
            self.reversed_pre_processor_mapping_keys = \
                self.pre_processor_mapping.values()
        else:
            self.reversed_pre_processor_mapping = None

        if self.reversed_specific_mapping:
            self.reversed_specific_translation_table = {
                ord(source): ord(target)
                for source, target in zip(*self.reversed_specific_mapping)
            }

        if self.reversed_specific_pre_processor_mapping:
            self.reversed_specific_pre_processor_mapping_keys = \
                self.reversed_specific_pre_processor_mapping.keys()

        # Regex patterns used by ``make_strict``. The '[^]' placeholders are
        # never handed to ``re``: ``make_strict`` only uses a pattern when
        # the corresponding character set is non-empty.
        self._characters = '[^]'
        if self.characters is not None:
            self._characters = self._negated_character_class(
                self.characters
            )

        self._reversed_characters = '[^]'
        if self.reversed_characters is not None:
            # Built from ``reversed_characters`` (not ``characters``), so the
            # reversed strict mode keeps the right alphabet.
            self._reversed_characters = self._negated_character_class(
                self.reversed_characters
            )

    @staticmethod
    def _negated_character_class(characters):
        """Return a regex matching any character NOT in ``characters``.

        :param characters: String (or iterable of one-character strings).
        :return str: Pattern of the form ``[^...]`` with every character
            escaped via ``re.escape`` so that ``]``, ``^``, ``-`` and ``\\``
            are taken literally.
        """
        # Plain concatenation keeps the result type equal to the input type
        # (relevant on Python 2, where ``str.format`` would try to encode).
        return '[^' + re.escape(''.join(characters)) + ']'

    def translit(self, value, reversed=False, strict=False,
                 fail_silently=True):
        """Transliterate the given value according to the rules.

        Rules are set in the transliteration pack.

        :param str value: Text to transliterate.
        :param bool reversed: If True, transliterate from the target script
            back to the source script.
        :param bool strict: If True, strip characters that are not listed in
            ``characters`` (or ``reversed_characters`` when reversed).
        :param bool fail_silently: If True, errors raised while stripping
            are swallowed and the unstripped result is returned.
        :return str: Transliterated text.
        """
        if not six.PY3:
            value = six.text_type(value)

        if reversed:
            # Handling reversed specific translations (one side only).
            if self.reversed_specific_mapping:
                value = value.translate(
                    self.reversed_specific_translation_table
                )

            if self.reversed_specific_pre_processor_mapping:
                for rule in self.reversed_specific_pre_processor_mapping_keys:
                    value = value.replace(
                        rule,
                        self.reversed_specific_pre_processor_mapping[rule]
                    )

            # Handling pre-processor mappings.
            if self.reversed_pre_processor_mapping:
                for rule in self.reversed_pre_processor_mapping_keys:
                    value = value.replace(
                        rule,
                        self.reversed_pre_processor_mapping[rule]
                    )

            res = value.translate(self.reversed_translation_table)
        else:
            if self.pre_processor_mapping:
                for rule in self.pre_processor_mapping_keys:
                    value = value.replace(
                        rule,
                        self.pre_processor_mapping[rule]
                    )

            res = value.translate(self.translation_table)

        # Applied to both directions; an early return in the reversed branch
        # would make ``strict`` a silent no-op there.
        if strict:
            res = self._make_strict(value=res,
                                    reversed=reversed,
                                    fail_silently=fail_silently)

        return res

    def _make_strict(self, value, reversed=False, fail_silently=True):
        """Strip out unnecessary characters, optionally ignoring errors.

        :param string value:
        :param bool reversed:
        :param bool fail_silently: If True, return ``value`` unchanged when
            ``make_strict`` raises; otherwise re-raise the error.
        :return string:
        """
        try:
            return self.make_strict(value, reversed)
        except Exception:
            # Deliberate best-effort: subclasses may override
            # ``make_strict`` with code that can fail.
            if fail_silently:
                return value
            raise

    def make_strict(self, value, reversed=False):
        """Strip out unnecessary characters from the string.

        :param string value:
        :param bool reversed: If True, filter by ``reversed_characters``,
            otherwise by ``characters``.
        :return string: ``value`` without the disallowed characters, or
            ``value`` unchanged when the relevant character set is empty.
        """
        if reversed:
            allowed, pattern = (self.reversed_characters,
                                self._reversed_characters)
        else:
            allowed, pattern = self.characters, self._characters

        if allowed:
            return re.sub(pattern, '', value)

        # Filtering based on ``character_ranges`` is not implemented.
        return value

    @classmethod
    def contains(cls, character):
        """Check if given character belongs to the language pack.

        :param str character: Single character to test.
        :return bool: True if the NFC-normalised code point lies within one
            of ``character_ranges``; False otherwise (including when no
            ranges are defined).
        """
        if not cls.character_ranges:
            return False

        char_num = ord(unicodedata.normalize('NFC', character))
        return any(
            character_range[0] <= char_num <= character_range[1]
            for character_range in cls.character_ranges
        )

    @classmethod
    def suggest(cls, value, reversed=False, limit=None):
        """Suggest possible variants (some sort of auto-complete).

        Not implemented yet; currently returns ``None``.

        :param str value:
        :param bool reversed:
        :param int limit: Limit number of suggested variants.
        :return list:
        """
        # TODO

    @classmethod
    def detect(cls, text, num_words=None):
        """Detect the language.

        Heavy language detection, which is activated for languages that are
        harder detect (like Russian Cyrillic and Ukrainian Cyrillic).

        Not implemented yet; currently returns ``None``.

        :param unicode text: Input string.
        :param int num_words: Number of words to base decision on.
        :return bool: True if detected and False otherwise.
        """
        # TODO
class TranslitRegistry(object):
    """Language pack registry.

    Maps language codes to ``TranslitLanguagePack`` subclasses. Items
    registered with ``force=True`` cannot be replaced or un-registered.
    """

    def __init__(self):
        # language_code -> language pack class
        self._registry = {}
        # Language codes that were registered with ``force=True``.
        self._forced = []

    @property
    def registry(self):
        """Registry (dict of language code -> language pack class)."""
        return self._registry

    def _check_item_type(self, cls):
        """Ensure ``cls`` is a ``TranslitLanguagePack`` subclass.

        :raises InvalidRegistryItemType: If ``cls`` is not a class or does
            not derive from ``TranslitLanguagePack``.
        """
        try:
            is_language_pack = issubclass(cls, TranslitLanguagePack)
        except TypeError:
            # ``issubclass`` rejects non-class arguments (e.g. instances).
            is_language_pack = False

        if not is_language_pack:
            # Interpolate here: exception constructors do not apply
            # %-formatting to their arguments.
            raise InvalidRegistryItemType(
                "Invalid item type `%s` for registry `%s`" % (
                    cls,
                    self.__class__
                )
            )

    def register(self, cls, force=False):
        """Register the language pack in the registry.

        :param transliterate.base.TranslitLanguagePack cls: Subclass of
            ``transliterate.base.TranslitLanguagePack``.
        :param bool force: If set to True, item stays forced. It's not
            possible to un-register a forced item.
        :return bool: True if registered and False otherwise.
        :raises InvalidRegistryItemType: If ``cls`` is not a subclass of
            ``TranslitLanguagePack``.
        """
        self._check_item_type(cls)
        language_code = cls.language_code

        if force:
            # A forced item may replace a non-forced one, but never another
            # forced item.
            if language_code in self._forced:
                return False
            self._registry[language_code] = cls
            self._forced.append(language_code)
            return True

        # Non-forced registration never overrides an existing entry.
        if language_code in self._registry:
            return False
        self._registry[language_code] = cls
        return True

    def unregister(self, cls):
        """Un-register an item from the registry.

        :param transliterate.base.TranslitLanguagePack cls: Subclass of
            ``transliterate.base.TranslitLanguagePack``.
        :return bool: True if unregistered and False otherwise.
        :raises InvalidRegistryItemType: If ``cls`` is not a subclass of
            ``TranslitLanguagePack``.
        """
        self._check_item_type(cls)
        language_code = cls.language_code

        # Only non-forced items are allowed to be unregistered.
        if language_code not in self._registry \
                or language_code in self._forced:
            return False

        self._registry.pop(language_code)
        return True

    def get(self, language_code, default=None):
        """Get the given language pack from the registry.

        :param str language_code:
        :param default: Value returned when ``language_code`` is not
            registered.
        :return transliterate.base.TranslitLanguagePack: Subclass of
            ``transliterate.base.TranslitLanguagePack`` (or ``default``).
        """
        return self._registry.get(language_code, default)
# Module-level registry instance (listed in ``__all__``).
# Register languages by calling registry.register()
registry = TranslitRegistry()