Skip to content

Commit b3d952f

Browse files
authored
Merge pull request #99 from hmlendea/tatar
Created a transliterator for `Tatar`
2 parents a00c4ea + ce4ebe0 commit b3d952f

File tree

3 files changed

+181
-0
lines changed

3 files changed

+181
-0
lines changed

TransliterationAPI.UnitTests/Service/Transliterators/CyrillicTransliteratorTests.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,26 @@ public void GivenATextInTajikCyrillicScript_WhenTransliteratingIntoLatin_ThenThe
509509
}
510510
}
511511

512+
// Parameterised check of Tatar Cyrillic -> Latin transliteration.
// Each TestCase pairs a Cyrillic input with the exact Latin string that
// transliterator.Transliterate is expected to return for it; the cases range
// from single words to one long multi-sentence sample.
// Every case is asserted twice, once for Language.Tatar and once for
// Language.TatarCyrillic, so both entries must yield the same output.
// NOTE(review): `transliterator` is a fixture member declared outside this
// excerpt - presumably a CyrillicTransliterator instance; confirm in the class.
[Test]
513+
[TestCase("Азнакай", "Aznaqay")]
514+
[TestCase("Алабуга", "Alabuğa")]
515+
[TestCase("Әлмәт", "Älmät")]
516+
[TestCase("Баулы", "Baulı")]
517+
[TestCase("Бөгелмә", "Bögelmä")]
518+
[TestCase("Казань", "Qazan")]
519+
[TestCase("Яр Чаллы", "Yar Çallı")]
520+
[TestCase("Яшел Үзән", "Yäşel Üzän")]
521+
[TestCase("Барлык кешеләр дә азат һәм үз абруйлары һәм хокуклары ягыннан тиң булып туалар. Аларга акыл һәм вөҗдан бирелгән һәм бер-берсенә карата туганнарча мөнасәбәттә булырга тиешләр.", "Barlıq keşelär dä azat häm üz abruyları häm xoquqları yağınnan tiñ bulıp tualar. Alarğa aqıl häm wöcdan birelgän häm ber-bersenä qarata tuğannarça mönasäbättä bulırğa tiyeşlär.")]
522+
public void GivenATextInTatarCyrillicScript_WhenTransliteratingIntoLatin_ThenTheCorrectTextIsReturned(
523+
string tatarText,
524+
string expectedTransliteratedText)
525+
{
526+
foreach (Language language in new List<Language> { Language.Tatar, Language.TatarCyrillic })
527+
{
528+
Assert.That(transliterator.Transliterate(tatarText, language), Is.EqualTo(expectedTransliteratedText));
529+
}
530+
}
531+
512532
[Test]
513533
[TestCase("Алчевськ", "Alchevsk")]
514534
[TestCase("Бердянськ", "Berdiansk")]

TransliterationAPI/Service/Entities/Language.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ public sealed class Language : IEquatable<Language>
5454
public static Language Tajik => new("tg", nameof(Tajik), nameof(CyrillicTransliterator));
5555
public static Language TajikCyrillic => new("tg-cyrl", nameof(Tajik), nameof(CyrillicTransliterator));
5656
public static Language Tamil => new("ta", nameof(Tamil), nameof(UshuaiaTransliterator));
57+
// Tatar entry: built from "tt", the name "Tatar" and the CyrillicTransliterator type name.
// NOTE(review): "tt" is presumably the language code - the Language constructor is not shown here; confirm.
// Expression-bodied with `new`, so every access constructs a fresh Language instance.
public static Language Tatar => new("tt", nameof(Tatar), nameof(CyrillicTransliterator));
58+
// Tatar entry with the "tt-cyrl" identifier; it passes the same name (nameof(Tatar)) and the same
// transliterator name as the plain Tatar entry, differing only in the first argument.
// Expression-bodied with `new`, so every access constructs a fresh Language instance.
public static Language TatarCyrillic => new("tt-cyrl", nameof(Tatar), nameof(CyrillicTransliterator));
5759
public static Language Telugu => new("te", nameof(Telugu), nameof(UshuaiaTransliterator));
5860
public static Language Udmurt => new("udm", nameof(Udmurt), nameof(TranslitterationDotComTransliterator));
5961
public static Language Ukrainian => new("uk", nameof(Ukrainian), nameof(CyrillicTransliterator));

TransliterationAPI/Service/Transliterators/CyrillicTransliterator.cs

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ public class CyrillicTransliterator : ITransliterator
2020
readonly Dictionary<string, string> russianTransliterationTable;
2121
readonly Dictionary<string, string> macedonianTransliterationTable;
2222
readonly Dictionary<string, string> serbianTransliterationTable;
23+
readonly Dictionary<string, string> tatarTransliterationTable;
2324
readonly Dictionary<string, string> tajikTransliterationTable;
2425
readonly Dictionary<string, string> ukrainianTransliterationTable;
2526

@@ -747,6 +748,154 @@ public CyrillicTransliterator()
747748
{ "я", "ja" },
748749
};
749750

751+
tatarTransliterationTable = new()
752+
{
753+
// Front vowels: [ÄäEeİiÖöÜüӘәЕеИиӨөҮү]
754+
// Back vowels: [AaIıOoUuАаЫыОоУу]
755+
756+
// Uppercase vowel harmony
757+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү])Е", "$1Ye" },
758+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)Г", "$1G" },
759+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)К", "$1K" },
760+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)Ю", "$1Yü" },
761+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)Я", "$1Yä" },
762+
{ "([AaIıOoUuАаЫыОоУу])Е", "$1Yı" },
763+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)Г", "$1Ğ" },
764+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)К", "$1Q" },
765+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)Ю", "$1Yu" },
766+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)Я", "$1Ya" },
767+
{ "Г([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "G$1" },
768+
{ "Г([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "Ğ$1" },
769+
{ "К([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "K$1" },
770+
{ "К([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "Q$1" },
771+
{ "Ю([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "Yü$1" },
772+
{ "Ю([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "Yu$1" },
773+
{ "Я([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "Yä$1" },
774+
{ "Я([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "Ya$1" },
775+
776+
// Lowercase vowel harmony
777+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү])е", "$1ye" },
778+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)г", "$1g" },
779+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)к", "$1k" },
780+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)ю", "$1yü" },
781+
{ "([ÄäEeİiÖöÜüӘәЕеИиӨөҮү][^ '\"-]*)я", "$1yä" },
782+
{ "([AaIıOoUuАаЫыОоУу])е", "$1yı" },
783+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)г", "$1ğ" },
784+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)к", "$1q" },
785+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)ю", "$1yu" },
786+
{ "([AaIıOoUuАаЫыОоУу][^ '\"-]*)я", "$1ya" },
787+
{ "г([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "g$1" },
788+
{ "г([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "ğ$1" },
789+
{ "к([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "k$1" },
790+
{ "к([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "q$1" },
791+
{ "ю([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "yü$1" },
792+
{ "ю([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "yu$1" },
793+
{ "я([^ '\"-]*[ÄäEeİiÖöÜüӘәЕеИиӨөҮү])", "yä$1" },
794+
{ "я([^ '\"-]*[AaIıOoUuАаЫыОоУу])", "ya$1" },
795+
796+
//// Uppercase exceptions
797+
//{ "Аъ", "Ä" },
798+
//{ "Ӓ", "Ä" },
799+
//{ "Оъ", "Ö" },
800+
//{ "Ӧ", "Ö" },
801+
//{ "Уъ", "Ü" },
802+
//{ "Ӱ", "Ü" },
803+
//{ "Жъ", "C" },
804+
//{ "Нъ", "Ñ" },
805+
//{ "Ҥ", "Ñ" },
806+
//{ "Хъ", "H" },
807+
808+
// Uppercase letters
809+
{ "А", "A" },
810+
{ "Б", "B" },
811+
{ "В", "W" }, // Or V in Russian words
812+
{ "Д", "D" },
813+
{ "Е", "E" },
814+
{ "Ж", "J" },
815+
{ "З", "Z" },
816+
{ "И", "İ" },
817+
{ "Й", "Y" },
818+
{ "Л", "L" },
819+
{ "М", "M" },
820+
{ "Н", "N" },
821+
{ "О", "O" },
822+
{ "П", "P" },
823+
{ "Р", "R" },
824+
{ "С", "S" },
825+
{ "Т", "T" },
826+
{ "У", "U" },
827+
{ "Ф", "F" },
828+
{ "Х", "X" },
829+
{ "Ч", "Ç" },
830+
{ "Ш", "Ş" },
831+
{ "Ы", "I" },
832+
{ "Ә", "Ä" },
833+
{ "Ө", "Ö" },
834+
{ "Ү", "Ü" },
835+
{ "Җ", "C" },
836+
{ "Ң", "Ñ" },
837+
{ "Һ", "H" },
838+
839+
// Uppercase letters - Russian
840+
{ "Ё", "Yo" },
841+
{ "Ц", "Ts" },
842+
{ "Щ", "Şç" },
843+
844+
//// Lowercase exceptions
845+
//{ "аъ", "ä" },
846+
//{ "ӓ", "ä" },
847+
//{ "оъ", "ö" },
848+
//{ "ӧ", "ö" },
849+
//{ "уъ", "ü" },
850+
//{ "ӱ", "ü" },
851+
//{ "жъ", "c" },
852+
//{ "нъ", "ñ" },
853+
//{ "ҥ", "ñ" },
854+
//{ "хъ", "h" },
855+
856+
// Lowercase letters
857+
{ "а", "a" },
858+
{ "б", "b" },
859+
{ "в", "w" }, // Or V in Russian words
860+
{ "д", "d" },
861+
{ "е", "e" },
862+
{ "ж", "j" },
863+
{ "з", "z" },
864+
{ "и", "i" },
865+
{ "й", "y" },
866+
{ "л", "l" },
867+
{ "м", "m" },
868+
{ "н", "n" },
869+
{ "о", "o" },
870+
{ "п", "p" },
871+
{ "р", "r" },
872+
{ "с", "s" },
873+
{ "т", "t" },
874+
{ "у", "u" },
875+
{ "ф", "f" },
876+
{ "х", "x" },
877+
{ "ч", "ç" },
878+
{ "ш", "ş" },
879+
{ "ы", "ı" },
880+
{ "ә", "ä" },
881+
{ "ө", "ö" },
882+
{ "ү", "ü" },
883+
{ "җ", "c" },
884+
{ "ң", "ñ" },
885+
{ "һ", "h" },
886+
887+
// Lowercase letters - Russian
888+
{ "ё", "yo" },
889+
{ "ц", "ts" },
890+
{ "щ", "şç" },
891+
892+
// Special characters
893+
{ "Ъ", "" },
894+
{ "Ь", "" },
895+
{ "ъ", "" },
896+
{ "ь", "" },
897+
};
898+
750899
ukrainianTransliterationTable = new()
751900
{
752901
{ @"ія\b", "ia" },
@@ -812,6 +961,11 @@ public CyrillicTransliterator()
812961
serbianTransliterationTable.Add(characterTransliteration.Key, characterTransliteration.Value);
813962
}
814963

964+
if (!tatarTransliterationTable.ContainsKey(characterTransliteration.Key))
965+
{
966+
tatarTransliterationTable.Add(characterTransliteration.Key, characterTransliteration.Value);
967+
}
968+
815969
if (!tajikTransliterationTable.ContainsKey(characterTransliteration.Key))
816970
{
817971
tajikTransliterationTable.Add(characterTransliteration.Key, characterTransliteration.Value);
@@ -883,6 +1037,11 @@ public string Transliterate(string text, Language language)
8831037
{
8841038
transliterationTable = tajikTransliterationTable;
8851039
}
1040+
else if (language.Equals(Language.Tatar) ||
1041+
language.Equals(Language.TatarCyrillic))
1042+
{
1043+
transliterationTable = tatarTransliterationTable;
1044+
}
8861045
else if (language.Equals(Language.Ukrainian))
8871046
{
8881047
transliterationTable = ukrainianTransliterationTable;

0 commit comments

Comments
 (0)