Skip to content

Commit e97efcb

Browse files
Ron PetrushaBillWagner
authored andcommitted
Cyrillic to Latin transliteration sample (#1628)
1 parent 1e7d0a8 commit e97efcb

File tree

8 files changed

+575
-0
lines changed

8 files changed

+575
-0
lines changed
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
using System;
2+
using System.IO;
3+
using System.Text;
4+
5+
/// <summary>
/// Command-line entry point of the Cyrillic-to-Latin transliteration utility.
/// Reads a source text file, replaces modern Cyrillic characters with Latin
/// equivalents by encoding to ASCII with a custom encoder fallback, and writes
/// the result to a UTF-8 destination file.
/// </summary>
class ConsoleModule
{
    // Number of characters read from the source file on each pass.
    private const int BufferSize = 100;

    /// <summary>
    /// Validates the command line, asks before overwriting an existing
    /// destination file, and runs the transliteration.
    /// </summary>
    static void Main()
    {
        // Element 0 is the program itself, so two user arguments give a length of 3.
        string[] args = Environment.GetCommandLineArgs();

        if (args.Length != 3 || String.IsNullOrWhiteSpace(args[1]) || String.IsNullOrWhiteSpace(args[2]))
        {
            Console.WriteLine("There must be a source and a destination file.");
            ShowSyntax();
            return;
        }

        string source = args[1];
        string destination = args[2];

        if (!File.Exists(source))
        {
            Console.WriteLine("The source file does not exist.");
            return;
        }

        try
        {
            using (var sr = new StreamReader(source))
            {
                // Leave the existing destination untouched unless the user explicitly agrees.
                if (File.Exists(destination) && !ConfirmOverwrite(destination))
                {
                    return;
                }

                using (var sw = new StreamWriter(destination, false, Encoding.UTF8))
                {
                    Transliterate(sr, sw);
                }
            }
        }
        catch (DirectoryNotFoundException e)
        {
            Console.WriteLine($"Invalid directory: {e.Message}");
        }
        catch (IOException e)
        {
            Console.WriteLine($"I/O exception: {e.Message}");
        }
        catch (UnauthorizedAccessException e)
        {
            // Thrown, for example, when the destination is read-only; not an IOException.
            Console.WriteLine($"Access denied: {e.Message}");
        }
    }

    /// <summary>
    /// Asks the user whether an existing destination file may be overwritten.
    /// Keys other than Y or N are ignored and the method keeps waiting.
    /// </summary>
    /// <param name="destination">Path of the existing destination file, shown in the prompt.</param>
    /// <returns><c>true</c> if the user pressed Y; <c>false</c> if the user pressed N.</returns>
    private static bool ConfirmOverwrite(string destination)
    {
        Console.Write($"The destination file {Environment.NewLine} '{destination}'{Environment.NewLine}exists. Overwrite it? (Y/N) ");

        while (true)
        {
            ConsoleKeyInfo keyPressed = Console.ReadKey(true);
            char answer = Char.ToUpper(keyPressed.KeyChar);
            if (answer == 'Y' || answer == 'N')
            {
                // Echo the accepted key, because ReadKey(true) suppresses it.
                Console.WriteLine(keyPressed.KeyChar);
                return answer == 'Y';
            }
        }
    }

    /// <summary>
    /// Copies text from <paramref name="reader"/> to <paramref name="writer"/>,
    /// passing it through an ASCII encoder whose fallback is
    /// <c>CyrillicToLatinFallback</c>.
    /// </summary>
    /// <param name="reader">Source of the text to transliterate.</param>
    /// <param name="writer">Receives the transliterated text.</param>
    private static void Transliterate(TextReader reader, TextWriter writer)
    {
        Encoding encoding = Encoding.GetEncoding("us-ascii", new CyrillicToLatinFallback(), new DecoderExceptionFallback());

        // A stateful encoder is used so that state can carry over between blocks.
        Encoder encoder = encoding.GetEncoder();

        char[] buffer = new char[BufferSize];
        int charsRead;

        do
        {
            charsRead = reader.ReadBlock(buffer, 0, buffer.Length);

            // ReadBlock returns fewer characters than requested only at the end of
            // the input, so that is the block on which the encoder must be flushed.
            bool lastBlock = charsRead < buffer.Length;

            int byteCount = encoder.GetByteCount(buffer, 0, charsRead, lastBlock);
            byte[] bytes = new byte[byteCount];
            int bytesWritten = encoder.GetBytes(buffer, 0, charsRead, bytes, 0, lastBlock);

            // Turn the ASCII bytes back into text; the writer re-encodes it as UTF-8.
            writer.Write(encoding.GetString(bytes, 0, bytesWritten));
        } while (charsRead == buffer.Length);
    }

    /// <summary>
    /// Writes the command-line syntax of the utility to the console.
    /// </summary>
    private static void ShowSyntax()
    {
        Console.WriteLine("\nSyntax: CyrillicToLatin <source> <destination>");
        Console.WriteLine(" where <source> = source filename");
        Console.WriteLine(" <destination> = destination filename\n");
    }
}
94+
95+
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp3.0</TargetFramework>
6+
</PropertyGroup>
7+
8+
<ItemGroup>
9+
<PackageReference Include="System.Text.Encoding.CodePages" Version="4.6.0" />
10+
</ItemGroup>
11+
12+
</Project>
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Text;
5+
6+
/// <summary>
/// Encoder fallback that supplies Latin replacement strings for the modern
/// Cyrillic characters U+0410 through U+044F when the target encoding cannot
/// represent them.
/// </summary>
class CyrillicToLatinFallback : EncoderFallback
{
    // Maps each Cyrillic character to its Latin replacement string.
    private readonly Dictionary<Char, String> table;

    /// <summary>
    /// Initializes the fallback and fills the character mapping table.
    /// </summary>
    public CyrillicToLatinFallback()
    {
        table = new Dictionary<Char, String>
        {
            // Uppercase modern Cyrillic characters.
            { '\u0410', "A" },
            { '\u0411', "B" },
            { '\u0412', "V" },
            { '\u0413', "G" },
            { '\u0414', "D" },
            { '\u0415', "E" },
            { '\u0416', "Zh" },
            { '\u0417', "Z" },
            { '\u0418', "I" },
            { '\u0419', "I" },
            { '\u041A', "K" },
            { '\u041B', "L" },
            { '\u041C', "M" },
            { '\u041D', "N" },
            { '\u041E', "O" },
            { '\u041F', "P" },
            { '\u0420', "R" },
            { '\u0421', "S" },
            { '\u0422', "T" },
            { '\u0423', "U" },
            { '\u0424', "F" },
            { '\u0425', "Kh" },
            { '\u0426', "Ts" },
            { '\u0427', "Ch" },
            { '\u0428', "Sh" },
            { '\u0429', "Shch" },
            { '\u042A', "'" },   // Hard sign
            { '\u042B', "Y" },   // Same letter as U+044B below; the two cases must agree.
            { '\u042C', "'" },   // Soft sign
            { '\u042D', "E" },
            { '\u042E', "Iu" },
            { '\u042F', "Ia" },

            // Lowercase modern Cyrillic characters.
            { '\u0430', "a" },
            { '\u0431', "b" },
            { '\u0432', "v" },
            { '\u0433', "g" },
            { '\u0434', "d" },
            { '\u0435', "e" },
            { '\u0436', "zh" },
            { '\u0437', "z" },
            { '\u0438', "i" },
            { '\u0439', "i" },
            { '\u043A', "k" },
            { '\u043B', "l" },
            { '\u043C', "m" },
            { '\u043D', "n" },
            { '\u043E', "o" },
            { '\u043F', "p" },
            { '\u0440', "r" },
            { '\u0441', "s" },
            { '\u0442', "t" },
            { '\u0443', "u" },
            { '\u0444', "f" },
            { '\u0445', "kh" },
            { '\u0446', "ts" },
            { '\u0447', "ch" },
            { '\u0448', "sh" },
            { '\u0449', "shch" },
            { '\u044A', "'" },   // Hard sign
            { '\u044B', "y" },
            { '\u044C', "'" },   // Soft sign
            { '\u044D', "e" },
            { '\u044E', "iu" },
            { '\u044F', "ia" },
        };
    }

    /// <summary>
    /// Creates a fallback buffer that looks up replacements in this instance's
    /// mapping table.
    /// </summary>
    /// <returns>A new <see cref="CyrillicToLatinFallbackBuffer"/>.</returns>
    public override EncoderFallbackBuffer CreateFallbackBuffer()
    {
        return new CyrillicToLatinFallbackBuffer(table);
    }

    /// <summary>
    /// Gets the length of the longest replacement string in the table.
    /// </summary>
    public override int MaxCharCount
    {
        get { return 4; }   // Maximum is "Shch" and "shch"
    }
}
92+
93+
/// <summary>
/// Fallback buffer that hands the encoder, one character at a time, the
/// replacement string for a character it could not encode.
/// </summary>
public class CyrillicToLatinFallbackBuffer : EncoderFallbackBuffer
{
    // Replacement strings keyed by the character they replace.
    private readonly Dictionary<Char, String> table;

    // Replacement string for the most recent successful fallback; empty when there is none.
    private string buffer;

    // Index in buffer of the next character GetNextChar will return.
    private int position;

    /// <summary>
    /// Initializes a buffer that looks up replacements in <paramref name="table"/>.
    /// </summary>
    /// <param name="table">Mapping from unencodable characters to replacement strings.</param>
    internal CyrillicToLatinFallbackBuffer(Dictionary<Char, String> table)
    {
        this.table = table;
        this.buffer = String.Empty;
        this.position = 0;
    }

    /// <summary>
    /// Handles a surrogate pair that could not be encoded. No replacement is
    /// ever supplied for surrogate pairs.
    /// </summary>
    /// <param name="charUnknownHigh">High surrogate of the pair.</param>
    /// <param name="charUnknownLow">Low surrogate of the pair.</param>
    /// <param name="index">Position of the pair in the input.</param>
    /// <returns>Always <c>false</c>.</returns>
    public override bool Fallback(char charUnknownHigh, char charUnknownLow, int index)
    {
        // The mapping table only contains single UTF-16 code units,
        // so there is nothing to offer for a surrogate pair.
        buffer = String.Empty;
        position = 0;
        return false;
    }

    /// <summary>
    /// Prepares the replacement string for a single character that could not
    /// be encoded.
    /// </summary>
    /// <param name="charUnknown">The character that could not be encoded.</param>
    /// <param name="index">Position of the character in the input.</param>
    /// <returns>
    /// <c>true</c> if the table holds a replacement for
    /// <paramref name="charUnknown"/>; otherwise <c>false</c>.
    /// </returns>
    public override bool Fallback(char charUnknown, int index)
    {
        string replacement;
        if (table.TryGetValue(charUnknown, out replacement))
        {
            buffer = replacement;
            position = 0;
            return true;
        }

        // Discard any stale replacement so Remaining and GetNextChar report "nothing".
        buffer = String.Empty;
        position = 0;
        return false;
    }

    /// <summary>
    /// Returns the next character of the current replacement string.
    /// </summary>
    /// <returns>
    /// The next replacement character, or U+0000 when the replacement string
    /// has been fully returned.
    /// </returns>
    public override char GetNextChar()
    {
        if (position < buffer.Length)
        {
            return buffer[position++];
        }
        return '\u0000';
    }

    /// <summary>
    /// Steps back one character so that the next call to
    /// <see cref="GetNextChar"/> returns the previously returned character again.
    /// </summary>
    /// <returns>
    /// <c>true</c> if the position was moved back; <c>false</c> if no character
    /// of the current replacement has been returned yet.
    /// </returns>
    public override bool MovePrevious()
    {
        if (position > 0)
        {
            position--;
            return true;
        }
        return false;
    }

    /// <summary>
    /// Gets the number of replacement characters that have not yet been
    /// returned by <see cref="GetNextChar"/>.
    /// </summary>
    public override int Remaining
    {
        get { return buffer.Length - position; }
    }
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
---
2+
languages:
3+
- csharp
4+
products:
5+
- dotnet-core
6+
page_type: sample
7+
name: ".NET Core Cyrillic to Latin Transliteration Utility (C#)"
8+
urlFragment: "cyrillic-transliteration-cs"
9+
description: "A .NET Core console application written in C# that uses the encoding fallback functionality to transliterate Cyrillic to Latin characters."
10+
---
11+
cyrillic-to-latin is a command-line utility that transliterates modern Cyrillic characters
12+
to their Latin equivalents. It uses a modified Library of Congress system for
13+
transliteration. Its syntax is:
14+
15+
```
16+
CyrillicToLatin <sourceFile> <destinationFile>
17+
```
18+
19+
where *sourceFile* is the path and filename of a text file that contains modern Cyrillic
20+
characters, and *destinationFile* is the name of the text file that will store the
21+
original text with its Cyrillic characters replaced by transliterated Latin characters.
22+
If a file path is included in *destinationFile* and any portion of that path does
23+
not exist, the utility terminates.
24+
25+
The specific mappings of upper- and lower-case Cyrillic characters
26+
to Latin characters are listed in the constructor of the `CyrillicToLatinFallback`
27+
class, where the entries of a character mapping table named `table` are defined.
28+
29+
The utility illustrates the extensibility of character encoding in the .NET
30+
Framework. An encoding system consists of an encoder and a decoder. The encoder is
31+
responsible for translating a sequence of characters into a sequence of bytes. The
32+
decoder is responsible for translating the sequence of bytes into a sequence of
33+
characters. .NET Core supports ASCII as well as the standard Unicode
34+
encodings and allows the [Encoding](https://docs.microsoft.com/dotnet/api/system.text.encoding) class to be overridden to support otherwise
35+
unsupported encodings. It also allows an encoder and a decoder's handling of
36+
unmapped characters and bytes to be customized. Broadly, an encoder or a decoder can handle data that it cannot map by throwing an exception or by using some alternate mapping. For more information, see [Character Encoding in .NET Framework](https://docs.microsoft.com/dotnet/standard/base-types/character-encoding).
37+
38+
The transliteration utility works by instantiating an [Encoding](https://docs.microsoft.com/dotnet/api/system.text.encoding) object that represents ASCII encoding, which supports only the characters in the range from U+0000 to U+007F. Because modern Cyrillic characters occupy the range from U+0410 to U+044F, they cannot be represented in ASCII encoding. When the utility instantiates its Encoding object by calling `Encoding.GetEncoding`, it passes that method an instance of a class named `CyrillicToLatinFallback` that is derived from [EncoderFallback](https://docs.microsoft.com/dotnet/api/system.text.encoderfallback). This class maintains an internal table that maps modern Cyrillic characters to one or more Latin characters.
39+
40+
When the encoder encounters a character that it cannot encode, it calls the fallback
41+
object's [CreateFallbackBuffer](https://docs.microsoft.com/dotnet/api/system.text.encoderfallback.createfallbackbuffer) method. This method instantiates a `CyrillicToLatinFallbackBuffer` object (a subclass of the [EncoderFallbackBuffer](https://docs.microsoft.com/dotnet/api/system.text.encoderfallbackbuffer) class) and passes its constructor
42+
the modern Cyrillic character mapping table. It then passes the `CyrillicToLatinFallbackBuffer`
43+
object's [Fallback](https://docs.microsoft.com/dotnet/api/system.text.encoderfallbackbuffer.fallback) method each character that it is unable to encode, and if a mapping is available, the method can provide a suitable replacement.
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
Imports System.IO
2+
Imports System.Text
3+
4+
''' <summary>
''' Command-line entry point of the Cyrillic-to-Latin transliteration utility.
''' Reads a source text file, replaces modern Cyrillic characters with Latin
''' equivalents by encoding to ASCII with a custom encoder fallback, and writes
''' the result to a UTF-8 destination file.
''' </summary>
Module ConsoleModule
    ' Number of characters read from the source file on each pass.
    Private Const BufferSize As Integer = 100

    ''' <summary>
    ''' Validates the command line, asks before overwriting an existing
    ''' destination file, and runs the transliteration.
    ''' </summary>
    Sub Main()
        ' Element 0 is the program itself, so two user arguments give a length of 3.
        Dim args() As String = Environment.GetCommandLineArgs()

        If args.Length <> 3 OrElse String.IsNullOrWhiteSpace(args(1)) OrElse String.IsNullOrWhiteSpace(args(2)) Then
            Console.WriteLine("There must be a source and a destination file.")
            ShowSyntax()
            Exit Sub
        End If

        Dim source As String = args(1)
        Dim destination As String = args(2)

        If Not File.Exists(source) Then
            Console.WriteLine($"The source file {vbCrLf} '{source}'{vbCrLf}cannot be found.")
            ShowSyntax()
            Exit Sub
        End If

        Try
            Using sr As New StreamReader(source)
                ' Leave the existing destination untouched unless the user explicitly agrees.
                If File.Exists(destination) AndAlso Not ConfirmOverwrite(destination) Then
                    Exit Sub
                End If

                Using sw As New StreamWriter(destination, False, Encoding.UTF8)
                    Transliterate(sr, sw)
                End Using
            End Using
        Catch e As DirectoryNotFoundException
            Console.WriteLine($"Invalid directory: {e.Message}")
        Catch e As IOException
            Console.WriteLine($"I/O exception: {e.Message}")
        Catch e As UnauthorizedAccessException
            ' Thrown, for example, when the destination is read-only; not an IOException.
            Console.WriteLine($"Access denied: {e.Message}")
        End Try
    End Sub

    ''' <summary>
    ''' Asks the user whether an existing destination file may be overwritten.
    ''' Keys other than Y or N are ignored and the function keeps waiting.
    ''' </summary>
    ''' <param name="destination">Path of the existing destination file, shown in the prompt.</param>
    ''' <returns>True if the user pressed Y; False if the user pressed N.</returns>
    Private Function ConfirmOverwrite(destination As String) As Boolean
        Console.Write($"The destination file {vbCrLf} '{destination}'{vbCrLf}exists. Overwrite it? (Y/N) ")

        Do
            Dim keyPressed As ConsoleKeyInfo = Console.ReadKey(True)
            Dim answer As Char = Char.ToUpper(keyPressed.KeyChar)
            If answer = "Y"c OrElse answer = "N"c Then
                ' Echo the accepted key, because ReadKey(True) suppresses it.
                Console.WriteLine(keyPressed.KeyChar)
                Return answer = "Y"c
            End If
        Loop
    End Function

    ''' <summary>
    ''' Copies text from <paramref name="reader"/> to <paramref name="writer"/>,
    ''' passing it through an ASCII encoder whose fallback is CyrillicToLatinFallback.
    ''' </summary>
    ''' <param name="reader">Source of the text to transliterate.</param>
    ''' <param name="writer">Receives the transliterated text.</param>
    Private Sub Transliterate(reader As TextReader, writer As TextWriter)
        Dim asciiEncoding As Encoding = Encoding.GetEncoding("us-ascii", New CyrillicToLatinFallback(), New DecoderExceptionFallback())

        ' A stateful encoder is used so that state can carry over between blocks.
        Dim encoder As Encoder = asciiEncoding.GetEncoder()

        Dim buffer(BufferSize - 1) As Char
        Dim charsRead As Integer

        Do
            charsRead = reader.ReadBlock(buffer, 0, buffer.Length)

            ' ReadBlock returns fewer characters than requested only at the end of
            ' the input, so that is the block on which the encoder must be flushed.
            Dim lastBlock As Boolean = charsRead < buffer.Length

            Dim byteCount As Integer = encoder.GetByteCount(buffer, 0, charsRead, lastBlock)
            ' VB array declarations take the upper bound, hence byteCount - 1.
            Dim bytes(byteCount - 1) As Byte
            Dim bytesWritten As Integer = encoder.GetBytes(buffer, 0, charsRead, bytes, 0, lastBlock)

            ' Turn the ASCII bytes back into text; the writer re-encodes it as UTF-8.
            ' Producing a String avoids an oversized Char array whose extra element
            ' would be written to the file as a NUL character.
            writer.Write(asciiEncoding.GetString(bytes, 0, bytesWritten))
        Loop While charsRead = buffer.Length
    End Sub

    ''' <summary>
    ''' Writes the command-line syntax of the utility to the console.
    ''' </summary>
    Private Sub ShowSyntax()
        Console.WriteLine()
        Console.WriteLine("Syntax: CyrillicToLatin <source> <destination>")
        Console.WriteLine(" where <source> = source filename")
        Console.WriteLine(" <destination> = destination filename")
        Console.WriteLine()
    End Sub
End Module
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<RootNamespace>vb</RootNamespace>
6+
<TargetFramework>netcoreapp3.0</TargetFramework>
7+
</PropertyGroup>
8+
9+
</Project>

0 commit comments

Comments
 (0)