Skip to content

Commit 475e9c5

Browse files
committed
Added support for kRSUnicode/Unicode_Radical_Stroke.
1 parent 76e5b5f commit 475e9c5

File tree

12 files changed

+208
-10
lines changed

12 files changed

+208
-10
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ This library includes a subset of the official [Unicode Character Database](http
123123
* ID_Continue
124124
* XID_Start
125125
* XID_Continue
126+
* Unicode_Radical_Stroke (This is actually kRSUnicode from the Unihan database)
126127
* Code point cross references extracted from NamesList.txt
127128

128129
NB: The UCD property ISO_Comment will never be included since this one is empty in all new Unicode versions.
@@ -131,6 +132,7 @@ NB: The UCD property ISO_Comment will never be included since this one is empty
131132
* kAccountingNumeric
132133
* kOtherNumeric
133134
* kPrimaryNumeric
135+
* kRSUnicode
134136
* kDefinition
135137
* kMandarin
136138
* kCantonese

UnicodeInformation.Builder/Program.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ internal class Program
3232
"Unihan_NumericValues.txt",
3333
"Unihan_Readings.txt",
3434
"Unihan_Variants.txt",
35+
"Unihan_IRGSources.txt",
3536
};
3637

3738
private static HttpMessageHandler httpMessageHandler;

UnicodeInformation.Builder/UnicodeDataProcessor.cs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ internal class UnicodeDataProcessor
1919
public const string UnihanReadingsFileName = "Unihan_Readings.txt";
2020
public const string UnihanVariantsFileName = "Unihan_Variants.txt";
2121
public const string UnihanNumericValuesFileName = "Unihan_NumericValues.txt";
22+
public const string UnihanIrgSourcesFileName = "Unihan_IRGSources.txt";
2223

2324
private static string ParseSimpleCaseMapping(string mapping)
2425
{
@@ -45,6 +46,7 @@ public static async Task<UnicodeInfoBuilder> BuildDataAsync(IDataSource ucdSourc
4546
await ProcessUnihanReadings(unihanSource, builder).ConfigureAwait(false);
4647
await ProcessUnihanVariants(unihanSource, builder).ConfigureAwait(false);
4748
await ProcessUnihanNumericValues(unihanSource, builder).ConfigureAwait(false);
49+
await ProcessUnihanIrgSources(unihanSource, builder).ConfigureAwait(false);
4850

4951
return builder;
5052
}
@@ -438,5 +440,48 @@ private static async Task ProcessUnihanNumericValues(IDataSource unihanDataSourc
438440
}
439441
}
440442
}
443+
444+
/// <summary>
/// Reads the Unihan IRG sources data file and stores every kRSUnicode (Unicode_Radical_Stroke) value
/// on the Unihan entry of the corresponding code point.
/// </summary>
/// <param name="unihanDataSource">The data source from which the Unihan data file is opened.</param>
/// <param name="builder">The builder whose Unihan entries receive the decoded radical/stroke counts.</param>
/// <exception cref="InvalidDataException">A kRSUnicode value does not contain the '.' separator.</exception>
/// <exception cref="FormatException">The radical or the stroke count of a value is not made of decimal digits only.</exception>
/// <exception cref="OverflowException">The radical or the stroke count of a value does not fit in a byte.</exception>
private static async Task ProcessUnihanIrgSources(IDataSource unihanDataSource, UnicodeInfoBuilder builder)
{
    using (var reader = new UnihanDataFileReader(await unihanDataSource.OpenDataFileAsync(UnihanIrgSourcesFileName).ConfigureAwait(false)))
    {
        while (reader.Read())
        {
            switch (reader.PropertyName)
            {
                case UnihanProperty.kRSUnicode:
                {
                    var entry = builder.GetUnihan(reader.CodePoint);

                    // A single property value may hold several space-separated radical/stroke counts.
                    foreach (var value in reader.PropertyValue.Split(' '))
                    {
                        entry.UnicodeRadicalStrokeCounts.Add(ParseRadicalStrokeCount(value));
                    }
                    break;
                }
                default:
                    // Ignore unhandled properties for now.
                    break;
            }
        }
    }
}

/// <summary>
/// Decodes one kRSUnicode item of the form <c>radical.strokes</c> or <c>radical'.strokes</c>,
/// where the apostrophe sets the "simplified" flag of the result.
/// </summary>
/// <param name="value">The text of a single radical/stroke count.</param>
/// <returns>The decoded radical/stroke count.</returns>
/// <exception cref="InvalidDataException">The value does not contain the '.' separator.</exception>
private static UnicodeRadicalStrokeCount ParseRadicalStrokeCount(string value)
{
    int separatorIndex = value.IndexOf('.');

    if (separatorIndex < 0)
    {
        throw new InvalidDataException("Failed to decode value for kRSUnicode / Unicode_Radical_Stroke.");
    }

    // The apostrophe is only meaningful when it sits right before the separator.
    // Anywhere else it stays part of the digits and is rejected by byte.Parse below,
    // instead of silently swallowing the character that follows it.
    bool isSimplified = separatorIndex > 0 && value[separatorIndex - 1] == '\'';
    int radicalLength = isSimplified ? separatorIndex - 1 : separatorIndex;

    // NumberStyles.None accepts decimal digits only; the invariant culture keeps the parsing
    // independent from the settings of the machine running the builder.
    byte radical = byte.Parse(value.Substring(0, radicalLength), NumberStyles.None, CultureInfo.InvariantCulture);
    byte strokeCount = byte.Parse(value.Substring(separatorIndex + 1), NumberStyles.None, CultureInfo.InvariantCulture);

    return new UnicodeRadicalStrokeCount(radical, strokeCount, isSimplified);
}
441486
}
442487
}

UnicodeInformation.Builder/UnihanCharacterDataBuilder.cs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@ internal sealed class UnihanCharacterDataBuilder
2323
private string simplifiedVariant;
2424
private string traditionalVariant;
2525

26+
private readonly List<UnicodeRadicalStrokeCount> unicodeRadicalStrokeCounts = new List<UnicodeRadicalStrokeCount>();
27+
2628
public int CodePoint { get { return codePoint; } }
2729
public UnihanNumericType NumericType { get { return numericType; } set { numericType = value; } }
2830
public long NumericValue { get { return numericValue; } set { numericValue = value; } }
@@ -36,6 +38,7 @@ internal sealed class UnihanCharacterDataBuilder
3638
public string VietnameseReading { get { return vietnameseReading; } set { vietnameseReading = value; } }
3739
public string SimplifiedVariant { get { return simplifiedVariant; } set { simplifiedVariant = value; } }
3840
public string TraditionalVariant { get { return traditionalVariant; } set { traditionalVariant = value; } }
41+
public ICollection<UnicodeRadicalStrokeCount> UnicodeRadicalStrokeCounts { get { return unicodeRadicalStrokeCounts; } }
3942

4043
internal UnihanCharacterDataBuilder(int codePoint)
4144
{
@@ -49,6 +52,7 @@ internal UnihanCharacterData ToCharacterData()
4952
codePoint,
5053
numericType,
5154
numericValue,
55+
unicodeRadicalStrokeCounts.ToArray(),
5256
definition,
5357
mandarinReading,
5458
cantoneseReading,
@@ -67,6 +71,14 @@ internal void WriteToFile(BinaryWriter writer)
6771
UnihanFields fields = default(UnihanFields);
6872

6973
fields |= (UnihanFields)NumericType;
74+
// For now, we have enough bits to encode the length of the array in the field specifier, so we'll do that.
75+
// (NB: A quick analysis of the files revealed that there is almost always exactly one Radical/Stroke count, and occasionally two, yet never more.)
76+
if (unicodeRadicalStrokeCounts.Count > 0)
77+
{
78+
if (unicodeRadicalStrokeCounts.Count == 1) fields |= UnihanFields.UnicodeRadicalStrokeCount;
79+
else if (unicodeRadicalStrokeCounts.Count == 2) fields |= UnihanFields.UnicodeRadicalStrokeCountTwice;
80+
else fields |= UnihanFields.UnicodeRadicalStrokeCount | UnihanFields.UnicodeRadicalStrokeCountTwice;
81+
}
7082
if (Definition != null) fields |= UnihanFields.Definition;
7183
if (MandarinReading != null) fields |= UnihanFields.MandarinReading;
7284
if (CantoneseReading != null) fields |= UnihanFields.CantoneseReading;
@@ -81,6 +93,17 @@ internal void WriteToFile(BinaryWriter writer)
8193
writer.Write((ushort)fields);
8294

8395
writer.WriteCodePoint(UnihanCharacterData.PackCodePoint(codePoint));
96+
if ((fields & UnihanFields.UnicodeRadicalStrokeCountMore) != 0)
97+
{
98+
if ((fields & (UnihanFields.UnicodeRadicalStrokeCountMore)) == UnihanFields.UnicodeRadicalStrokeCountMore)
99+
writer.Write(checked((byte)(unicodeRadicalStrokeCounts.Count - 3)));
100+
101+
foreach (var radicalStrokeCount in unicodeRadicalStrokeCounts)
102+
{
103+
writer.Write(radicalStrokeCount.Radical);
104+
writer.Write((byte)(radicalStrokeCount.StrokeCount | (radicalStrokeCount.IsSimplified ? 0x80 : 0)));
105+
}
106+
}
84107
if ((fields & UnihanFields.OtherNumeric) != 0) writer.Write(numericValue);
85108
if ((fields & UnihanFields.Definition) != 0) writer.Write(Definition);
86109
if ((fields & UnihanFields.MandarinReading) != 0) writer.Write(MandarinReading);

UnicodeInformation/UnicodeCharInfo.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ public struct UnicodeCharInfo
5252
public ContributoryProperties ContributoryProperties { get { return unicodeCharacterData?.ContributoryProperties ?? 0; } }
5353
public CoreProperties CoreProperties { get { return unicodeCharacterData?.CoreProperties ?? 0; } }
5454
public UnicodeCrossReferenceCollection CrossRerefences { get { return new UnicodeCrossReferenceCollection(unicodeCharacterData?.CrossRerefences); } }
55+
[ValueName("kRSUnicode"), ValueName("cjkRSUnicode"), ValueName("Unicode_Radical_Stroke"), ValueName("URS")]
56+
public UnicodeRadicalStrokeCountCollection UnicodeRadicalStrokeCounts { get { return new UnicodeRadicalStrokeCountCollection(unihanCharacterData?.UnicodeRadicalStrokeCounts); } }
5557

5658
[ValueName("kDefinition")]
5759
public string Definition { get { return unihanCharacterData?.Definition; } }

UnicodeInformation/UnicodeInfo.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,18 @@ private static UnihanCharacterData ReadUnihanCharacterDataEntry(BinaryReader rea
185185
reader.ReadInt64() :
186186
0;
187187

188+
UnicodeRadicalStrokeCount[] unicodeRadicalStrokeCounts = (fields & UnihanFields.UnicodeRadicalStrokeCountMore) != 0 ?
189+
new UnicodeRadicalStrokeCount
190+
[
191+
(fields & UnihanFields.UnicodeRadicalStrokeCountMore) == UnihanFields.UnicodeRadicalStrokeCountMore ?
192+
reader.ReadByte() + 3 :
193+
((byte)(fields & UnihanFields.UnicodeRadicalStrokeCountMore) >> 2)
194+
] :
195+
UnicodeRadicalStrokeCount.EmptyArray;
196+
197+
for (int i = 0; i < unicodeRadicalStrokeCounts.Length; ++i)
198+
unicodeRadicalStrokeCounts[i] = new UnicodeRadicalStrokeCount(reader.ReadByte(), reader.ReadByte());
199+
188200
string definition = (fields & UnihanFields.Definition) != 0 ? reader.ReadString() : null;
189201
string mandarinReading = (fields & UnihanFields.MandarinReading) != 0 ? reader.ReadString() : null;
190202
string cantoneseReading = (fields & UnihanFields.CantoneseReading) != 0 ? reader.ReadString() : null;
@@ -201,6 +213,7 @@ private static UnihanCharacterData ReadUnihanCharacterDataEntry(BinaryReader rea
201213
codePoint,
202214
numericType,
203215
numericValue,
216+
unicodeRadicalStrokeCounts,
204217
definition,
205218
mandarinReading,
206219
cantoneseReading,

UnicodeInformation/UnicodeInformation.csproj

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
<Compile Include="CoreProperties.cs" />
6868
<Compile Include="HangulInfo.cs" />
6969
<Compile Include="UnicodeCrossReferenceCollection.cs" />
70+
<Compile Include="UnicodeRadicalStrokeCountCollection.cs" />
7071
<Compile Include="UnicodeNameAliasCollection.cs" />
7172
<Compile Include="UcdFields.cs" />
7273
<Compile Include="EnumHelper.cs" />
@@ -81,6 +82,7 @@
8182
<Compile Include="UnicodeInfo.cs" />
8283
<Compile Include="UnicodeNameAlias.cs" />
8384
<Compile Include="UnicodeNameAliasKind.cs" />
85+
<Compile Include="UnicodeRadicalStrokeCount.cs" />
8486
<Compile Include="UnihanNumericType.cs" />
8587
<Compile Include="UnicodeNumericType.cs" />
8688
<Compile Include="UnihanCharacterData.cs" />
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Text;
5+
using System.Threading.Tasks;
6+
7+
namespace System.Unicode
8+
{
9+
/// <summary>
/// Represents one radical/stroke count pair, together with a flag telling whether the pair was marked as simplified.
/// </summary>
/// <remarks>
/// The pair is stored in two bytes: the radical, and a raw stroke count whose high bit (0x80) carries the
/// simplified flag while the low seven bits carry the actual number of strokes.
/// </remarks>
public struct UnicodeRadicalStrokeCount : IEquatable<UnicodeRadicalStrokeCount>
{
    // Bit of the raw stroke count byte used to store the simplified flag.
    private const byte SimplifiedFlag = 0x80;
    // Bits of the raw stroke count byte used to store the number of strokes.
    private const byte StrokeCountMask = 0x7F;

    internal static readonly UnicodeRadicalStrokeCount[] EmptyArray = new UnicodeRadicalStrokeCount[0];

    private readonly byte radical;
    private readonly byte strokeCount;

    /// <summary>Initializes a new instance from the raw, already packed representation.</summary>
    /// <param name="rawRadical">The radical.</param>
    /// <param name="rawStrokeCount">The stroke count, with the simplified flag stored in the high bit.</param>
    internal UnicodeRadicalStrokeCount(byte rawRadical, byte rawStrokeCount)
    {
        radical = rawRadical;
        strokeCount = rawStrokeCount;
    }

    /// <summary>Initializes a new instance from separate values.</summary>
    /// <param name="radical">The radical.</param>
    /// <param name="strokeCount">The number of strokes, which must fit in seven bits.</param>
    /// <param name="isSimplified">Whether the simplified flag must be set.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="strokeCount"/> is greater than 127 and would collide with the simplified flag.</exception>
    internal UnicodeRadicalStrokeCount(byte radical, byte strokeCount, bool isSimplified)
    {
        if (strokeCount > StrokeCountMask) throw new ArgumentOutOfRangeException(nameof(strokeCount));

        this.radical = radical;
        this.strokeCount = isSimplified ? (byte)(strokeCount | SimplifiedFlag) : strokeCount;
    }

    /// <summary>Gets the radical.</summary>
    public byte Radical { get { return radical; } }

    /// <summary>Gets the number of strokes, without the simplified flag.</summary>
    public byte StrokeCount { get { return (byte)(strokeCount & StrokeCountMask); } }

    /// <summary>Gets a value indicating whether the simplified flag is set.</summary>
    public bool IsSimplified { get { return (strokeCount & SimplifiedFlag) != 0; } }

    /// <summary>Determines whether this value and <paramref name="other"/> have the same radical, stroke count and simplified flag.</summary>
    public bool Equals(UnicodeRadicalStrokeCount other)
    {
        return radical == other.radical && strokeCount == other.strokeCount;
    }

    /// <summary>Determines whether <paramref name="obj"/> is a <see cref="UnicodeRadicalStrokeCount"/> equal to this value.</summary>
    public override bool Equals(object obj)
    {
        return obj is UnicodeRadicalStrokeCount && Equals((UnicodeRadicalStrokeCount)obj);
    }

    /// <summary>Returns a hash code built from the two stored bytes.</summary>
    public override int GetHashCode()
    {
        return (radical << 8) | strokeCount;
    }
}
34+
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
using System;
2+
using System.Collections;
3+
using System.Collections.Generic;
4+
using System.Linq;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
8+
namespace System.Unicode
9+
{
10+
/// <summary>
/// Read-only list of <see cref="UnicodeRadicalStrokeCount"/> values wrapping an array.
/// </summary>
/// <remarks>
/// Every mutating member throws <see cref="NotSupportedException"/>.
/// A default-initialized instance (<c>default(UnicodeRadicalStrokeCountCollection)</c> or <c>new UnicodeRadicalStrokeCountCollection()</c>)
/// behaves as an empty collection.
/// </remarks>
public struct UnicodeRadicalStrokeCountCollection : IList<UnicodeRadicalStrokeCount>
{
    /// <summary>Enumerates the items of a <see cref="UnicodeRadicalStrokeCountCollection"/>.</summary>
    public struct Enumerator : IEnumerator<UnicodeRadicalStrokeCount>
    {
        private readonly UnicodeRadicalStrokeCount[] items;
        private int index;

        internal Enumerator(UnicodeRadicalStrokeCount[] items)
        {
            this.items = items;
            // The enumerator starts before the first item, as required by IEnumerator.
            this.index = -1;
        }

        public void Dispose() { }

        public UnicodeRadicalStrokeCount Current { get { return items[index]; } }
        object IEnumerator.Current { get { return Current; } }

        // Once the end has been reached, the index stays at items.Length instead of growing further.
        public bool MoveNext() { return index < items.Length && ++index < items.Length; }

        void IEnumerator.Reset() { this.index = -1; }
    }

    // Null when the struct was default-initialized: always read it through the Items property.
    private readonly UnicodeRadicalStrokeCount[] items;

    /// <summary>Initializes a new collection wrapping the specified array.</summary>
    /// <param name="items">The array to wrap, or <c>null</c> for an empty collection.</param>
    internal UnicodeRadicalStrokeCountCollection(UnicodeRadicalStrokeCount[] items) { this.items = items ?? UnicodeRadicalStrokeCount.EmptyArray; }

    // No constructor runs for default(T), array elements or fields of this type, so the null case is handled here.
    private UnicodeRadicalStrokeCount[] Items { get { return items ?? UnicodeRadicalStrokeCount.EmptyArray; } }

    /// <summary>Gets the item at the specified index.</summary>
    public UnicodeRadicalStrokeCount this[int index] { get { return Items[index]; } }

    UnicodeRadicalStrokeCount IList<UnicodeRadicalStrokeCount>.this[int index]
    {
        get { return Items[index]; }
        set { throw new NotSupportedException(); }
    }

    /// <summary>Gets the number of items in the collection.</summary>
    public int Count { get { return Items.Length; } }

    bool ICollection<UnicodeRadicalStrokeCount>.IsReadOnly { get { return true; } }

    /// <summary>Not supported: the collection is read-only.</summary>
    public void Add(UnicodeRadicalStrokeCount item) { throw new NotSupportedException(); }
    /// <summary>Not supported: the collection is read-only.</summary>
    public void Insert(int index, UnicodeRadicalStrokeCount item) { throw new NotSupportedException(); }

    /// <summary>Not supported: the collection is read-only.</summary>
    public bool Remove(UnicodeRadicalStrokeCount item) { throw new NotSupportedException(); }
    /// <summary>Not supported: the collection is read-only.</summary>
    public void RemoveAt(int index) { throw new NotSupportedException(); }

    /// <summary>Not supported: the collection is read-only.</summary>
    public void Clear() { throw new NotSupportedException(); }

    /// <summary>Returns the index of the first occurrence of <paramref name="item"/>, or a negative value when it is absent.</summary>
    public int IndexOf(UnicodeRadicalStrokeCount item) { return Array.IndexOf(Items, item); }
    /// <summary>Determines whether the collection contains <paramref name="item"/>.</summary>
    public bool Contains(UnicodeRadicalStrokeCount item) { return IndexOf(item) >= 0; }

    /// <summary>Copies the items to <paramref name="array"/>, starting at <paramref name="arrayIndex"/>.</summary>
    public void CopyTo(UnicodeRadicalStrokeCount[] array, int arrayIndex) { Items.CopyTo(array, arrayIndex); }

    /// <summary>Returns a struct enumerator over the items.</summary>
    public Enumerator GetEnumerator() { return new Enumerator(Items); }

    IEnumerator<UnicodeRadicalStrokeCount> IEnumerable<UnicodeRadicalStrokeCount>.GetEnumerator() { return GetEnumerator(); }
    IEnumerator IEnumerable.GetEnumerator() { return GetEnumerator(); }
}
68+
}

UnicodeInformation/UnihanCharacterData.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ internal sealed class UnihanCharacterData
1111
public readonly int CodePoint;
1212
public readonly UnihanNumericType NumericType;
1313
public readonly long NumericValue;
14+
public readonly UnicodeRadicalStrokeCount[] UnicodeRadicalStrokeCounts;
1415
public readonly string Definition;
1516
public readonly string MandarinReading;
1617
public readonly string CantoneseReading;
@@ -27,6 +28,7 @@ internal UnihanCharacterData
2728
int codePoint,
2829
UnihanNumericType numericType,
2930
long numericValue,
31+
UnicodeRadicalStrokeCount[] unicodeRadicalStrokeCounts,
3032
string definition,
3133
string mandarinReading,
3234
string cantoneseReading,
@@ -42,6 +44,7 @@ string traditionalVariant
4244
CodePoint = codePoint;
4345
NumericType = numericType;
4446
NumericValue = numericValue;
47+
UnicodeRadicalStrokeCounts = unicodeRadicalStrokeCounts;
4548
Definition = definition;
4649
MandarinReading = mandarinReading;
4750
CantoneseReading = cantoneseReading;

0 commit comments

Comments
 (0)