Skip to content

Commit b86d9cf

Browse files
committed
feat: Improve acronyms detection
1 parent 5cc4432 commit b86d9cf

File tree

3 files changed

+191
-11
lines changed

3 files changed

+191
-11
lines changed

Sources/Casification/Casification.swift

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ extension String {
2727
"xml", "Xml", "XML",
2828
"yaml", "Yaml", "YAML", // todo: add more extensions
2929
"sf", "SF",
30+
"ns", "NS",
3031
"ui", "UI",
3132
"ux", "UX",
3233
"sk", "SK" // todo: add more system prefixes
@@ -41,7 +42,7 @@ extension String.Casification {
4142
func tokenize(_ input: Substring) -> [Token]
4243
}
4344

44-
public struct Token: Hashable {
45+
public struct Token: Hashable, CustomStringConvertible {
4546
public var value: Substring
4647
public var kind: Kind
4748

@@ -50,11 +51,24 @@ extension String.Casification {
5051
self.kind = kind
5152
}
5253

53-
public enum Kind: Hashable {
54+
public enum Kind: Hashable, CustomStringConvertible {
5455
case word
5556
case number
5657
case acronym
5758
case separator
59+
60+
public var description: String {
61+
switch self {
62+
case .word: "word"
63+
case .number: "number"
64+
case .acronym: "acronym"
65+
case .separator: "separator"
66+
}
67+
}
68+
}
69+
70+
public var description: String {
71+
return ".\(kind)(\"\(value)\")"
5872
}
5973
}
6074
}

Sources/Casification/Tokenization/DefaultTokenizer.swift

Lines changed: 46 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ extension String.Casification.Tokenizers {
5959
tokens.append(.init(value, kind: kind))
6060
}
6161

62-
func findAcronym(at index: String.Index) -> Substring? {
62+
func getAcronym(at index: String.Index) -> Substring? {
6363
config.acronyms.first { acronym in
6464
guard let end = input.index(index, offsetBy: acronym.count, limitedBy: input.endIndex)
6565
else { return false }
@@ -87,7 +87,7 @@ extension String.Casification.Tokenizers {
8787
}
8888

8989
// Recursively allow split if another acronym follows
90-
return findAcronym(at: end) != nil
90+
return getAcronym(at: end) != nil
9191
}.map { acronym in
9292
let end = input.index(index, offsetBy: acronym.count)
9393
return input[index..<end]
@@ -96,14 +96,32 @@ extension String.Casification.Tokenizers {
9696

9797
while currentIndex < input.endIndex {
9898
do { // match acronyms
99-
if let acronym = findAcronym(at: currentIndex) {
100-
commitToken(upTo: currentIndex)
99+
if let acronym = getAcronym(at: currentIndex) {
100+
var isSuffixOfOtherToken: Bool {
101+
if currentStart == currentIndex { return false }
101102

102-
let end = input.index(currentIndex, offsetBy: acronym.count)
103-
commitToken(input[currentIndex..<end], kind: .acronym)
104-
currentIndex = end
105-
currentStart = end
106-
continue
103+
let prevIdx = input.index(
104+
before: currentIndex,
105+
limitedBy: input.startIndex
106+
)
107+
108+
guard let prevIdx else { return false }
109+
110+
let prevChar = input[prevIdx]
111+
let currChar = input[currentIndex]
112+
113+
return prevChar.isUppercase || currChar.isLowercase
114+
}
115+
116+
if !isSuffixOfOtherToken {
117+
commitToken(upTo: currentIndex)
118+
119+
let end = input.index(currentIndex, offsetBy: acronym.count)
120+
commitToken(input[currentIndex..<end], kind: .acronym)
121+
currentIndex = end
122+
currentStart = end
123+
continue
124+
}
107125
}
108126
}
109127

@@ -157,3 +175,22 @@ extension Character {
157175
@usableFromInline
158176
var isAlphanumeric: Bool { (isLetter || isNumber) }
159177
}
178+
179+
extension StringProtocol {
  /// The index of the last element, or `nil` when the collection is empty.
  @usableFromInline
  var lastIndex: Index? {
    // `isEmpty` only compares `startIndex` with `endIndex`; `count` has to walk
    // the whole string to produce a number that is then only compared with zero.
    isEmpty ? nil : index(before: endIndex)
  }

  /// Returns the index immediately before `other`, or `nil` when stepping back
  /// is not allowed because `other` is not strictly greater than `limit`.
  ///
  /// - Parameters:
  ///   - other: The index to step back from.
  ///   - limit: The lower bound; `other` must lie strictly after it.
  /// - Returns: `index(before: other)`, or `nil` if `other <= limit`.
  @usableFromInline
  func index(before other: Index, limitedBy limit: Index) -> Index? {
    guard other > limit else { return nil }
    return index(before: other)
  }

  /// Returns the index immediately after `other`, or `nil` when stepping forward
  /// is not allowed because `other` is not strictly less than `limit`.
  ///
  /// - Parameters:
  ///   - other: The index to step forward from.
  ///   - limit: The upper bound; `other` must lie strictly before it.
  /// - Returns: `index(after: other)`, or `nil` if `other >= limit`.
  @usableFromInline
  func index(after other: Index, limitedBy limit: Index) -> Index? {
    guard other < limit else { return nil }
    return index(after: other)
  }
}

Tests/CasificationTests/TokenizationTests.swift

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,101 @@ struct TokenizationTests {
5656
])
5757
}
5858

59+
/// Checks tokenization of inputs where acronym-like letter runs sit inside or next to other words.
///
/// Each expectation compares the result of `_tokenize()` with an explicit token list built via
/// `asToken(_:)`; both helpers are declared outside this hunk.
@Test
60+
func tricky() async throws {
61+
// Group 1: each of these inputs is expected to come back as a single `.word` token.
// NOTE(review): presumably this guards against matching a short acronym (e.g. "id") inside
// a longer word — the full acronym list is not visible here; confirm.
#expect("Midlane"._tokenize() == [
62+
"Midlane".asToken(.word),
63+
])
64+
65+
#expect("MIDlane"._tokenize() == [
66+
"MIDlane".asToken(.word),
67+
])
68+
69+
#expect("MIDLane"._tokenize() == [
70+
"MIDLane".asToken(.word),
71+
])
72+
73+
#expect("MIDLANE"._tokenize() == [
74+
"MIDLANE".asToken(.word),
75+
])
76+
77+
#expect("identity"._tokenize() == [
78+
"identity".asToken(.word),
79+
])
80+
81+
#expect("IDENTITY"._tokenize() == [
82+
"IDENTITY".asToken(.word),
83+
])
84+
85+
// Group 2: "uuid" followed by "string". When the letter case changes right after the
// acronym, the expected result is acronym + empty separator + word; uniformly cased
// input ("UUIDSTRING", "uuidstring") is expected to stay one `.word` token.
#expect("UUIDstring"._tokenize() == [
86+
"UUID".asToken(.acronym),
87+
"".asToken(.separator),
88+
"string".asToken(.word),
89+
])
90+
91+
#expect("UUIDString"._tokenize() == [
92+
"UUID".asToken(.acronym),
93+
"".asToken(.separator),
94+
"String".asToken(.word),
95+
])
96+
97+
#expect("UUIDSTRING"._tokenize() == [
98+
"UUIDSTRING".asToken(.word),
99+
])
100+
101+
#expect("uuidstring"._tokenize() == [
102+
"uuidstring".asToken(.word)
103+
])
104+
105+
#expect("uuidString"._tokenize() == [
106+
"uuid".asToken(.acronym),
107+
"".asToken(.separator),
108+
"String".asToken(.word),
109+
])
110+
111+
#expect("uuidSTRING"._tokenize() == [
112+
"uuid".asToken(.acronym),
113+
"".asToken(.separator),
114+
"STRING".asToken(.word),
115+
])
116+
117+
// Group 3: "uuid" followed by "json". All six casing combinations, including the
// uniformly cased ones, are expected to yield two `.acronym` tokens with an empty
// separator between them.
#expect("UUIDjson"._tokenize() == [
118+
"UUID".asToken(.acronym),
119+
"".asToken(.separator),
120+
"json".asToken(.acronym),
121+
])
122+
123+
#expect("UUIDJson"._tokenize() == [
124+
"UUID".asToken(.acronym),
125+
"".asToken(.separator),
126+
"Json".asToken(.acronym),
127+
])
128+
129+
#expect("UUIDJSON"._tokenize() == [
130+
"UUID".asToken(.acronym),
131+
"".asToken(.separator),
132+
"JSON".asToken(.acronym),
133+
])
134+
135+
#expect("uuidjson"._tokenize() == [
136+
"uuid".asToken(.acronym),
137+
"".asToken(.separator),
138+
"json".asToken(.acronym),
139+
])
140+
141+
#expect("uuidJson"._tokenize() == [
142+
"uuid".asToken(.acronym),
143+
"".asToken(.separator),
144+
"Json".asToken(.acronym),
145+
])
146+
147+
#expect("uuidJSON"._tokenize() == [
148+
"uuid".asToken(.acronym),
149+
"".asToken(.separator),
150+
"JSON".asToken(.acronym),
151+
])
152+
}
153+
59154
@Test
60155
func withAcronyms() async throws {
61156
#expect("UUIDJSON"._tokenize() == [
@@ -119,6 +214,40 @@ struct TokenizationTests {
119214
"Uuid".asToken(.acronym),
120215
]
121216
)
217+
218+
#expect(
219+
"AaaaAaaa"._tokenize() == [
220+
"Aaaa".asToken(.word),
221+
"".asToken(.separator),
222+
"Aaaa".asToken(.word),
223+
]
224+
)
225+
226+
#expect("identity"._tokenize() == [
227+
"identity".asToken(.word),
228+
])
229+
230+
#expect(
231+
"grid1x1"._tokenize() == [
232+
"grid".asToken(.word),
233+
"".asToken(.separator),
234+
"1".asToken(.number),
235+
"".asToken(.separator),
236+
"x".asToken(.word),
237+
"".asToken(.separator),
238+
"1".asToken(.number),
239+
]
240+
)
241+
242+
#expect(
243+
"lens1x"._tokenize() == [
244+
"lens".asToken(.word),
245+
"".asToken(.separator),
246+
"1".asToken(.number),
247+
"".asToken(.separator),
248+
"x".asToken(.word),
249+
]
250+
)
122251
}
123252
}
124253

0 commit comments

Comments
 (0)