feat: Improve acronym detection

maximkrouk · maximkrouk · commit b86d9cffced1 · 2026-02-21T22:41:11.000+01:00
diff --git a/Sources/Casification/Casification.swift b/Sources/Casification/Casification.swift
@@ -27,6 +27,7 @@ extension String {
 			"xml", "Xml", "XML",
 			"yaml", "Yaml", "YAML", // todo: add more extensions
 			"sf", "SF",
+			"ns", "NS",
 			"ui", "UI",
 			"ux", "UX",
 			"sk", "SK" // todo: add more system prefixes
@@ -41,7 +42,7 @@ extension String.Casification {
 		func tokenize(_ input: Substring) -> [Token]
 	}
 
-	public struct Token: Hashable {
+	public struct Token: Hashable, CustomStringConvertible {
 		public var value: Substring
 		public var kind: Kind
 
@@ -50,11 +51,28 @@ extension String.Casification {
 			self.kind = kind
 		}
 
-		public enum Kind: Hashable {
+		public enum Kind: Hashable, CustomStringConvertible {
 			case word
 			case number
 			case acronym
 			case separator
+
+			/// The case name spelled out as text: `"word"`, `"number"`,
+			/// `"acronym"` or `"separator"`.
+			public var description: String {
+				switch self {
+				case .word: return "word"
+				case .number: return "number"
+				case .acronym: return "acronym"
+				case .separator: return "separator"
+				}
+			}
+		}
+
+		/// The token rendered as `.<kind>("<value>")`, for example `.word("foo")`.
+		public var description: String {
+			let kindName = kind.description
+			return ".\(kindName)(\"\(value)\")"
 		}
 	}
 }
diff --git a/Sources/Casification/Tokenization/DefaultTokenizer.swift b/Sources/Casification/Tokenization/DefaultTokenizer.swift
@@ -59,7 +59,7 @@ extension String.Casification.Tokenizers {
 				tokens.append(.init(value, kind: kind))
 			}
 
-			func findAcronym(at index: String.Index) -> Substring? {
+			func getAcronym(at index: String.Index) -> Substring? {
 				config.acronyms.first { acronym in
 					guard let end = input.index(index, offsetBy: acronym.count, limitedBy: input.endIndex)
 					else { return false }
@@ -87,7 +87,7 @@ extension String.Casification.Tokenizers {
 					}
 
 					// Recursively allow split if another acronym follows
-					return findAcronym(at: end) != nil
+					return getAcronym(at: end) != nil
 				}.map { acronym in
 					let end = input.index(index, offsetBy: acronym.count)
 					return input[index..<end]
@@ -96,14 +96,32 @@ extension String.Casification.Tokenizers {
 
 			while currentIndex < input.endIndex {
 				do { // match acronyms
-					if let acronym = findAcronym(at: currentIndex) {
-						commitToken(upTo: currentIndex)
+					if let acronym = getAcronym(at: currentIndex) {
+						var isSuffixOfOtherToken: Bool { // true => the acronym matched at currentIndex is NOT committed as its own token (see the `if !isSuffixOfOtherToken` check below)
+							if currentStart == currentIndex { return false } // no characters lie between currentStart and currentIndex, so there is nothing to be a suffix of
 
-						let end = input.index(currentIndex, offsetBy: acronym.count)
-						commitToken(input[currentIndex..<end], kind: .acronym)
-						currentIndex = end
-						currentStart = end
-						continue
+							let prevIdx = input.index( // index of the character just before the match, if any
+								before: currentIndex,
+								limitedBy: input.startIndex
+							)
+
+							guard let prevIdx else { return false } // nil when currentIndex is not past input.startIndex
+
+							let prevChar = input[prevIdx]
+							let currChar = input[currentIndex]
+
+							return prevChar.isUppercase || currChar.isLowercase // suffix when the preceding character is uppercase or the match itself starts with a lowercase character
+						}
+
+						if !isSuffixOfOtherToken {
+							commitToken(upTo: currentIndex)
+
+							let end = input.index(currentIndex, offsetBy: acronym.count)
+							commitToken(input[currentIndex..<end], kind: .acronym)
+							currentIndex = end
+							currentStart = end
+							continue
+						}
 					}
 				}
 
@@ -157,3 +175,28 @@ extension Character {
 	@usableFromInline
 	var isAlphanumeric: Bool { (isLetter || isNumber) }
 }
+
+extension StringProtocol {
+	/// The index of the last element, or `nil` when the collection is empty.
+	@usableFromInline
+	var lastIndex: Index? {
+		// `isEmpty` is O(1), whereas `count` on a string is O(n).
+		isEmpty ? nil : index(before: endIndex)
+	}
+
+	/// Returns the index immediately before `other`, or `nil` when `other`
+	/// is not strictly greater than `limit`. The result may equal `limit`.
+	@usableFromInline
+	func index(before other: Index, limitedBy limit: Index) -> Index? {
+		guard other > limit else { return nil }
+		return index(before: other)
+	}
+
+	/// Returns the index immediately after `other`, or `nil` when `other`
+	/// is not strictly less than `limit`. The result may equal `limit`.
+	@usableFromInline
+	func index(after other: Index, limitedBy limit: Index) -> Index? {
+		guard other < limit else { return nil }
+		return index(after: other)
+	}
+}
diff --git a/Tests/CasificationTests/TokenizationTests.swift b/Tests/CasificationTests/TokenizationTests.swift
@@ -56,6 +56,101 @@ struct TokenizationTests {
 		])
 	}
 
+	@Test
+	func tricky() async throws { // mixed-casing inputs; each expectation pins whether the input splits into tokens or stays a single word
+		#expect("Midlane"._tokenize() == [ // "Midlane" in four casings: each one is expected to stay a single word token
+			"Midlane".asToken(.word),
+		])
+
+		#expect("MIDlane"._tokenize() == [
+			"MIDlane".asToken(.word),
+		])
+
+		#expect("MIDLane"._tokenize() == [
+			"MIDLane".asToken(.word),
+		])
+
+		#expect("MIDLANE"._tokenize() == [
+			"MIDLANE".asToken(.word),
+		])
+
+		#expect("identity"._tokenize() == [ // "identity" / "IDENTITY": single word token
+			"identity".asToken(.word),
+		])
+
+		#expect("IDENTITY"._tokenize() == [
+			"IDENTITY".asToken(.word),
+		])
+
+		#expect("UUIDstring"._tokenize() == [ // uppercase "UUID" followed by "string"/"String": acronym, empty separator, word
+			"UUID".asToken(.acronym),
+			"".asToken(.separator),
+			"string".asToken(.word),
+		])
+
+		#expect("UUIDString"._tokenize() == [
+			"UUID".asToken(.acronym),
+			"".asToken(.separator),
+			"String".asToken(.word),
+		])
+
+		#expect("UUIDSTRING"._tokenize() == [ // uniform casing (all upper here, all lower next): expected to stay one word
+			"UUIDSTRING".asToken(.word),
+		])
+
+		#expect("uuidstring"._tokenize() == [
+			"uuidstring".asToken(.word)
+		])
+
+		#expect("uuidString"._tokenize() == [ // lowercase "uuid" followed by an uppercase letter: acronym, empty separator, word
+			"uuid".asToken(.acronym),
+			"".asToken(.separator),
+			"String".asToken(.word),
+		])
+
+		#expect("uuidSTRING"._tokenize() == [
+			"uuid".asToken(.acronym),
+			"".asToken(.separator),
+			"STRING".asToken(.word),
+		])
+
+		#expect("UUIDjson"._tokenize() == [ // "uuid" followed by "json": two acronym tokens in every casing combination below
+			"UUID".asToken(.acronym),
+			"".asToken(.separator),
+			"json".asToken(.acronym),
+		])
+
+		#expect("UUIDJson"._tokenize() == [
+			"UUID".asToken(.acronym),
+			"".asToken(.separator),
+			"Json".asToken(.acronym),
+		])
+
+		#expect("UUIDJSON"._tokenize() == [
+			"UUID".asToken(.acronym),
+			"".asToken(.separator),
+			"JSON".asToken(.acronym),
+		])
+
+		#expect("uuidjson"._tokenize() == [
+			"uuid".asToken(.acronym),
+			"".asToken(.separator),
+			"json".asToken(.acronym),
+		])
+
+		#expect("uuidJson"._tokenize() == [
+			"uuid".asToken(.acronym),
+			"".asToken(.separator),
+			"Json".asToken(.acronym),
+		])
+
+		#expect("uuidJSON"._tokenize() == [
+			"uuid".asToken(.acronym),
+			"".asToken(.separator),
+			"JSON".asToken(.acronym),
+		])
+	}
+
 	@Test
 	func withAcronyms() async throws {
 		#expect("UUIDJSON"._tokenize() == [
@@ -119,6 +214,40 @@ struct TokenizationTests {
 				"Uuid".asToken(.acronym),
 			]
 		)
+
+		#expect(
+			"AaaaAaaa"._tokenize() == [
+				"Aaaa".asToken(.word),
+				"".asToken(.separator),
+				"Aaaa".asToken(.word),
+			]
+		)
+
+		#expect("identity"._tokenize() == [
+			"identity".asToken(.word),
+		])
+
+		#expect(
+			"grid1x1"._tokenize() == [
+				"grid".asToken(.word),
+				"".asToken(.separator),
+				"1".asToken(.number),
+				"".asToken(.separator),
+				"x".asToken(.word),
+				"".asToken(.separator),
+				"1".asToken(.number),
+			]
+		)
+
+		#expect(
+			"lens1x"._tokenize() == [
+				"lens".asToken(.word),
+				"".asToken(.separator),
+				"1".asToken(.number),
+				"".asToken(.separator),
+				"x".asToken(.word),
+			]
+		)
 	}
 }