Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,32 +31,39 @@ interface UrlUnicodeNormalizer {
// NOTE(review): this span comes from a rendered diff; removed and added lines are interleaved
// without +/- markers, so the text below is not a single compilable version of the class.
/**
 * [UrlUnicodeNormalizer] implementation that converts the host portion of a URL string
 * between its Unicode and ASCII forms using the UTS46 [IDNA] instance.
 */
class UrlUnicodeNormalizerImpl @Inject constructor() : UrlUnicodeNormalizer {

/**
 * Converts the host portion of [url] with `nameToASCII`; the work is delegated to `normalizeUrl`.
 *
 * @param url URL to convert, may be `null`.
 * @return whatever `normalizeUrl` returns for [url].
 */
override fun normalizeAscii(url: String?): String? {
// NOTE(review): the statements from here down to the "Unable to convert to ASCII" early return
// look like the previous inline implementation (removed side of the diff) — confirm against the file.
if (url == null) return null

val originalScheme = url.scheme() ?: ""
val noScheme = url.removePrefix(originalScheme)

val sb = StringBuilder()
val info = IDNA.Info()
IDNA.getUTS46Instance(IDNA.DEFAULT).nameToASCII(noScheme, sb, info)
if (info.hasErrors()) {
logcat { "Unable to convert to ASCII: $url" }
return url
// The delegation below is the only statement of this method that goes through normalizeUrl.
return normalizeUrl(url) { hostname, sb, info ->
IDNA.getUTS46Instance(IDNA.DEFAULT).nameToASCII(hostname, sb, info)
}
// NOTE(review): presumably a removed line as well; it follows an unconditional return.
return "${originalScheme}$sb"
}

/**
 * Converts the host portion of [url] with `nameToUnicode`; the work is delegated to `normalizeUrl`.
 *
 * @param url URL to convert, may be `null`.
 * @return whatever `normalizeUrl` returns for [url].
 */
override fun normalizeUnicode(url: String?): String? {
return normalizeUrl(url) { hostname, sb, info ->
IDNA.getUTS46Instance(IDNA.DEFAULT).nameToUnicode(hostname, sb, info)
}
}

/**
 * Shared flow for both conversions: splits [url] into scheme, host segment and remainder,
 * runs [idnaProcessor] on the host segment only, and reassembles the pieces.
 *
 * @param url URL to convert; `null` yields `null`.
 * @param idnaProcessor conversion to run; it receives the host segment, the [StringBuilder]
 * that collects the converted text, and the [IDNA.Info] that collects errors.
 * @return the reassembled URL, or [url] unchanged when the [IDNA.Info] reports errors.
 */
private fun normalizeUrl(
url: String?,
idnaProcessor: (hostname: String, sb: StringBuilder, info: IDNA.Info) -> Unit,
): String? {
if (url == null) return null

// scheme() is declared outside this view; when it returns null the scheme is treated as "".
val originalScheme = url.scheme() ?: ""
val noScheme = url.removePrefix(originalScheme)

// Extract just the hostname/domain part for IDNA processing
// The host segment ends at the first '/', '?' or '#'. A ":port" suffix is not split off,
// so it is handed to idnaProcessor as part of hostname.
val hostEndIndex = noScheme.indexOfFirst { it == '/' || it == '?' || it == '#' }
val hostname = if (hostEndIndex == -1) noScheme else noScheme.substring(0, hostEndIndex)
val pathAndQuery = if (hostEndIndex == -1) "" else noScheme.substring(hostEndIndex)

val sb = StringBuilder()
val info = IDNA.Info()
// NOTE(review): the direct nameToUnicode call on the whole url looks like a removed line;
// the idnaProcessor call after it is the added replacement — confirm.
IDNA.getUTS46Instance(IDNA.DEFAULT).nameToUnicode(url, sb, info)
idnaProcessor(hostname, sb, info)
if (info.hasErrors()) {
// NOTE(review): two log statements appear here; the first presumably belongs to the removed side.
logcat { "Unable to convert to unicode: $url" }
logcat { "Unable to convert hostname: $hostname" }
// On conversion errors the input is handed back untouched.
return url
}
// NOTE(review): the bare sb.toString() return looks like the removed line; the return after it
// rebuilds scheme + converted host + remainder.
return sb.toString()
return "${originalScheme}$sb$pathAndQuery"
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,86 @@ class UrlUnicodeNormalizerImplTest {
fun whenNormalizingToUnicodeAndOnlyContainsAsciiThenThenInputAndOutputIdentical() {
assertEquals("c.com", testee.normalizeUnicode("c.com"))
}

@Test
fun whenNormalizingToAsciiWithSchemesThenSchemePreserved() {
    // Each entry maps an input URL to the ASCII form expected back, scheme included.
    val cases = mapOf(
        "http://ç.com" to "http://xn--7ca.com",
        "https://ç.com" to "https://xn--7ca.com",
    )
    cases.forEach { (input, expected) ->
        assertEquals(expected, testee.normalizeAscii(input))
    }
}

@Test
fun whenNormalizingToAsciiWithUrlComponentsThenAllPreserved() {
    // Input URL -> expected output; path, query and fragment must come back unchanged.
    val cases = mapOf(
        "http://ç.com/path" to "http://xn--7ca.com/path",
        "http://ç.com?query=value" to "http://xn--7ca.com?query=value",
        "http://ç.com#fragment" to "http://xn--7ca.com#fragment",
        "https://ç.com/deep/nested/path/file.html" to "https://xn--7ca.com/deep/nested/path/file.html",
        "https://ç.com/search?q=test&lang=en&page=1" to "https://xn--7ca.com/search?q=test&lang=en&page=1",
    )
    cases.forEach { (input, expected) ->
        assertEquals(expected, testee.normalizeAscii(input))
    }
}

@Test
fun whenNormalizingToAsciiWithComplexUrlThenAllComponentsPreserved() {
    // A URL carrying path, query and fragment at once.
    val input = "http://ç.com/path?query=value#fragment"
    val actual = testee.normalizeAscii(input)
    assertEquals("http://xn--7ca.com/path?query=value#fragment", actual)
}

@Test
fun whenNormalizingToAsciiWithUrlContainingInvalidDomainCharactersThenProcessesCorrectly() {
    // The path/query holds characters that are not valid in a domain name (/, ?, =, &).
    // Only the hostname is expected to go through IDNA, so an all-ASCII URL like this one
    // must come back exactly as it went in.
    val url = "https://google.com/signin?continue=https%3A%2F%2Fpasswords.com&id=123"
    assertEquals(url, testee.normalizeAscii(url))
}

@Test
fun whenNormalizingToAsciiWithNoSchemeThenProcessedWithoutScheme() {
    // No scheme on the input, so none is expected on the output.
    val actual = testee.normalizeAscii("ç.com/path")
    assertEquals("xn--7ca.com/path", actual)
}

@Test
fun whenNormalizingToAsciiWithPortNumberThenPortPreserved() {
    // The ":8080" port must still be present after the host is converted.
    val input = "https://ç.com:8080/path"
    val expected = "https://xn--7ca.com:8080/path"
    assertEquals(expected, testee.normalizeAscii(input))
}

@Test
fun whenNormalizingToAsciiWithSubdomainThenSubdomainProcessed() {
    // Both non-ASCII labels (subdomain and domain) are expected to be converted.
    val input = "https://süb.dömain.com"
    val expected = "https://xn--sb-xka.xn--dmain-jua.com"
    assertEquals(expected, testee.normalizeAscii(input))
}

@Test
fun whenNormalizingToAsciiWithNullInputThenReturnsNull() {
    // Null in, null out.
    val result = testee.normalizeAscii(null)
    assertNull(result)
}

@Test
fun whenNormalizingToUnicodeWithNullInputThenReturnsNull() {
    // Null in, null out.
    val result = testee.normalizeUnicode(null)
    assertNull(result)
}

@Test
fun whenNormalizingToAsciiWithEmptyStringThenReturnsEmptyString() {
    // An empty input is expected to stay empty.
    val result = testee.normalizeAscii("")
    assertEquals("", result)
}

@Test
fun whenNormalizingToUnicodeWithEmptyStringThenReturnsEmptyString() {
    // An empty input is expected to stay empty.
    val result = testee.normalizeUnicode("")
    assertEquals("", result)
}

@Test
fun whenNormalizingToUnicodeWithComplexUrlThenAllComponentsPreserved() {
    // ASCII-encoded host plus path, query and fragment; only the host should change.
    val input = "https://xn--7ca.com/path?query=value#fragment"
    val actual = testee.normalizeUnicode(input)
    assertEquals("https://ç.com/path?query=value#fragment", actual)
}

@Test
fun whenNormalizingToUnicodeWithSchemesThenSchemePreserved() {
    // Each entry maps an ASCII-encoded input URL to the Unicode form expected back.
    val cases = mapOf(
        "http://xn--7ca.com" to "http://ç.com",
        "https://xn--7ca.com" to "https://ç.com",
    )
    cases.forEach { (input, expected) ->
        assertEquals(expected, testee.normalizeUnicode(input))
    }
}

@Test
fun whenNormalizingToAsciiWithPortThenPortIncludedInHostname() {
    // The port travels with the hostname in the current implementation; this pins that
    // an all-ASCII host with a port comes back unchanged.
    val url = "https://example.com:8080/path"
    assertEquals("https://example.com:8080/path", testee.normalizeAscii(url))
}
}
Loading