diff --git a/autofill/autofill-impl/src/main/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizer.kt b/autofill/autofill-impl/src/main/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizer.kt
index c94e39d7c5fd..12f1afcc27ab 100644
--- a/autofill/autofill-impl/src/main/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizer.kt
+++ b/autofill/autofill-impl/src/main/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizer.kt
@@ -31,32 +31,54 @@ interface UrlUnicodeNormalizer {
 class UrlUnicodeNormalizerImpl @Inject constructor() : UrlUnicodeNormalizer {
 
     override fun normalizeAscii(url: String?): String? {
-        if (url == null) return null
-
-        val originalScheme = url.scheme() ?: ""
-        val noScheme = url.removePrefix(originalScheme)
-
-        val sb = StringBuilder()
-        val info = IDNA.Info()
-        IDNA.getUTS46Instance(IDNA.DEFAULT).nameToASCII(noScheme, sb, info)
-        if (info.hasErrors()) {
-            logcat { "Unable to convert to ASCII: $url" }
-            return url
+        return normalizeUrl(url) { hostname, sb, info ->
+            IDNA.getUTS46Instance(IDNA.DEFAULT).nameToASCII(hostname, sb, info)
         }
-        return "${originalScheme}$sb"
     }
 
     override fun normalizeUnicode(url: String?): String? {
+        return normalizeUrl(url) { hostname, sb, info ->
+            IDNA.getUTS46Instance(IDNA.DEFAULT).nameToUnicode(hostname, sb, info)
+        }
+    }
+
+    /**
+     * Runs [idnaProcessor] over the host of [url] only and reassembles the URL around the converted host.
+     *
+     * The scheme, userinfo, port, path, query and fragment are copied through unchanged.
+     * Returns null for a null [url], and [url] unchanged when the IDNA conversion reports errors.
+     */
+    private fun normalizeUrl(
+        url: String?,
+        idnaProcessor: (hostname: String, sb: StringBuilder, info: IDNA.Info) -> Unit,
+    ): String? {
         if (url == null) return null
 
+        val originalScheme = url.scheme() ?: ""
+        val noScheme = url.removePrefix(originalScheme)
+
+        // The authority ends at the first path, query or fragment delimiter; the rest is passed through untouched
+        val authorityEndIndex = noScheme.indexOfFirst { it == '/' || it == '?' || it == '#' }
+        val authority = if (authorityEndIndex == -1) noScheme else noScheme.substring(0, authorityEndIndex)
+        val pathAndQuery = if (authorityEndIndex == -1) "" else noScheme.substring(authorityEndIndex)
+
+        // Only the host goes through IDNA: a userinfo prefix or port suffix that shares a label with
+        // non-ASCII characters would otherwise be punycode-encoded (or fail to decode) along with it
+        val userInfo = authority.substring(0, authority.lastIndexOf('@') + 1)
+        val hostAndPort = authority.substring(userInfo.length)
+        val portStartIndex = hostAndPort.lastIndexOf(':')
+        val hasPort = portStartIndex != -1 && hostAndPort.substring(portStartIndex + 1).all { it in '0'..'9' }
+        val hostname = if (hasPort) hostAndPort.substring(0, portStartIndex) else hostAndPort
+        val port = if (hasPort) hostAndPort.substring(portStartIndex) else ""
+
         val sb = StringBuilder()
         val info = IDNA.Info()
-        IDNA.getUTS46Instance(IDNA.DEFAULT).nameToUnicode(url, sb, info)
+        idnaProcessor(hostname, sb, info)
         if (info.hasErrors()) {
-            logcat { "Unable to convert to unicode: $url" }
+            logcat { "Unable to convert hostname: $hostname" }
             return url
         }
-        return sb.toString()
+        return "${originalScheme}$userInfo$sb$port$pathAndQuery"
     }
 }
 
diff --git a/autofill/autofill-impl/src/test/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizerImplTest.kt b/autofill/autofill-impl/src/test/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizerImplTest.kt
index b133cc7c16e6..d7a5fe1829f5 100644
--- a/autofill/autofill-impl/src/test/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizerImplTest.kt
+++ b/autofill/autofill-impl/src/test/java/com/duckduckgo/autofill/impl/encoding/UrlUnicodeNormalizerImplTest.kt
@@ -45,4 +45,101 @@ class UrlUnicodeNormalizerImplTest {
     fun whenNormalizingToUnicodeAndOnlyContainsAsciiThenThenInputAndOutputIdentical() {
         assertEquals("c.com", testee.normalizeUnicode("c.com"))
     }
+
+    @Test
+    fun whenNormalizingToAsciiWithSchemesThenSchemePreserved() {
+        assertEquals("http://xn--7ca.com", testee.normalizeAscii("http://ç.com"))
+        assertEquals("https://xn--7ca.com", testee.normalizeAscii("https://ç.com"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithUrlComponentsThenAllPreserved() {
+        assertEquals("http://xn--7ca.com/path", testee.normalizeAscii("http://ç.com/path"))
+        assertEquals("http://xn--7ca.com?query=value", testee.normalizeAscii("http://ç.com?query=value"))
+        assertEquals("http://xn--7ca.com#fragment", testee.normalizeAscii("http://ç.com#fragment"))
+        assertEquals("https://xn--7ca.com/deep/nested/path/file.html", testee.normalizeAscii("https://ç.com/deep/nested/path/file.html"))
+        assertEquals("https://xn--7ca.com/search?q=test&lang=en&page=1", testee.normalizeAscii("https://ç.com/search?q=test&lang=en&page=1"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithComplexUrlThenAllComponentsPreserved() {
+        assertEquals("http://xn--7ca.com/path?query=value#fragment", testee.normalizeAscii("http://ç.com/path?query=value#fragment"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithUrlContainingInvalidDomainCharactersThenProcessesCorrectly() {
+        // This URL contains characters that are invalid in domain names (/,?,=,&) in the path/query
+        // Old implementation: tries to pass entire path to IDNA, fails, returns original
+        // New implementation: processes only hostname, succeeds, preserves path/query
+        val input = "https://google.com/signin?continue=https%3A%2F%2Fpasswords.com&id=123"
+        val expected = "https://google.com/signin?continue=https%3A%2F%2Fpasswords.com&id=123"
+        assertEquals(expected, testee.normalizeAscii(input))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithNoSchemeThenProcessedWithoutScheme() {
+        assertEquals("xn--7ca.com/path", testee.normalizeAscii("ç.com/path"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithPortNumberThenPortPreserved() {
+        assertEquals("https://xn--7ca.com:8080/path", testee.normalizeAscii("https://ç.com:8080/path"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithSubdomainThenSubdomainProcessed() {
+        assertEquals("https://xn--sb-xka.xn--dmain-jua.com", testee.normalizeAscii("https://süb.dömain.com"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithNullInputThenReturnsNull() {
+        assertNull(testee.normalizeAscii(null))
+    }
+
+    @Test
+    fun whenNormalizingToUnicodeWithNullInputThenReturnsNull() {
+        assertNull(testee.normalizeUnicode(null))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithEmptyStringThenReturnsEmptyString() {
+        assertEquals("", testee.normalizeAscii(""))
+    }
+
+    @Test
+    fun whenNormalizingToUnicodeWithEmptyStringThenReturnsEmptyString() {
+        assertEquals("", testee.normalizeUnicode(""))
+    }
+
+    @Test
+    fun whenNormalizingToUnicodeWithComplexUrlThenAllComponentsPreserved() {
+        assertEquals("https://ç.com/path?query=value#fragment", testee.normalizeUnicode("https://xn--7ca.com/path?query=value#fragment"))
+    }
+
+    @Test
+    fun whenNormalizingToUnicodeWithSchemesThenSchemePreserved() {
+        assertEquals("http://ç.com", testee.normalizeUnicode("http://xn--7ca.com"))
+        assertEquals("https://ç.com", testee.normalizeUnicode("https://xn--7ca.com"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithPortAndAsciiOnlyHostThenUrlUnchanged() {
+        // The port is split off before IDNA processing and re-attached afterwards
+        assertEquals("https://example.com:8080/path", testee.normalizeAscii("https://example.com:8080/path"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithPortOnNonAsciiLabelThenOnlyHostConverted() {
+        assertEquals("https://example.xn--7ca:8080/path", testee.normalizeAscii("https://example.ç:8080/path"))
+    }
+
+    @Test
+    fun whenNormalizingToUnicodeWithPortOnPunycodeLabelThenOnlyHostConverted() {
+        assertEquals("https://example.ç:8080/path", testee.normalizeUnicode("https://example.xn--7ca:8080/path"))
+    }
+
+    @Test
+    fun whenNormalizingToAsciiWithUserInfoThenUserInfoPreserved() {
+        assertEquals("https://user:pass@xn--7ca.com/path", testee.normalizeAscii("https://user:pass@ç.com/path"))
+    }
 }