Merge pull request #8 from ellismg/normalize-unicode-literals

ellismg · ellismg · commit da3774695ef6 · 2015-01-07T15:58:33.000-08:00
Add a rule to enforce ASCII only literals
diff --git a/src/Microsoft.DotNet.CodeFormatting.Tests/Microsoft.DotNet.CodeFormatting.Tests.csproj b/src/Microsoft.DotNet.CodeFormatting.Tests/Microsoft.DotNet.CodeFormatting.Tests.csproj
@@ -112,6 +112,7 @@
     <Compile Include="Rules\HasNoNewLineAfterOpenBraceFormattingRuleTests.cs" />
     <Compile Include="Rules\HasNoNewLineBeforeEndBraceFormattingRuleTests.cs" />
     <Compile Include="Rules\HasUnderScoreInPrivateFieldNamesFormattingRuleTests.cs" />
+    <Compile Include="Rules\NonAsciiCharactersAreEscapedInLiteralsRuleTests.cs" />
     <Compile Include="Rules\UsesXunitForTestsFormattingRuleTests.cs" />
   </ItemGroup>
   <ItemGroup>
diff --git a/src/Microsoft.DotNet.CodeFormatting.Tests/Rules/NonAsciiCharactersAreEscapedInLiteralsRuleTests.cs b/src/Microsoft.DotNet.CodeFormatting.Tests/Rules/NonAsciiCharactersAreEscapedInLiteralsRuleTests.cs
@@ -0,0 +1,56 @@
+﻿// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for full license information.
+
+using Xunit;
+
+namespace Microsoft.DotNet.CodeFormatting.Tests
+{
+    /// <summary>Tests for the rule that escapes non-ASCII characters in string and character literals.</summary>
+    public class NonAsciiChractersAreEscapedInLiteralsTests : CodeFormattingTestBase
+    {
+        internal override IFormattingRule GetFormattingRule()
+        {
+            return new Rules.NonAsciiChractersAreEscapedInLiterals();
+        }
+
+        [Fact]
+        public void CanUseNonAsciiCharactersInComments()
+        {
+            // Comments are not literals, so the formatted output is expected to equal the input.
+            var source = string.Format(@"
+// It's oaky to use non ASCII characters like {0} (CHECK MARK U+2713) or {1} (RAINBOW U+1F308) in comments.
+/*
+It's oaky to use non ASCII characters like {0} (CHECK MARK U+2713) or {1} (RAINBOW U+1F308) in comments.
+*/
+", '\u2713', "\U0001F308");
+            Verify(source, source);
+        }
+
+        [Fact]
+        public void DoNotAllowUnicodeInLiterals()
+        {
+            var escaped = @"
+using System;
+
+class Test
+{
+    public static readonly string BadString = ""This has \u2713 and \U0001F308, which are both bad."";
+    public static readonly string AnotherBadString = @""This has \u2713 and \U0001F308, which are both bad."";
+    public const char BadChar = '\u2713';
+}
+";
+
+            var source = string.Format(@"
+using System;
+
+class Test
+{{
+    public static readonly string BadString = ""This has {0} and {1}, which are both bad."";
+    public static readonly string AnotherBadString = @""This has {0} and {1}, which are both bad."";
+    public const char BadChar = '{0}';
+}}
+", '\u2713', "\U0001F308");
+            Verify(source, escaped);
+        }
+    }
+}
diff --git a/src/Microsoft.DotNet.CodeFormatting/Microsoft.DotNet.CodeFormatting.csproj b/src/Microsoft.DotNet.CodeFormatting/Microsoft.DotNet.CodeFormatting.csproj
@@ -99,6 +99,7 @@
     <Compile Include="Rules\HasUsingsOutsideOfNamespaceFormattingRule.cs" />
     <Compile Include="Rules\IsFormattedFormattingRule.cs" />
     <Compile Include="Rules\IsSimplifiedFormattingRule.cs" />
+    <Compile Include="Rules\NonAsciiCharactersAreEscapedInLiteralsRule.cs" />
     <Compile Include="Rules\RuleExtensions.cs" />
     <Compile Include="Rules\UsesXunitForTestsFormattingRule.cs" />
     <Compile Include="RuleTypeConstants.cs" />
diff --git a/src/Microsoft.DotNet.CodeFormatting/Rules/NonAsciiCharactersAreEscapedInLiteralsRule.cs b/src/Microsoft.DotNet.CodeFormatting/Rules/NonAsciiCharactersAreEscapedInLiteralsRule.cs
@@ -0,0 +1,126 @@
+﻿// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under MIT. See LICENSE in the project root for license information.
+using System;
+using System.ComponentModel.Composition;
+using System.Diagnostics;
+using System.Text;
+using System.Threading;
+using System.Threading.Tasks;
+
+using Microsoft.CodeAnalysis;
+using Microsoft.CodeAnalysis.CSharp;
+using Microsoft.CodeAnalysis.CSharp.Syntax;
+
+namespace Microsoft.DotNet.CodeFormatting.Rules
+{
+    /// <summary>Formatting rule that rewrites non-ASCII characters in string and character literals as Unicode escapes.</summary>
+    internal sealed class NonAsciiChractersAreEscapedInLiterals : IFormattingRule
+    {
+        /// <summary>Returns the document with its literals escaped, or unchanged when its syntax root is not a CompilationUnitSyntax.</summary>
+        public async Task<Document> ProcessAsync(Document document, CancellationToken cancellationToken)
+        {
+            var root = await document.GetSyntaxRootAsync(cancellationToken) as CompilationUnitSyntax;
+
+            if (root == null)
+            {
+                return document;
+            }
+
+            var newRoot = UnicodeCharacterEscapingSyntaxRewriter.Rewriter.Visit(root);
+
+            return document.WithSyntaxRoot(newRoot);
+        }
+
+        /// <summary>
+        ///  Rewrites string and character literals which contain non ascii characters to instead use the \uXXXX or \UXXXXXXXX syntax.
+        /// </summary>
+        /// <remarks>
+        ///  NOTE(review): verbatim literals (@"...") get the same treatment, but C# does not interpret \u escapes inside a
+        ///  verbatim string, so the rewritten literal denotes a different runtime value. Confirm this is intended.
+        /// </remarks>
+        internal class UnicodeCharacterEscapingSyntaxRewriter : CSharpSyntaxRewriter
+        {
+            public static readonly UnicodeCharacterEscapingSyntaxRewriter Rewriter = new UnicodeCharacterEscapingSyntaxRewriter();
+
+            private UnicodeCharacterEscapingSyntaxRewriter()
+            {
+            }
+
+            /// <summary>Escapes string and character literals; every other literal kind is handled by the base rewriter.</summary>
+            public override SyntaxNode VisitLiteralExpression(LiteralExpressionSyntax node)
+            {
+                switch (node.CSharpKind())
+                {
+                    case SyntaxKind.StringLiteralExpression:
+                    case SyntaxKind.CharacterLiteralExpression:
+                        return EscapeLiteral(node);
+                }
+
+                // Do not call base.Visit(node) here: Visit dispatches straight back into this override, so numeric,
+                // boolean and null literals would recurse until the stack overflows.
+                return base.VisitLiteralExpression(node);
+            }
+
+            /// <summary>Returns the literal with its source text escaped (trivia and value kept), or the node itself if already ASCII.</summary>
+            private static SyntaxNode EscapeLiteral(LiteralExpressionSyntax node)
+            {
+                SyntaxToken token = node.Token;
+
+                if (!HasNonAsciiCharacters(token.Text))
+                {
+                    return node;
+                }
+
+                string convertedText = EscapeNonAsciiCharacters(token.Text);
+                object value = token.Value;
+
+                // Pick the overload matching the value: the string overload always builds a string literal token, which is
+                // the wrong token kind inside a character literal expression.
+                SyntaxToken replacement = (value is char)
+                    ? SyntaxFactory.Literal(token.LeadingTrivia, convertedText, (char)value, token.TrailingTrivia)
+                    : SyntaxFactory.Literal(token.LeadingTrivia, convertedText, token.ValueText, token.TrailingTrivia);
+
+                return node.WithToken(replacement);
+            }
+
+            /// <summary>Returns true when any UTF-16 code unit of <paramref name="value"/> is 0x80 or above.</summary>
+            private static bool HasNonAsciiCharacters(string value)
+            {
+                foreach (char c in value)
+                {
+                    if (c >= 0x80)
+                    {
+                        return true;
+                    }
+                }
+
+                return false;
+            }
+
+            /// <summary>Copies the text, turning surrogate pairs into \UXXXXXXXX and every other non-ASCII code unit into \uXXXX.</summary>
+            private static string EscapeNonAsciiCharacters(string oldValue)
+            {
+                StringBuilder sb = new StringBuilder(oldValue.Length);
+
+                for (int i = 0; i < oldValue.Length; i++)
+                {
+                    if (oldValue[i] < 0x80)
+                    {
+                        sb.Append(oldValue[i]);
+                    }
+                    else if (char.IsHighSurrogate(oldValue[i]) && i + 1 < oldValue.Length && char.IsLowSurrogate(oldValue[i + 1]))
+                    {
+                        sb.AppendFormat(@"\U{0:X8}", char.ConvertToUtf32(oldValue[i], oldValue[i + 1]));
+                        i++; // move past the low surrogate we consumed above.
+                    }
+                    else
+                    {
+                        sb.AppendFormat(@"\u{0:X4}", (ushort)oldValue[i]);
+                    }
+                }
+
+                return sb.ToString();
+            }
+        }
+    }
+}