Start implementing language annotation support

dyegoaurelio · dyegoaurelio · commit 461ff10ca429 · 2025-09-23T19:44:14.000-03:00
Add initial support for language annotations such as `/* lua */`:
a block comment that directly precedes a string literal stays a block comment,
while other single-line block comments are still converted to line comments.

- Detect language annotations: single-line, non-doc block comments whose content is a valid language identifier
- Preserve as `/* lang */` block comment syntax instead of converting to `# lang` line comments
- Works with both regular strings `"..."` and indented strings `''...''`
diff --git a/src/Nixfmt/Lexer.hs b/src/Nixfmt/Lexer.hs
@@ -6,11 +6,12 @@
 module Nixfmt.Lexer (lexeme, pushTrivia, takeTrivia, whole) where
 
 import Control.Monad.State.Strict (MonadState, evalStateT, get, modify, put)
-import Data.Char (isSpace)
+import Data.Char (isAlphaNum, isSpace)
 import Data.List (dropWhileEnd)
 import Data.Maybe (fromMaybe)
 import Data.Text as Text (
   Text,
+  all,
   isPrefixOf,
   length,
   lines,
@@ -29,6 +30,7 @@ import Data.Void (Void)
 import Nixfmt.Types (
   Ann (..),
   Parser,
+  Token (TDoubleQuote, TDoubleSingleQuote),
   TrailingComment (..),
   Trivia,
   Trivium (..),
@@ -43,9 +45,11 @@ import Text.Megaparsec (
   chunk,
   getSourcePos,
   hidden,
+  lookAhead,
   many,
   manyTill,
   notFollowedBy,
+  optional,
   some,
   try,
   unPos,
@@ -59,6 +63,8 @@ data ParseTrivium
     PTLineComment Text Pos
   | -- Track whether it is a doc comment
     PTBlockComment Bool [Text]
+  | -- | Language annotation like /* lua */ (single line, non-doc)
+    PTLanguageAnnotation Text
   deriving (Show)
 
 preLexeme :: Parser a -> Parser a
@@ -148,6 +154,7 @@ convertLeading =
         PTBlockComment _ [] -> []
         PTBlockComment False [c] -> [LineComment $ " " <> strip c]
         PTBlockComment isDoc cs -> [BlockComment isDoc cs]
+        PTLanguageAnnotation c -> [LanguageAnnotation c]
     )
 
 isTrailing :: ParseTrivium -> Bool
@@ -156,17 +163,93 @@ isTrailing (PTBlockComment False []) = True
 isTrailing (PTBlockComment False [_]) = True
 isTrailing _ = False
 
-convertTrivia :: [ParseTrivium] -> Pos -> (Maybe TrailingComment, Trivia)
-convertTrivia pts nextCol =
+-- Whether the text, once stripped, is non-empty, at most 30 characters long, and made up only of alphanumerics and the characters - + . _ $ { }
+isLanguageIdentifier :: Text -> Bool
+isLanguageIdentifier content =
+  let stripped = strip content
+  in not (Text.null stripped)
+      && Text.length stripped <= 30 -- TODO: make configurable or remove limit
+      && Text.all (\c -> isAlphaNum c || c `elem` ['-', '+', '.', '_', '$', '{', '}']) stripped
+
+-- Whether the looked-ahead token opens a string literal (TDoubleQuote or TDoubleSingleQuote); Nothing counts as "no string"
+isStringToken :: Maybe Token -> Bool
+isStringToken (Just TDoubleQuote) = True
+isStringToken (Just TDoubleSingleQuote) = True
+isStringToken _ = False
+
+-- Turn a block comment's content into a language annotation if the next token opens a string and the content is a valid language identifier
+toLangAnnotation :: Text -> Maybe Token -> Maybe ParseTrivium
+toLangAnnotation content nextToken
+  | isStringToken nextToken && isLanguageIdentifier content =
+      Just (PTLanguageAnnotation (strip content))
+  | otherwise = Nothing
+
+convertTrivia :: [ParseTrivium] -> Pos -> Maybe Token -> (Maybe TrailingComment, Trivia)
+convertTrivia pts nextCol nextToken =
   let (trailing, leading) = span isTrailing pts
-  in case (trailing, leading) of
+      (trailing', leading') = processTrailing trailing leading
+      leading'' = case trailing' of
+        [] | not (Prelude.null trailing) -> leading' -- trailing was converted, don't process leading
+        _ -> processLeading leading' -- process leading normally
+  in case (trailing', leading'') of
       -- Special case: if the trailing comment visually forms a block with the start of the following line,
       -- then treat it like part of those comments instead of a distinct trailing comment.
       -- This happens especially often after `{` or `[` tokens, where the comment of the first item
       -- starts on the same line ase the opening token.
-      ([PTLineComment _ pos], (PTNewlines 1) : (PTLineComment _ pos') : _) | pos == pos' -> (Nothing, convertLeading pts)
-      ([PTLineComment _ pos], [PTNewlines 1]) | pos == nextCol -> (Nothing, convertLeading pts)
-      _ -> (convertTrailing trailing, convertLeading leading)
+      ([PTLineComment _ pos], (PTNewlines 1) : (PTLineComment _ pos') : _)
+        | pos == pos' -> (Nothing, convertLeading pts)
+      ([PTLineComment _ pos], [PTNewlines 1])
+        | pos == nextCol -> (Nothing, convertLeading pts)
+      _ -> (convertTrailing trailing', convertLeading leading'')
+  where
+    hasLineComment = Prelude.any (\case PTLineComment{} -> True; _ -> False)
+    hasLangCandidate = Prelude.any (\case PTBlockComment False [c] -> isLanguageIdentifier c; _ -> False)
+
+    -- Pull out the rightmost qualifying block comment as a language annotation, returning the remaining trivia (original order kept) alongside it
+    convertLastBlockToLang triviaList = go (reverse triviaList) []
+      where
+        go [] _ = Nothing
+        go (PTBlockComment False [content] : rest) processed
+          | Just langAnnotation <- toLangAnnotation content nextToken =
+              Just (reverse rest ++ processed, langAnnotation)
+        go (t : rest) processed = go rest (t : processed)
+
+    -- Process trailing trivia for language annotations
+    processTrailing trailing leading
+      -- Single trailing block comment before string
+      | [PTBlockComment False [content]] <- trailing,
+        Just langAnnotation <- toLangAnnotation content nextToken =
+          ([], langAnnotation : leading)
+      -- Multiple trailing comments before string (no line comments)
+      | isStringToken nextToken && not (hasLineComment trailing) && hasLangCandidate trailing,
+        Just (newTrailing, langAnnotation) <- convertLastBlockToLang trailing =
+          (newTrailing, langAnnotation : leading)
+      -- No conversion needed
+      | otherwise = (trailing, leading)
+
+    -- Process leading trivia for language annotations
+    processLeading leading
+      -- First item is convertible block comment
+      | PTBlockComment False [content] : rest <- leading,
+        Just langAnnotation <- toLangAnnotation content nextToken =
+          langAnnotation : rest
+      -- Otherwise convert the rightmost qualifying comment and move it to the end of the leading trivia (past anything that followed it)
+      | Just (newLeading, langAnnotation) <- convertLastBlockToLang leading =
+          newLeading ++ [langAnnotation]
+      -- No conversion needed
+      | otherwise = leading
+
+-- Recognise the opener of a string literal after skipping trivia; fails on anything else and consumes input itself, so callers wrap it in lookAhead
+parseNextTokenType :: Parser Token
+parseNextTokenType = do
+  -- Skip any trivia that might appear before the next token
+  _ <- many (hidden $ lineComment <|> blockComment <|> newlines)
+  -- Skip any remaining whitespace other than line breaks
+  _ <- manyP (\x -> isSpace x && x /= '\n' && x /= '\r')
+  TDoubleQuote
+    <$ chunk "\""
+      <|> TDoubleSingleQuote
+    <$ chunk "''"
 
 trivia :: Parser [ParseTrivium]
 trivia = many $ hidden $ lineComment <|> blockComment <|> newlines
@@ -188,7 +271,11 @@ lexeme p = do
   parsedTrivia <- trivia
   -- This is the position of the next lexeme after the currently parsed one
   SourcePos{sourceColumn = col} <- getSourcePos
-  let (trailing, nextLeading) = convertTrivia parsedTrivia col
+
+  -- Peek, without consuming input, at whether a string literal follows; Nothing if it does not
+  nextToken <- optional (try $ lookAhead $ preLexeme parseNextTokenType)
+
+  let (trailing, nextLeading) = convertTrivia parsedTrivia col nextToken
   pushTrivia nextLeading
   return $
     Ann
diff --git a/src/Nixfmt/Pretty.hs b/src/Nixfmt/Pretty.hs
@@ -86,6 +86,7 @@ instance Pretty TrailingComment where
 instance Pretty Trivium where
   pretty EmptyLine = emptyline
   pretty (LineComment c) = comment ("#" <> c) <> hardline
+  pretty (LanguageAnnotation lang) = comment ("/* " <> lang <> " */") <> hardspace
   pretty (BlockComment isDoc c) =
     comment (if isDoc then "/**" else "/*")
       <> hardline
@@ -109,6 +110,8 @@ prettyItems (Items items) = sepBy hardline items
 
 instance Pretty [Trivium] where
   pretty [] = mempty
+  -- Special case: if trivia consists only of a single language annotation, render it inline without a preceding hardline
+  pretty [langAnnotation@(LanguageAnnotation _)] = pretty langAnnotation
   pretty trivia = hardline <> hcat trivia
 
 instance (Pretty a) => Pretty (Ann a) where
diff --git a/src/Nixfmt/Types.hs b/src/Nixfmt/Types.hs
@@ -72,6 +72,8 @@ data Trivium
   | -- Multi-line comments with /* or /**. Multiple # comments are treated as a list of `LineComment`.
     -- The bool indicates a doc comment (/**)
     BlockComment Bool [Text]
+  | -- | Language annotation comments like /* lua */ that should remain as block comments before strings
+    LanguageAnnotation Text
   deriving (Eq, Show)
 
 type Trivia = [Trivium]
diff --git a/test/diff/language-annotation/out-pure.nix b/test/diff/language-annotation/out-pure.nix
@@ -42,8 +42,8 @@
   ";
 
   # Multiple block comments in sequence
-  sequentialComments = # first second
-    ''
+  sequentialComments = # first
+    /* second */ ''
       some content
     '';
 
diff --git a/test/diff/language-annotation/out.nix b/test/diff/language-annotation/out.nix
@@ -42,8 +42,8 @@
   ";
 
   # Multiple block comments in sequence
-  sequentialComments = # first second
-    ''
+  sequentialComments = # first
+    /* second */ ''
       some content
     '';