Skip to content
Open
Show file tree
Hide file tree
Changes from 23 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
394 changes: 394 additions & 0 deletions clang-tools-extra/clang-tidy/tool/check_alphabetical_order.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,394 @@
#!/usr/bin/env python3
#
# ===-----------------------------------------------------------------------===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#

"""

Clang-Tidy Alphabetical Order Checker
=====================================

Normalize Clang-Tidy documentation with deterministic sorting for linting/tests.

Behavior:
- Sort entries in docs/clang-tidy/checks/list.rst csv-table.
- Sort key sections in docs/ReleaseNotes.rst.
- Detect duplicated entries in 'Changes in existing checks'.

Flags:
-o/--output Write normalized content to this path instead of updating docs.
"""

import argparse
import io
import os
import re
import sys
from typing import Dict, List, Optional, Sequence, Tuple, Union, overload
from operator import itemgetter

# Matches a :doc:`label <path>` or :doc:`label` reference anywhere in text and
# captures the label. Used to derive the sort key of ReleaseNotes bullet items.
# The label group is non-greedy so that whitespace separating the label from
# an optional <path> is not captured as part of the label; this keeps the key
# identical for the `label <path>` and bare `label` spellings.
DOC_LABEL_RN_RE = re.compile(r":doc:`(?P<label>[^`<]+?)\s*(?:<[^>]+>)?`")

# Matches a single csv-table row line in list.rst that begins with a :doc:
# reference, capturing the label. Used to extract the sort key per row.
DOC_LINE_RE = re.compile(r"^\s*:doc:`(?P<label>[^`<]+?)\s*<[^>]+>`.*$")


# Documentation locations, built relative to the directory of this script.
EXTRA_DIR = os.path.join(os.path.dirname(__file__), "../..")
DOCS_DIR = os.path.join(EXTRA_DIR, "docs")
CLANG_TIDY_DOCS_DIR = os.path.join(DOCS_DIR, "clang-tidy")
CHECKS_DOCS_DIR = os.path.join(CLANG_TIDY_DOCS_DIR, "checks")
LIST_DOC = os.path.join(CHECKS_DOCS_DIR, "list.rst")
RELEASE_NOTES_DOC = os.path.join(DOCS_DIR, "ReleaseNotes.rst")


def read_text(path: str) -> List[str]:
    """Read the UTF-8 text file at `path` and return its lines.

    Line terminators are kept on each returned line.
    """
    with io.open(path, mode="r", encoding="utf-8") as stream:
        content = stream.read()
    return content.splitlines(keepends=True)


def write_text(path: str, content: str) -> None:
    """Write `content` to `path` as UTF-8, with newline translation disabled."""
    stream = io.open(path, "w", encoding="utf-8", newline="")
    with stream:
        stream.write(content)


def _normalize_list_rst_lines(lines: Sequence[str]) -> List[str]:
    """Return normalized content of checks list.rst as a list of lines.

    Every line is copied through unchanged, except that the run of rows
    following each ``.. csv-table::`` directive is sorted by check name.
    """
    out: List[str] = []
    i = 0
    n = len(lines)

    def check_name_key(line: str) -> Tuple[int, str]:
        # Rows starting with a :doc: reference sort by their check name; any
        # other line in the run sorts after them and keeps its relative order
        # (sorted() is stable).
        m = DOC_LINE_RE.match(line)
        if not m:
            return (1, "")
        return (0, m.group("label"))

    while i < n:
        line = lines[i]
        if line.lstrip().startswith(".. csv-table::"):
            out.append(line)
            i += 1

            # Copy directive options / blank lines verbatim up to the first row.
            while i < n and (lines[i].startswith(" ") or lines[i].strip() == ""):
                if DOC_LINE_RE.match(lines[i]):
                    break
                out.append(lines[i])
                i += 1

            # The rows: consecutive lines that start with a space.
            entries: List[str] = []
            while i < n and lines[i].startswith(" "):
                entries.append(lines[i])
                i += 1

            # A row without a line terminator (normally only the very last
            # line of a file) would be glued to the following row if sorting
            # moved it. Terminate it for sorting, then drop the terminator
            # from whichever row ends up last so the input's ending is kept.
            unterminated = bool(entries) and not entries[-1].endswith(("\n", "\r"))
            if unterminated:
                entries[-1] += "\n"
            entries_sorted = sorted(entries, key=check_name_key)
            if unterminated:
                entries_sorted[-1] = entries_sorted[-1][:-1]
            out.extend(entries_sorted)
            continue

        out.append(line)
        i += 1

    return out


@overload
def normalize_list_rst(data: str) -> str: ...


@overload
def normalize_list_rst(data: List[str]) -> List[str]: ...


def normalize_list_rst(data: Union[str, List[str]]) -> Union[str, List[str]]:
    """Normalize list.rst content, returning the same kind of value it was given.

    - A string is split into lines (terminators kept), normalized, and joined
      back into a single string.
    - Anything else is treated as a list of lines and a list of lines is
      returned.
    """
    if not isinstance(data, str):
        return _normalize_list_rst_lines(data)
    normalized_lines = _normalize_list_rst_lines(data.splitlines(True))
    return "".join(normalized_lines)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to support both str and List[str]? Can we stick to only one way of processing and use it everywhere? IMO, there is no point in writing complex code.

If we really need to pass raw str, then just do data.splitlines(True) on caller side, and we don't have to write 20 lines of additional code here.



def find_heading(lines: Sequence[str], title: str) -> Optional[int]:
    """Locate a heading that is underlined with ^ characters.

    A heading is a line equal to `title` (ignoring trailing newlines) whose
    next line consists only of ^ characters and is at least as long as the
    title. This is the ReleaseNotes subsection style handled by this script.

    Returns the index of the first matching title line, or None if there is
    no such heading.
    """
    min_width = len(title)
    for idx, (candidate, following) in enumerate(zip(lines, lines[1:])):
        if candidate.rstrip("\n") != title:
            continue
        rule = following.rstrip("\n")
        # set("") is empty, so an empty underline never compares equal here.
        if set(rule) == {"^"} and len(rule) >= min_width:
            return idx
    return None


def extract_label(text: str) -> str:
    """Return the label of the first :doc: reference in `text`.

    If `text` contains no :doc: reference, `text` itself is returned.
    """
    found = DOC_LABEL_RN_RE.search(text)
    if found is None:
        return text
    return found.group("label")


def is_bullet_start(line: str) -> bool:
    """Return True if `line` begins with the bullet marker "- " at column 0."""
    bullet_marker = "- "
    return line[: len(bullet_marker)] == bullet_marker


# A bullet item: its first ("- ...") line plus all of its continuation lines.
BulletBlock = List[str]
# A bullet item paired with the sort key extracted from its first line.
KeyedBlock = Tuple[str, BulletBlock]
# (prefix lines, keyed bullet blocks, suffix lines) of one parsed range.
ParsedSection = Tuple[List[str], List[KeyedBlock], List[str]]


def parse_bullet_blocks(
    lines: Sequence[str], start: int, end: int
) -> ParsedSection:
    """Split lines[start:end] into leading text, bullet items and trailing text.

    Returns a (prefix, blocks, suffix) tuple:

    - prefix: the lines before the first bullet line.
    - blocks: one (key, block_lines) pair per bullet item, in input order.
      block_lines starts at the bullet line and runs up to the next bullet
      line, a heading, or `end`; key is extract_label() of the bullet line.
    - suffix: the lines from where bullet parsing stopped up to `end`.

    For start <= end, concatenating prefix, every block_lines and suffix
    reproduces lines[start:end].
    """
    first_bullet = start
    while first_bullet < end and not is_bullet_start(lines[first_bullet]):
        first_bullet += 1
    prefix = list(lines[start:first_bullet])

    blocks: List[KeyedBlock] = []
    i = first_bullet
    while i < end and is_bullet_start(lines[i]):
        block_start = i
        i += 1
        while i < end and not is_bullet_start(lines[i]):
            # A non-blank line followed by a line made only of ^ is a
            # heading: it ends the current item and all bullet parsing.
            if (
                i + 1 < end
                and set(lines[i + 1].rstrip("\n")) == {"^"}
                and lines[i].strip()
            ):
                break
            i += 1
        block = list(lines[block_start:i])
        blocks.append((extract_label(block[0]), block))

    suffix = list(lines[i:end])
    return prefix, blocks, suffix


def sort_blocks(blocks: List[Tuple[str, List[str]]]) -> List[List[str]]:
    """Order (label, block) pairs by label and return only the blocks.

    The sort is stable, so blocks sharing a label keep their input order.
    Duplicates are preserved; merging is left to authors to handle manually.
    """
    ordered = sorted(blocks, key=lambda keyed: keyed[0])
    return [body for _, body in ordered]


def find_duplicate_entries(
    lines: Sequence[str], title: str
) -> List[Tuple[str, List[Tuple[int, List[str]]]]]:
    """Return detailed duplicate info as (key, [(start_idx, block_lines), ...]).

    Only the bullet items of the section headed by `title` are examined; an
    empty list is returned when that section is not found.

    start_idx is the 0-based index of the first line of the bullet block in
    the original lines list. Only keys with more than one occurrence are
    returned, sorted by key, and occurrences are listed in the order they
    appear.
    """
    bounds = _find_section_bounds(lines, title, None)
    if bounds is None:
        return []
    _, sec_start, sec_end = bounds

    # Reuse the shared bullet parser instead of repeating its scanning loop.
    # Blocks are contiguous and begin right after the prefix, so every start
    # index follows from the lengths of what precedes the block.
    prefix, blocks, _ = parse_bullet_blocks(lines, sec_start, sec_end)

    grouped: Dict[str, List[Tuple[int, List[str]]]] = {}
    block_start = sec_start + len(prefix)
    for key, block in blocks:
        grouped.setdefault(key, []).append((block_start, block))
        block_start += len(block)

    return sorted(
        ((key, occs) for key, occs in grouped.items() if len(occs) > 1),
        key=itemgetter(0),
    )


def _find_section_bounds(
    lines: Sequence[str], title: str, next_title: Optional[str]
) -> Optional[Tuple[int, int, int]]:
    """Return (h_start, sec_start, sec_end) for section `title`.

    - h_start: index of the section title line
    - sec_start: index of the first content line after underline
    - sec_end: index of the first line of the next section title, or
      len(lines) when no later heading exists

    Returns None when the `title` heading is not found.
    """
    h_start = find_heading(lines, title)
    if h_start is None:
        return None

    sec_start = h_start + 2

    # Prefer the explicitly named next section, but only when it really lies
    # after this section's content. A heading found earlier in the file would
    # give sec_end < sec_start and make callers splice lines twice.
    sec_end: Optional[int] = None
    if next_title is not None:
        sec_end = find_heading(lines, next_title)
        if sec_end is not None and sec_end < sec_start:
            sec_end = None

    if sec_end is None:
        # Scan forward for the next heading-like pair: a non-blank line
        # followed by a line made only of ^. Without one, the section runs to
        # the end of the input, last line included.
        sec_end = len(lines)
        for idx in range(sec_start, len(lines) - 1):
            if lines[idx].strip() and set(lines[idx + 1].rstrip("\n")) == {"^"}:
                sec_end = idx
                break

    return h_start, sec_start, sec_end


def _normalize_release_notes_section(
    lines: Sequence[str], title: str, next_title: Optional[str]
) -> List[str]:
    """Return a copy of `lines` with the bullets of section `title` sorted.

    When the section cannot be located, an unchanged copy of `lines` is
    returned.
    """
    bounds = _find_section_bounds(lines, title, next_title)
    if bounds is None:
        return list(lines)
    sec_start, sec_end = bounds[1], bounds[2]

    prefix, blocks, suffix = parse_bullet_blocks(lines, sec_start, sec_end)

    rebuilt: List[str] = list(prefix)
    for position, block in enumerate(sort_blocks(blocks)):
        # Separate consecutive items with a blank line unless the text
        # emitted so far already ends with one.
        needs_separator = position > 0 and (
            not rebuilt or rebuilt[-1].strip() != ""
        )
        if needs_separator:
            rebuilt.append("\n")
        rebuilt.extend(block)
    rebuilt.extend(suffix)

    return [*lines[:sec_start], *rebuilt, *lines[sec_end:]]


def normalize_release_notes(lines: Sequence[str]) -> str:
    """Normalize the known ReleaseNotes sections and return the joined text."""
    sections = ["New checks", "New check aliases", "Changes in existing checks"]
    # Pair every section with the title that follows it; the last has none.
    following: List[Optional[str]] = [*sections[1:], None]

    out = list(lines)
    # Sections are processed from the last one to the first one.
    for title, next_title in reversed(list(zip(sections, following))):
        out = _normalize_release_notes_section(out, title, next_title)

    return "".join(out)


def _emit_duplicate_report(lines: Sequence[str], title: str) -> Optional[str]:
    """Build an error report for duplicated entries in section `title`.

    Returns None when there are no duplicates; otherwise a text listing each
    duplicated key with the 1-based line and content of every occurrence.
    """
    duplicates = find_duplicate_entries(lines, title)
    if not duplicates:
        return None

    parts: List[str] = [f"Error: Duplicate entries in '{title}':\n"]
    for key, occurrences in duplicates:
        parts.append(f"\n-- Duplicate: {key}\n")
        for start_idx, block in occurrences:
            parts.append(f"- At line {start_idx + 1}:\n")
            parts.extend(block)
            # Make sure each quoted block ends with a newline.
            if not block or not block[-1].endswith("\n"):
                parts.append("\n")
    return "".join(parts)


def process_release_notes(out_path: str, rn_doc: str) -> int:
    """Write the normalized form of `rn_doc` to `out_path`.

    Returns 3 when the input is already normalized but 'Changes in existing
    checks' contains duplicate entries (the report goes to stderr), and 0
    otherwise.
    """
    lines = read_text(rn_doc)
    original = "".join(lines)
    normalized = normalize_release_notes(lines)
    write_text(out_path, normalized)

    if original == normalized:
        # Ordering is clean then enforce duplicates.
        report = _emit_duplicate_report(lines, "Changes in existing checks")
        if not report:
            return 0
        sys.stderr.write(report)
        return 3

    # Prefer reporting ordering issues first; let diff fail the test.
    sys.stderr.write(
        "Note: 'ReleaseNotes.rst' is not normalized; Please fix ordering first.\n"
    )
    return 0


def process_checks_list(out_path: str, list_doc: str) -> int:
    """Write the normalized form of `list_doc` to `out_path`; always returns 0."""
    source = "".join(read_text(list_doc))
    write_text(out_path, normalize_list_rst(source))
    return 0


def main(argv: Sequence[str]) -> int:
    """Run the checker with command-line arguments `argv`.

    With -o/--output, one document is normalized into that path: the release
    notes when the output file name contains "release" (case-insensitive),
    the checks list otherwise. Without it, both documents are rewritten in
    place when their normalized form differs.

    Returns the process exit code: 3 when duplicate entries are reported for
    'Changes in existing checks', otherwise 0.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-o", "--output", dest="out", default=None)
    args = ap.parse_args(argv)

    list_doc, rn_doc = (os.path.normpath(LIST_DOC), os.path.normpath(RELEASE_NOTES_DOC))

    if args.out:
        out_path = args.out
        out_lower = os.path.basename(out_path).lower()
        if "release" in out_lower:
            return process_release_notes(out_path, rn_doc)
        return process_checks_list(out_path, list_doc)

    list_lines = read_text(list_doc)
    rn_lines = read_text(rn_doc)
    list_norm = normalize_list_rst("".join(list_lines))
    rn_norm = normalize_release_notes(rn_lines)
    if "".join(list_lines) != list_norm:
        write_text(list_doc, list_norm)
    if "".join(rn_lines) != rn_norm:
        write_text(rn_doc, rn_norm)

    # Report duplicates against the normalized text: that is what the file
    # contains from here on, so the reported line numbers match it even when
    # the release notes were just reordered.
    report = _emit_duplicate_report(
        rn_norm.splitlines(True), "Changes in existing checks"
    )
    if report:
        sys.stderr.write(report)
        return 3
    return 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we reuse process_release_notes and process_checks_list here instead of custom logic?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

report = _emit_duplicate_report(rn_lines, "Changes in existing checks")
    if report:
        sys.stderr.write(report)
        return 3
    return 0

It is duplicated here and in process_release_notes.



# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
Loading
Loading