From 73f5754c47b8b269aae3c8c9140db6506c9537af Mon Sep 17 00:00:00 2001
From: justinsb <justinsb@google.com>
Date: Tue, 9 Sep 2025 15:21:34 +0000
Subject: [PATCH] WIP: tooling to set copyright headers

---
 dev/tools/fix-boilerplate   |  43 ++++++
 dev/tools/shared/headers.py | 266 ++++++++++++++++++++++++++++++++++++
 2 files changed, 309 insertions(+)
 create mode 100755 dev/tools/fix-boilerplate
 create mode 100755 dev/tools/shared/headers.py

diff --git a/dev/tools/fix-boilerplate b/dev/tools/fix-boilerplate
new file mode 100755
index 000000000..dc98969ed
--- /dev/null
+++ b/dev/tools/fix-boilerplate
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import sys
+
+# This script lives in dev/tools; realpath() resolves symlinks so script_dir is the real directory.
+script_dir = os.path.dirname(os.path.realpath(__file__))
+# Make the sibling `shared` package (dev/tools/shared/headers.py) importable.
+sys.path.append(script_dir)
+
+from shared import headers
+
+def main():
+    """Apply license headers to the repository that contains this script."""
+    # script_dir is dev/tools, so the repository root is two levels up.
+    repo_root = os.path.abspath(os.path.join(script_dir, os.pardir, os.pardir))
+
+    # Top-level trees skipped by the original fix-boilerplate script; passed
+    # to the shared helper as extra exclude patterns.
+    extra_excludes = ['_archived/**', 'databases/**', 'web/**']
+
+    print(f"Scanning for license headers in {repo_root}")
+    headers.apply_headers_to_tree(repo_root, excludes=extra_excludes)
+    print("Done.")
+
+
+# Run only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/dev/tools/shared/headers.py b/dev/tools/shared/headers.py
new file mode 100755
index 000000000..ac3f7fac7
--- /dev/null
+++ b/dev/tools/shared/headers.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import fnmatch
+import argparse
+import datetime
+
+# License header template; "{year}" is substituted via str.format() before use.
+APACHE_HEADER = """Copyright {year} The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+# Maps a file extension to its comment syntax as a 3-tuple:
+# (line_prefix, block_start, block_end); each entry sets either line_prefix or the block pair.
+COMMENT_STYLES = {
+    ".go": ("// ", None, None),
+    ".sh": ("# ", None, None),
+    ".py": ("# ", None, None),
+    ".js": ("// ", None, None),
+    ".ts": ("// ", None, None),
+    ".java": ("// ", None, None),
+    ".scala": ("// ", None, None),
+    ".c": ("// ", None, None),
+    ".h": ("// ", None, None),
+    ".cpp": ("// ", None, None),
+    ".tf": ("# ", None, None),
+    # These types use a block comment (start/end delimiters) instead of a per-line prefix
+    ".css": (None, "/*", " */"),
+    ".xml": (None, "<!--", "-->"),
+    ".html": (None, "<!--", "-->"),
+}
+
+# Default exclude globs: patterns containing '/' match from the root directory, others match any path component.
+DEFAULT_EXCLUDES = [
+    ".git/**",
+    ".idea/**",
+    "__pycache__/**",
+    "node_modules/**",
+    "vendor/**",
+    "**/*.yaml",
+    "**/*.yml",
+    "**/LICENSE",
+    "**/*.md",
+    "**/OWNERS",
+    "**/SECURITY_CONTACTS",
+    "go.mod",
+    "go.sum",
+    "*.json",
+    "*.pyc",
+    "*.so",
+    "*.o",
+    "*.a",
+    "*.dll",
+    "*.exe",
+    "*.jar",
+    "*.class",
+    "*.zip",
+    "*.tar.gz",
+    "*.tgz",
+    "*.rar",
+    "*.7z",
+    "*.log",
+    "*.sum",
+    "*.DS_Store",
+]
+
+def file_extension_magic(file_path):
+    """Return the extension identifying a file's type, or None if unknown.
+
+    A real extension wins; otherwise a shebang containing "python" maps to
+    ".py" and one containing "sh" (sh, bash, zsh, ...) maps to ".sh". Files
+    that cannot be opened or decoded as UTF-8 yield None instead of raising.
+    """
+    _, ext = os.path.splitext(file_path)
+    if ext:
+        return ext
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            content = f.read(4096)  # 4k is plenty to contain a shebang line
+    except (OSError, UnicodeDecodeError):
+        return None
+    first_line = content.split('\n', 1)[0]
+    if first_line.startswith("#!"):
+        if "python" in first_line:
+            return ".py"
+        if "sh" in first_line:
+            return ".sh"
+        print(f"unknown shebang in {file_path}: {first_line}")
+    return None
+
+def get_comment_style(file_extension):
+    """Look up the (line_prefix, block_start, block_end) tuple for an extension; None if unsupported."""
+    return COMMENT_STYLES[file_extension] if file_extension in COMMENT_STYLES else None
+
+def format_header(header_text, style):
+    """Render header_text as a comment in the given style.
+
+    style is a (line_prefix, block_start, block_end) tuple. A truthy
+    line_prefix produces one comment line per header line; otherwise a truthy
+    block_start/block_end pair wraps the text in a block comment. The result
+    ends with a blank line, or is None when the style provides neither form.
+    """
+    prefix, opener, closer = style
+    body = header_text.strip().split('\n')
+
+    if prefix:
+        # Guarantee one space between the comment marker and the text.
+        if not prefix.endswith(' '):
+            prefix = prefix + ' '
+        # rstrip() keeps blank header lines free of trailing whitespace.
+        commented = [f"{prefix}{text}".rstrip() for text in body]
+        return '\n'.join(commented) + '\n\n'
+
+    if opener and closer:
+        indented = [f" {text}".rstrip() if text else "" for text in body]
+        return f"{opener}\n" + '\n'.join(indented) + f"\n{closer}\n\n"
+    return None
+
+
+def has_license_header(file_path):
+    """Report whether the first 4k of a file already carries the project header.
+
+    Both the Apache 2.0 grant line and "The Kubernetes Authors" must appear.
+    Files that cannot be read or decoded as UTF-8 also return True, so that
+    callers leave them untouched.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            head = f.read(4096)
+    except (OSError, ValueError):  # unreadable or not UTF-8 text: skip it
+        return True
+    required = ("Licensed under the Apache License, Version 2.0", "The Kubernetes Authors")
+    return all(marker in head for marker in required)
+
+
+def apply_license_header(file_path, header_text, dry_run=False):
+    """Insert the license header at the top of one file unless it has one.
+
+    The file is skipped silently when its type is unknown or unsupported, or
+    when has_license_header() reports a header (or an unreadable file). A
+    leading shebang or XML declaration stays on the first line, because both
+    are only valid there, and the header follows it. With dry_run=True the
+    file is reported but not modified.
+    """
+    file_extension = file_extension_magic(file_path)
+    if not file_extension or has_license_header(file_path):
+        return
+
+    style = get_comment_style(file_extension)
+    if not style:
+        return
+    formatted_header = format_header(header_text, style)
+    if not formatted_header:
+        return
+    print(f"Applying header to: {file_path}")
+    if dry_run:
+        return
+    try:
+        # newline='' disables newline translation so existing line endings
+        # are written back exactly as they were read.
+        with open(file_path, 'r+', encoding='utf-8', newline='') as f:
+            content = f.read()
+            if '\r\n' in content:
+                # Match a CRLF file instead of introducing mixed endings.
+                formatted_header = formatted_header.replace('\n', '\r\n')
+            first_line, newline, rest = content.partition('\n')
+            if content.startswith(("#!", "<?xml")):
+                content = first_line + (newline or '\n') + formatted_header + rest
+            else:
+                content = formatted_header + content
+            f.seek(0)
+            f.write(content)
+    except (OSError, UnicodeError) as e:
+        print(f"Could not write to file {file_path}: {e}")
+
+
+def _match_path_parts(path_parts, pattern_parts):
+    """Match a list of path components against a list of glob components.
+
+    '**' stands for any number of components (including none); every other
+    pattern component is compared to a single path component with fnmatch.
+    """
+    if not pattern_parts:
+        # Pattern used up: a match only if the path is used up too.
+        return not path_parts
+    if not path_parts:
+        # Path used up: only a lone '**' or all-empty components still match.
+        return pattern_parts == ['**'] or all(p == '' for p in pattern_parts)
+    head, tail = pattern_parts[0], pattern_parts[1:]
+    if head != '**':
+        return fnmatch.fnmatch(path_parts[0], head) and _match_path_parts(path_parts[1:], tail)
+    if not tail:
+        return True  # a trailing '**' swallows whatever remains
+    # Let '**' absorb 0..len(path_parts) leading components and retry.
+    return any(_match_path_parts(path_parts[skip:], tail)
+               for skip in range(len(path_parts) + 1))
+
+def is_path_excluded(relative_path, exclude_patterns):
+    """Return True if relative_path matches any gitignore-like exclude pattern.
+
+    A pattern without '/' is tested against every component of the path; a
+    pattern with '/' is matched component-by-component starting at the root.
+    OS-specific separators in the path and the patterns are normalised to '/'.
+    """
+    components = relative_path.replace(os.path.sep, '/').split('/')
+
+    def hits(raw_pattern):
+        glob = raw_pattern.replace(os.path.sep, '/')
+        if '/' in glob:
+            return _match_path_parts(components, glob.split('/'))
+        return any(fnmatch.fnmatch(name, glob) for name in components)
+
+    # any() short-circuits on the first pattern that matches.
+    return any(hits(candidate) for candidate in exclude_patterns)
+
+
+def apply_headers_to_tree(root_dir, excludes=None, dry_run=False):
+    """Walk root_dir and add the license header to every eligible file.
+
+    excludes is an optional list of extra glob patterns appended to
+    DEFAULT_EXCLUDES; excluded directories are not descended into. The header
+    is stamped with the current year. dry_run is passed through to
+    apply_license_header() for each file.
+    """
+    header_text = APACHE_HEADER.format(year=datetime.datetime.now().year)
+
+    all_excludes = DEFAULT_EXCLUDES + (excludes or [])
+    print(f"Excluding patterns: {all_excludes}")
+
+    for current_dir, subdirs, filenames in os.walk(root_dir, topdown=True):
+        rel_dir = os.path.relpath(current_dir, root_dir)
+        # relpath() yields '.' for the root itself; joining onto '' instead
+        # keeps the relative paths free of a leading './'.
+        rel_dir = '' if rel_dir == '.' else rel_dir
+
+        # Prune in place: with topdown=True os.walk skips removed entries.
+        subdirs[:] = [name for name in subdirs
+                      if not is_path_excluded(os.path.join(rel_dir, name), all_excludes)]
+        for name in filenames:
+            if not is_path_excluded(os.path.join(rel_dir, name), all_excludes):
+                apply_license_header(os.path.join(current_dir, name), header_text, dry_run)