From 73f5754c47b8b269aae3c8c9140db6506c9537af Mon Sep 17 00:00:00 2001
From: justinsb
Date: Tue, 9 Sep 2025 15:21:34 +0000
Subject: [PATCH] WIP: tooling to set copyright headers

---
 dev/tools/fix-boilerplate   |  43 +++++
 dev/tools/shared/headers.py | 334 ++++++++++++++++++++++++++++++++++++
 2 files changed, 377 insertions(+)
 create mode 100755 dev/tools/fix-boilerplate
 create mode 100755 dev/tools/shared/headers.py

diff --git a/dev/tools/fix-boilerplate b/dev/tools/fix-boilerplate
new file mode 100755
index 000000000..dc98969ed
--- /dev/null
+++ b/dev/tools/fix-boilerplate
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import sys
+
+# Assuming fix-boilerplate is in dev/tools and headers.py is in dev/tools/shared
+script_dir = os.path.dirname(os.path.realpath(__file__))
+# Add dev/tools to sys.path to allow importing shared.headers
+sys.path.append(script_dir)
+
+from shared import headers
+
+def main():
+    # Find the repo root from the script's location
+    repo_root = os.path.abspath(os.path.join(script_dir, '..', '..'))
+
+    # Excludes from the original fix-boilerplate script
+    excludes = [
+        '_archived/**',
+        'databases/**',
+        'web/**',
+    ]
+
+    print(f"Scanning for license headers in {repo_root}")
+    headers.apply_headers_to_tree(repo_root, excludes=excludes)
+    print("Done.")
+
+if __name__ == "__main__":
+    main()
diff --git a/dev/tools/shared/headers.py b/dev/tools/shared/headers.py
new file mode 100755
index 000000000..ac3f7fac7
--- /dev/null
+++ b/dev/tools/shared/headers.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python3
+
+# Copyright 2025 The Kubernetes Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpers for adding Apache 2.0 license headers to the files in a source tree."""
+
+import os
+import fnmatch
+import argparse
+import datetime
+
+# The license header to apply
+APACHE_HEADER = """Copyright {year} The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+# Mapping of file extensions to their comment syntax
+# (line_prefix, block_start, block_end)
+COMMENT_STYLES = {
+    ".go": ("// ", None, None),
+    ".sh": ("# ", None, None),
+    ".py": ("# ", None, None),
+    ".js": ("// ", None, None),
+    ".ts": ("// ", None, None),
+    ".java": ("// ", None, None),
+    ".scala": ("// ", None, None),
+    ".c": ("// ", None, None),
+    ".h": ("// ", None, None),
+    ".cpp": ("// ", None, None),
+    ".tf": ("# ", None, None),
+    # Block comments for file types that support them
+    ".css": (None, "/*", " */"),
+    ".xml": (None, "<!--", "-->"),
+    ".html": (None, "<!--", "-->"),
+}
+
+# Default glob patterns to exclude, relative to the root directory
+DEFAULT_EXCLUDES = [
+    ".git/**",
+    ".idea/**",
+    "__pycache__/**",
+    "node_modules/**",
+    "vendor/**",
+    "**/*.yaml",
+    "**/*.yml",
+    "**/LICENSE",
+    "**/*.md",
+    "**/OWNERS",
+    "**/SECURITY_CONTACTS",
+    "go.mod",
+    "go.sum",
+    "*.json",
+    "*.pyc",
+    "*.so",
+    "*.o",
+    "*.a",
+    "*.dll",
+    "*.exe",
+    "*.jar",
+    "*.class",
+    "*.zip",
+    "*.tar.gz",
+    "*.tgz",
+    "*.rar",
+    "*.7z",
+    "*.log",
+    "*.sum",
+    "*.DS_Store",
+]
+
+def file_extension_magic(file_path):
+    """Tries to determine the file type, as encoded by a typical extension.
+
+    Args:
+        file_path: path of the file to inspect.
+
+    Returns:
+        The file's own extension if it has one; otherwise ".py" or ".sh" when
+        the first line is a recognised shebang; otherwise None (including
+        when the file cannot be read or is not valid UTF-8).
+    """
+    # Default to the file extension
+    _, ext = os.path.splitext(file_path)
+    if ext:
+        return ext
+    # Look for a shebang line
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            # Read the first 4k of the file, which should be enough for any header.
+            content = f.read(4096)
+    except UnicodeDecodeError:
+        # Likely a binary file
+        return None
+    except OSError as e:
+        # e.g. a dangling symlink; skip the file instead of aborting the whole scan
+        print(f"Could not read file {file_path}: {e}")
+        return None
+    # First line is shebang (e.g., #!/usr/bin/env python)
+    first_line = content.split('\n', 1)[0]
+    if first_line.startswith("#!"):
+        if "python" in first_line:
+            return ".py"
+        if "bash" in first_line or "sh" in first_line:
+            return ".sh"
+        print(f"unknown shebang in {file_path}: {first_line}")
+    return None
+
+def get_comment_style(file_extension):
+    """Gets the comment style for a file based on its extension.
+
+    Returns:
+        A (line_prefix, block_start, block_end) tuple from COMMENT_STYLES, or
+        None if the extension is not supported.
+    """
+    return COMMENT_STYLES.get(file_extension)
+
+def format_header(header_text, style):
+    """Formats the header text with the correct comment style.
+
+    Args:
+        header_text: the plain license text, one header line per text line.
+        style: a (line_prefix, block_start, block_end) tuple.
+
+    Returns:
+        The commented header followed by one blank line, or None if the style
+        defines neither a line prefix nor a complete block comment.
+    """
+    line_prefix, block_start, block_end = style
+
+    header_lines = header_text.strip().split('\n')
+
+    if line_prefix:
+        # Add a space for line prefixes if they don't have one
+        if not line_prefix.endswith(' '):
+            line_prefix += ' '
+        # rstrip() keeps blank header lines free of trailing whitespace
+        formatted_lines = [f"{line_prefix}{line}".rstrip() for line in header_lines]
+        return '\n'.join(formatted_lines) + '\n\n'
+
+    if block_start and block_end:
+        # Handle block comments
+        formatted_lines = [f" {line}".rstrip() for line in header_lines]
+        return f"{block_start}\n" + '\n'.join(formatted_lines) + f"\n{block_end}\n\n"
+
+    return None
+
+
+def has_license_header(file_path):
+    """Checks if a file already has an Apache license header.
+
+    Returns:
+        True if the first 4k of the file mention both the Apache 2.0 license
+        and The Kubernetes Authors. Also True if the file cannot be read, so
+        that callers skip it.
+    """
+    try:
+        with open(file_path, 'r', encoding='utf-8') as f:
+            # Read the first 4k of the file, which should be enough for any header.
+            content = f.read(4096)
+    except (OSError, ValueError):
+        # Unreadable or not valid UTF-8 (UnicodeDecodeError is a ValueError): skip file
+        return True
+    return ("Licensed under the Apache License, Version 2.0" in content
+            and "The Kubernetes Authors" in content)
+
+
+def _insert_header(content, formatted_header):
+    """Returns content with formatted_header inserted at the top.
+
+    A byte order mark, and a first line that must stay first (a shebang or an
+    XML declaration), are kept ahead of the header.
+    """
+    bom = ''
+    if content.startswith('\ufeff'):
+        bom, content = content[0], content[1:]
+    if '\r\n' in content:
+        # Match the line endings of a CRLF file
+        formatted_header = formatted_header.replace('\n', '\r\n')
+    # Handle shebangs (e.g., #!/usr/bin/env python) and <?xml ...?> declarations
+    if content.startswith(("#!", "<?xml")):
+        first_line, _, rest_of_content = content.partition('\n')
+        return bom + first_line + '\n' + formatted_header + rest_of_content
+    return bom + formatted_header + content
+
+
+def apply_license_header(file_path, header_text, dry_run=False):
+    """Applies the license header to a single file if it doesn't have one.
+
+    Files of unknown or unsupported type, and files that already carry the
+    header, are left untouched.
+
+    Args:
+        file_path: path of the file to update in place.
+        header_text: the plain (uncommented) license text.
+        dry_run: if True, only report the files that would be changed.
+    """
+    file_extension = file_extension_magic(file_path)
+    if not file_extension:
+        # Unknown file type
+        return
+
+    style = get_comment_style(file_extension)
+    if not style:
+        # Unsupported extension
+        return
+
+    if has_license_header(file_path):
+        return
+
+    formatted_header = format_header(header_text, style)
+    if not formatted_header:
+        # Could not format header
+        return
+
+    print(f"Applying header to: {file_path}")
+    if dry_run:
+        return
+    try:
+        # newline='' turns off newline translation, so the existing line
+        # endings are written back exactly as they were read.
+        with open(file_path, 'r+', encoding='utf-8', newline='') as f:
+            content = f.read()
+            f.seek(0)
+            f.write(_insert_header(content, formatted_header))
+            # Never leave stale bytes behind the rewritten content
+            f.truncate()
+    except (OSError, ValueError) as e:
+        print(f"Could not write to file {file_path}: {e}")
+
+
+def _match_path_parts(path_parts, pattern_parts):
+    """Recursively matches path components against pattern components.
+
+    Args:
+        path_parts: the remaining components of the path.
+        pattern_parts: the remaining components of the pattern; a `**`
+            component matches zero or more path components.
+
+    Returns:
+        True if the pattern matches the whole of the remaining path.
+    """
+    if not pattern_parts:
+        return not path_parts
+    if not path_parts:
+        # Only empty or `**` components can match nothing
+        return all(p in ('', '**') for p in pattern_parts)
+
+    p_part = pattern_parts[0]
+    if p_part == '**':
+        if len(pattern_parts) == 1:
+            return True  # `/**` at the end matches everything remaining
+        # `/**/` can match zero or more directories.
+        return any(_match_path_parts(path_parts[i:], pattern_parts[1:])
+                   for i in range(len(path_parts) + 1))
+    if fnmatch.fnmatch(path_parts[0], p_part):
+        return _match_path_parts(path_parts[1:], pattern_parts[1:])
+    return False
+
+def is_path_excluded(relative_path, exclude_patterns):
+    """Checks if a relative path matches any of the .gitignore-style exclude patterns.
+
+    A pattern without a slash is matched against every component of the path;
+    a pattern with a slash is matched against the whole path, from the root.
+    """
+    relative_path = relative_path.replace(os.path.sep, '/')
+    path_parts = relative_path.split('/')
+
+    for pattern in exclude_patterns:
+        pattern = pattern.replace(os.path.sep, '/')
+        if '/' not in pattern:
+            # If no slash, match against any component of the path
+            if any(fnmatch.fnmatch(part, pattern) for part in path_parts):
+                return True
+        else:
+            # If slash is present, match from the root
+            pattern_parts = pattern.split('/')
+            if _match_path_parts(path_parts, pattern_parts):
+                return True
+    return False
+
+
+def apply_headers_to_tree(root_dir, excludes=None, dry_run=False):
+    """Applies headers to all files in a repository, respecting excludes.
+
+    Args:
+        root_dir: directory to walk.
+        excludes: extra glob patterns, relative to root_dir, to skip in
+            addition to DEFAULT_EXCLUDES.
+        dry_run: if True, only report the files that would be changed.
+    """
+    year = datetime.datetime.now().year
+    header_text = APACHE_HEADER.format(year=year)
+
+    all_excludes = DEFAULT_EXCLUDES + list(excludes or [])
+    print(f"Excluding patterns: {all_excludes}")
+
+    for root, dirs, files in os.walk(root_dir, topdown=True):
+        rel_root = os.path.relpath(root, root_dir)
+        if rel_root == '.':
+            rel_root = ''
+
+        # Filter dirs in-place so os.walk doesn't recurse into them
+        dirs[:] = [d for d in dirs if not is_path_excluded(os.path.join(rel_root, d), all_excludes)]
+
+        for file in files:
+            rel_path = os.path.join(rel_root, file)
+            if is_path_excluded(rel_path, all_excludes):
+                continue
+
+            full_path = os.path.join(root, file)
+            apply_license_header(full_path, header_text, dry_run)