Skip to content

Commit d795f7c

Browse files
committed
[feat] add format_header
1. still incomplete file. Signed-off-by: clundro <[email protected]>
1 parent cda2f2b commit d795f7c

File tree

2 files changed

+396
-0
lines changed

2 files changed

+396
-0
lines changed

build_support/format_header.py

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
#!/usr/bin/env python3
2+
# encoding: utf-8
3+
#
4+
# Licensed to the Apache Software Foundation (ASF) under one
5+
# or more contributor license agreements. See the NOTICE file
6+
# distributed with this work for additional information
7+
# regarding copyright ownership. The ASF licenses this file
8+
# to you under the Apache License, Version 2.0 (the
9+
# "License"); you may not use this file except in compliance
10+
# with the License. You may obtain a copy of the License at
11+
#
12+
# http://www.apache.org/licenses/LICENSE-2.0
13+
#
14+
# Unless required by applicable law or agreed to in writing,
15+
# software distributed under the License is distributed on an
16+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17+
# KIND, either express or implied. See the License for the
18+
# specific language governing permissions and limitations
19+
# under the License.
20+
#
21+
# Modified from the Apache Arrow project for the Terrier project.
22+
23+
"""Format the ill-formatted code."""
24+
# ==============================================
25+
# GOAL : Format code, Update headers
26+
# ==============================================
27+
28+
# ===----------------------------------------------------------------------===//
29+
#
30+
# CMU-DB Project (15-445/645)
31+
# ***DO NO SHARE PUBLICLY***
32+
#
33+
# Identification: src/include/page/b_plus_tree_page.h
34+
#
35+
# Copyright (c) 2023, Carnegie Mellon University Database Group
36+
#
37+
# ===----------------------------------------------------------------------===//
38+
39+
40+
# ref: https://github.com/cmu-db/bustub/blob/master/script/formatting/formatter.py
41+
42+
43+
import argparse
44+
import logging
45+
import os
46+
import re
47+
import sys
48+
import datetime
49+
import subprocess
50+
from functools import reduce
51+
from helpers import CLANG_FORMAT, BUSTUB_DIR, CLANG_FORMAT_FILE, LOG,\
52+
clang_format, hunks_from_staged_files, hunks_from_last_commits
53+
54+
55+
# Source trees, relative to the repository root, that hold the C++ code.
BUSTUB_SRC_DIR = os.path.join(BUSTUB_DIR, "src")
BUSTUB_TESTS_DIR = os.path.join(BUSTUB_DIR, "test")

# DEFAULT DIRS
DEFAULT_DIRS = [BUSTUB_SRC_DIR, BUSTUB_TESTS_DIR]

# Header framework. The per-file pieces (file name and repository-relative
# path) are appended to header_comment_line_3 / header_comment_line_5 inside
# format_file(); everything else is fixed when this module is loaded.
header_comment_line_1 = (
    "//===----------------------------------------------------------------------===//\n"
    "//\n"
    "// CMU-DB Project (15-445/645)\n"
)
header_comment_line_2 = "// ***DO NO SHARE PUBLICLY***\n"
header_comment_line_3 = "// "
header_comment_line_4 = "//\n"
header_comment_line_5 = "// Identification: "
header_comment_line_6 = "//\n"
# The copyright year is the year in which the script is run.
header_comment_line_7 = (
    "// Copyright (c) %d, Carnegie Mellon University Database Group\n"
    % datetime.datetime.now().year
)
header_comment_line_8 = "//\n"
header_comment_line_9 = "//===----------------------------------------------------------------------===//\n\n"

# Pre-assembled static segments of the header (segments 2 and 4 are built
# per file in format_file()).
header_comment_1 = "".join((header_comment_line_1, header_comment_line_2))
header_comment_3 = header_comment_line_4
header_comment_5 = "".join((
    header_comment_line_6,
    header_comment_line_7,
    header_comment_line_8,
    header_comment_line_9,
))

# Matches zero or more consecutive header blocks at the start of a file.
# NOTE: because of the trailing `*` this pattern also matches the empty
# string, so `HEADER_REGEX.match(...)` never returns None.
HEADER_REGEX = re.compile(
    r"((\/\/===-*===\/\/\n(\/\/.*\n)*\/\/===-*===\/\/[\n]*)\n\n)*")
83+
84+
85+
# ==============================================
86+
# UTILITY FUNCTION DEFINITIONS
87+
# ==============================================
88+
89+
90+
def format_file(file_path, file_hunks, update_header, clang_format_code):
    """Update the header of, or clang-format, a single file.

    Args:
        file_path: Path of the file to process.
        file_hunks: Optional list of (start, end) line ranges, forwarded
            unchanged to clang_format(); None means the whole file.
        update_header: When true, strip any leading header block and write a
            freshly generated one at the top of the file.
        clang_format_code: When true (and update_header is false), run
            clang_format() on the file.

    Note:
        The two actions are mutually exclusive per call: if update_header is
        set, clang_format_code is ignored.
    """
    file_name = os.path.basename(file_path)
    rel_path_from_bustub_dir = os.path.relpath(
        os.path.abspath(file_path), BUSTUB_DIR)

    with open(file_path, "r+") as file:
        file_data = file.read()

        if update_header:
            # HEADER_REGEX can match the empty string, so a match object alone
            # does not mean a header is present; require a non-empty match.
            header_match = HEADER_REGEX.match(file_data)
            if header_match is not None and header_match.group():
                LOG.info("Strip header from %s", file_name)
                LOG.debug("Header comment : %s", header_match.group())
                # Drop only the leading header. Replacing the matched text
                # everywhere would also delete identical comment blocks that
                # appear later in the file.
                file_data = file_data[header_match.end():]

            LOG.info("Add header to %s", file_name)
            header_comment = (
                header_comment_1
                + header_comment_line_3 + file_name + "\n"
                + header_comment_3
                + header_comment_line_5 + rel_path_from_bustub_dir + "\n"
                + header_comment_5
            )

            # Rewrite the file from the beginning with the new header.
            file.seek(0, 0)
            file.truncate()
            file.write(header_comment + file_data)

        elif clang_format_code:
            clang_format(file_path, file_hunks)
129+
130+
131+
def format_dir(dir_path, update_header, clang_format_code):
    """Recursively apply format_file() to every .h / .cpp file under dir_path.

    Files are always processed in full: no hunk list is passed on.
    """
    source_suffixes = (".h", ".cpp")

    for current_dir, _subdirs, entries in os.walk(dir_path):
        for entry in entries:
            candidate = current_dir + os.path.sep + entry
            if not candidate.endswith(source_suffixes):
                continue
            format_file(candidate, None, update_header, clang_format_code)
144+
145+
146+
# ==============================================
147+
# Main Function
148+
# ==============================================
149+
150+
if __name__ == '__main__':

    # ---- command line -------------------------------------------------
    PARSER = argparse.ArgumentParser(
        description='Update headers and/or format source code')

    PARSER.add_argument(
        "-u", "--update-header",
        action='store_true',
        help='Action: Update existing headers or add new ones')
    PARSER.add_argument(
        "-c", "--clang-format-code",
        action='store_true',
        help='Action: Apply clang-format to source code')
    PARSER.add_argument(
        "-f", "--staged-files",
        action='store_true',
        help='Action: Apply the selected action(s) to all staged files (git). '
             '(clang-format will only touch the staged lines)')
    PARSER.add_argument(
        "-n", "--number-commits",
        type=int, default=0,
        help='Action: Apply the selected action(s) to all changes of the last '
             '<n> commits (clang-format will only touch the changed lines)')
    PARSER.add_argument(
        'paths', metavar='PATH', type=str, nargs='*',
        help='Files or directories to (recursively) apply the actions to')

    ARGS = PARSER.parse_args()

    # ---- target selection ---------------------------------------------
    # TARGETS is a list of (filename, hunks) pairs, where hunks is either
    # None (whole file) or a list of 1-based (start, end) line ranges.
    # Precedence: staged files, then the last <n> commits, then explicit paths.
    ERROR = None
    TARGETS = []

    if ARGS.staged_files:
        TARGETS = hunks_from_staged_files()
        if not TARGETS:
            ERROR = "no staged files or not calling from a repository"
    elif ARGS.number_commits > 0:
        TARGETS = hunks_from_last_commits(ARGS.number_commits)
        if not TARGETS:
            ERROR = "no changes could be extracted for formatting"
    elif ARGS.paths:
        TARGETS = [(path, None) for path in ARGS.paths]
    else:
        ERROR = "no files or directories given"

    if ERROR is not None:
        LOG.error(ERROR + " -- exiting")
        sys.exit(ERROR)

    # ---- processing ---------------------------------------------------
    # Targets that are neither a regular file nor a directory are skipped.
    for target, target_hunks in TARGETS:
        if os.path.isfile(target):
            LOG.info("Scanning file: %s", target)
            format_file(target, target_hunks,
                        ARGS.update_header, ARGS.clang_format_code)
        elif os.path.isdir(target):
            LOG.info("Scanning directory %s", target)
            format_dir(target, ARGS.update_header, ARGS.clang_format_code)

build_support/helpers.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#!/usr/bin/env python3
2+
"""Common helper functions to be used in different Python scripts."""
3+
import difflib
4+
import distutils.spawn
5+
import logging
6+
import os
7+
import subprocess
8+
import re
9+
10+
from functools import reduce
11+
12+
# Absolute path of the directory that contains this script.
CODE_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__))
# Repository root, i.e. the parent of the script directory. Taking the parent
# directory (rather than deleting the '/build_support' substring) is
# independent of the path separator and cannot remove an identically named
# component elsewhere in the path.
BUSTUB_DIR = os.path.dirname(CODE_SOURCE_DIR)
CLANG_FORMAT_FILE = os.path.join(BUSTUB_DIR, ".clang-format")

# Paths (relative to BUSTUB_DIR) that clang_check() reports as well formatted
# without inspecting them.
FORMATTING_FILE_WHITELIST = [
    # Fill me
]

# "+++ b/<path>" line of a git diff; group 1 is the path of the new file.
DIFF_FILE_PATT = re.compile(r'^\+\+\+ b\/(.*)')
# "@@ -a[,b] +c[,d] @@" hunk header; group 2 is c (first line in the new
# file) and group 4 is d (number of lines), or None when d is omitted.
DIFF_HUNK_PATT = re.compile(r'^@@ \-\d+(,\d+)? \+(\d+)(,)?(\d+)? @@.*')
22+
23+
# ==============================================
24+
# LOGGING CONFIGURATION
25+
# ==============================================
26+
27+
# Module logger: INFO and above, written through a stream handler with a
# "<timestamp> [<function>:<line>] <LEVEL>: <message>" layout.
LOG = logging.getLogger(__name__)
LOG.setLevel(logging.INFO)

LOG_FORMATTER = logging.Formatter(
    fmt='%(asctime)s [%(funcName)s:%(lineno)03d] %(levelname)-5s: %(message)s',
    datefmt='%m-%d-%Y %H:%M:%S',
)

LOG_HANDLER = logging.StreamHandler()
LOG_HANDLER.setFormatter(LOG_FORMATTER)
LOG.addHandler(LOG_HANDLER)
36+
37+
38+
def find_clangformat():
    """Locate a clang-format executable on PATH.

    The candidate names are tried in order and the first hit wins.

    Returns:
        The path of the executable, or None when none of the candidates is
        found.
    """
    # Function-scope import: shutil is not among this module's top-level
    # imports. shutil.which replaces distutils.spawn.find_executable, which
    # is gone together with distutils in Python 3.12.
    import shutil

    for exe in ("clang-format", "clang-format-13", "clang-format-14"):
        path = shutil.which(exe)
        if path is not None:
            return path
    return None


# Resolved once at import time; None when clang-format is not installed.
CLANG_FORMAT = find_clangformat()
CLANG_COMMAND_PREFIX = [CLANG_FORMAT, "-style=file"]
51+
52+
53+
def clang_check(file_path, hunks=None):
    """Check a file's formatting against clang-format and log deviations.

    The file is not modified: clang-format is run with inline=False and its
    output is diffed against the file's current contents.

    Args:
        file_path: Path of the file to check; must be a non-empty string.
        hunks: Accepted for signature compatibility but not used; the whole
            file is always checked.

    Returns:
        True when the file is whitelisted or no line of the file differs from
        the clang-format output, False otherwise.
    """
    assert file_path is not None and file_path != ""

    rel_path_from_bustub_dir = os.path.relpath(file_path, BUSTUB_DIR)
    if rel_path_from_bustub_dir in FORMATTING_FILE_WHITELIST:
        return True

    # clang_format() returns a list of byte strings (line endings kept);
    # decode them so they can be diffed against text lines.
    formatted_src = [
        line.decode('utf-8')
        for line in clang_format(file_path, None, inline=False)
    ]

    # Read the source the same way the formatted text was produced: UTF-8 and
    # without newline translation. Otherwise a non-UTF-8 locale or CRLF line
    # endings would show up as formatting differences.
    with open(file_path, "r", encoding="utf-8", newline="") as file:
        src = file.readlines()

    file_status = True
    line_num = 0
    for line in difflib.Differ().compare(src, formatted_src):
        code = line[:2]
        # Only lines present in the original file advance the line counter.
        if code in ("  ", "- "):
            line_num += 1
        if code == '- ':
            if file_status:
                # Announce the file once, before its first offending line.
                LOG.info("Invalid formatting in file : %s", file_path)
            LOG.info("Line %d: %s", line_num, line[2:].strip())
            file_status = False

    return file_status
92+
93+
94+
def clang_format(file_path, hunks=None, inline=True):
    """Run clang-format on the file at file_path.

    Args:
        file_path: Path of the file to format; must be a non-empty string.
        hunks: Optional list of (start, end) line-number pairs, 1 based.
            Each pair whose two values are positive is passed to
            clang-format as a "-lines=start:end" restriction.
        inline: When true, pass "-i" so clang-format rewrites the file in
            place.

    Returns:
        The captured standard output of clang-format as a list of byte
        strings, one per line, with line endings kept.

    Raises:
        SystemExit: With status 1 when no clang-format executable was found.
        subprocess.CalledProcessError: When clang-format exits with a
            non-zero status.
    """
    # Function-scope import: sys is not among this module's top-level imports.
    import sys

    assert file_path is not None and file_path != ""

    if CLANG_FORMAT is None:
        LOG.error("clang-format seems not installed")
        # Exit with a failure status so calling scripts and CI notice; the
        # bare exit() used previously reported success (status 0).
        sys.exit(1)

    formatting_command = CLANG_COMMAND_PREFIX + [file_path]

    if inline:
        formatting_command.append("-i")

    if hunks is not None:
        formatting_command.extend(
            "-lines={}:{}".format(start, end)
            for start, end in hunks
            if start > 0 and end > 0
        )

    LOG.info(' '.join(formatting_command))
    return subprocess.check_output(formatting_command).splitlines(True)
118+
119+
120+
def hunks_from_last_commits(n):
    """Return the changed hunks between HEAD~n and the working tree.

    Runs `git diff` without context lines (and without deleted files) and
    hands the resulting lines to _hunks_from_diff().

    Args:
        n: Number of commits to go back; must be positive.
    """
    assert n > 0

    git_command = [
        "git", "diff", "HEAD~" + str(n), "--diff-filter=d", "--unified=0",
    ]
    raw_diff = subprocess.check_output(git_command)
    return _hunks_from_diff(raw_diff.decode("utf-8").splitlines())
129+
130+
131+
def hunks_from_staged_files():
    """Return the changed hunks of the files currently staged in git.

    Runs `git diff HEAD --cached` without context lines (and without deleted
    files) and hands the resulting lines to _hunks_from_diff().
    """
    git_command = [
        "git", "diff", "HEAD",
        "--cached", "--diff-filter=d", "--unified=0",
    ]
    raw_diff = subprocess.check_output(git_command)
    return _hunks_from_diff(raw_diff.decode("utf-8").splitlines())
137+
138+
139+
def _hunks_from_diff(diff_output):
    """Parse git diff output and collect the changed hunks per file.

    The diff output must not have additional (context) lines!
    (use --unified=0)

    Args:
        diff_output: Iterable of diff lines (without line endings).

    Returns:
        A list of (file_path, hunks) pairs, one per .h / .cpp file in the
        diff. file_path is absolute (resolved against BUSTUB_DIR) and hunks is
        a list of inclusive, 1-based (start, end) line ranges in the new
        version of the file.
    """
    target_files = []

    # hunks_current_list always refers to the hunk list of the most recently
    # seen file. For files that are not .h / .cpp the list is never added to
    # target_files, so their hunks are collected and then dropped.
    hunks_current_list = []

    for line in diff_output:
        file_match = DIFF_FILE_PATT.search(line)
        if file_match:
            file_path = os.path.abspath(
                os.path.join(BUSTUB_DIR, file_match.group(1)))
            hunks_current_list = []
            if file_path.endswith((".h", ".cpp")):
                target_files.append((file_path, hunks_current_list))
            continue

        hunk_match = DIFF_HUNK_PATT.search(line)
        if hunk_match:
            start = int(hunk_match.group(2))
            # An omitted count in "+start[,count]" means a single line.
            count_text = hunk_match.group(4)
            count = 1 if count_text is None else int(count_text)
            # Ranges are inclusive, so `count` lines starting at `start` end
            # at start + count - 1. A pure deletion (count == 0) stays
            # anchored to the single line `start`.
            end = start + max(count, 1) - 1
            hunks_current_list.append((start, end))

    return target_files

0 commit comments

Comments
 (0)