
Commit 6a81f73

refactor: format code with black and make the Python report table view more intuitive

- Format the code with black.
- Handle AnnAssign, AugAssign, and Assign node classes and map them to descriptive names for a prettier view in the report table (see the sketch below).
- Add lineno to the report table for Python works.
- Add missing unit tests.

Refs: #175
1 parent 612b5ab commit 6a81f73
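
The second bullet is the core of the change: assignment nodes in Python's `ast` module already carry enough information (target, operator, line number) to build readable row labels. Below is a minimal sketch of that idea; the helper name `describe_assignment` and the label format are assumptions for illustration, not codeplag's actual implementation.

```python
import ast
from typing import Optional


def _target_name(target: ast.expr) -> str:
    # Handle the simple cases only: plain names and attribute access.
    if isinstance(target, ast.Name):
        return target.id
    if isinstance(target, ast.Attribute):
        return f"{_target_name(target.value)}.{target.attr}"
    return type(target).__name__


def describe_assignment(node: ast.stmt) -> Optional[str]:
    """Return a short, human-readable label for an assignment-like node (hypothetical helper)."""
    if isinstance(node, ast.Assign):
        names = ", ".join(_target_name(t) for t in node.targets)
        return f"Assign to {names} (line {node.lineno})"
    if isinstance(node, ast.AnnAssign):
        return f"Annotated assign to {_target_name(node.target)} (line {node.lineno})"
    if isinstance(node, ast.AugAssign):
        op = type(node.op).__name__  # e.g. Add, Sub
        return f"Augmented assign ({op}) to {_target_name(node.target)} (line {node.lineno})"
    return None


if __name__ == "__main__":
    tree = ast.parse("x: int = 1\nx += 2\nobj.attr = x")
    for stmt in tree.body:
        print(describe_assignment(stmt))
```

Running the snippet prints one label per statement, for example `Annotated assign to x (line 1)`; the project's real report code may format names and line numbers differently.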

Note: large commits have some content hidden by default, so only a subset of the changed files is shown below.

43 files changed: +1713 / -1896 lines

.pre-commit-config.yaml

Lines changed: 4 additions & 0 deletions
@@ -1,6 +1,10 @@
 default_language_version:
   python: python3.8
 repos:
+  - repo: https://github.com/psf/black
+    rev: 23.9.1
+    hooks:
+      - id: black
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.0.287
     hooks:

Makefile

Lines changed: 2 additions & 1 deletion
@@ -1,4 +1,4 @@
-UTIL_VERSION := 0.3.6
+UTIL_VERSION := 0.3.7
 UTIL_NAME := codeplag
 PWD := $(shell pwd)
 
@@ -10,6 +10,7 @@ BASE_DOCKER_TAG := $(shell echo $(UTIL_NAME)-base-ubuntu20.04:$(BASE_DOC
 TEST_DOCKER_TAG := $(shell echo $(UTIL_NAME)-test-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
 DOCKER_TAG ?= $(shell echo $(UTIL_NAME)-ubuntu20.04:$(UTIL_VERSION) | tr A-Z a-z)
 
+PYTHONDONTWRITEBYTECODE := "1"
 PYTHONPATH := $(PWD)/src/:$(PWD)/test/auto
 
 LOGS_PATH := /var/log/$(UTIL_NAME)

README.md

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@
 $ register-python-argcomplete codeplag >> ~/.bashrc
 ```
 
-## 5. Demo examples (works in the project directory and with an installed codeplag package)
+## 4. Demo examples (works in the project directory and with an installed codeplag package)
 
 - Python analyzer
 ```

docs/notebooks/utils.py

Lines changed: 54 additions & 68 deletions
@@ -34,15 +34,14 @@ def remove_unnecessary_blank_lines(source_code: str) -> str:
 
 
 def get_data_from_dir(
-    path: str = './data',
-    max_count_lines: Optional[int] = None
+    path: str = "./data", max_count_lines: Optional[int] = None
 ) -> pd.DataFrame:
     df = pd.DataFrame()
     for filename in os.listdir(path):
-        if not re.search(r'.csv$', filename):
+        if not re.search(r".csv$", filename):
             continue
 
-        tmp_df = pd.read_csv(os.path.join(path, filename), sep=';', index_col=0)
+        tmp_df = pd.read_csv(os.path.join(path, filename), sep=";", index_col=0)
         df = df.append(tmp_df, ignore_index=True)
 
     if max_count_lines:
@@ -52,43 +51,43 @@
 
 
 def save_works_from_repo_url(url: str, check_policy: bool = True) -> None:
-    current_repo_name = url.split('/')[-1]
-    env_config = Config(RepositoryEnv('../../.env'))
+    current_repo_name = url.split("/")[-1]
+    env_config = Config(RepositoryEnv("../../.env"))
     gh = GitHubParser(
-        file_extensions=(re.compile(r'.py$'),),
+        file_extensions=(re.compile(r".py$"),),
         check_all=check_policy,
-        access_token=env_config.get('ACCESS_TOKEN')
+        access_token=env_config.get("ACCESS_TOKEN"),
     )
     files = list(gh.get_files_generator_from_repo_url(url))
     files = [(remove_unnecessary_blank_lines(file.code), file.link) for file in files]
 
     df = pd.DataFrame(
         {
-            'content': [file_[0] for file_ in files[:-1]],
-            'link': [file_[1] for file_ in files[:-1]],
-            'extension': ['py'] * (len(files) - 1),
-            'repo_name': [current_repo_name] * (len(files) - 1),
-            'content_len': [len(file_[0]) for file_ in files[:-1]],
-            'content_len_without_blank': [
-                len(file_[0].replace(' ', '').replace('\n', '').replace('\t', ''))
+            "content": [file_[0] for file_ in files[:-1]],
+            "link": [file_[1] for file_ in files[:-1]],
+            "extension": ["py"] * (len(files) - 1),
+            "repo_name": [current_repo_name] * (len(files) - 1),
+            "content_len": [len(file_[0]) for file_ in files[:-1]],
+            "content_len_without_blank": [
+                len(file_[0].replace(" ", "").replace("\n", "").replace("\t", ""))
                 for file_ in files[:-1]
             ],
-            'count_lines_without_blank_lines': [
+            "count_lines_without_blank_lines": [
                 len(file_[0].splitlines()) for file_ in files[:-1]
-            ]
+            ],
         }
     )
-    df = df[df['count_lines_without_blank_lines'] > 5]
-    df.to_csv(os.path.join('./data/', current_repo_name + '.csv'), sep=';')
+    df = df[df["count_lines_without_blank_lines"] > 5]
+    df.to_csv(os.path.join("./data/", current_repo_name + ".csv"), sep=";")
 
 
 def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
     count_lines = []
     to_meta_time = []
-    for (index, content) in df[
-        ['content', 'link', 'count_lines_without_blank_lines']
+    for index, content in df[
+        ["content", "link", "count_lines_without_blank_lines"]
     ].iterrows():
-        print(index, " " * 20, end='\r')
+        print(index, " " * 20, end="\r")
         for _ in range(iterations):
             tree = get_ast_from_content(content[0], content[1])
             if tree is None:
@@ -102,12 +101,7 @@ def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
             except Exception:
                 break
 
-    output = pd.DataFrame(
-        {
-            'count_lines': count_lines,
-            'times': to_meta_time
-        }
-    )
+    output = pd.DataFrame({"count_lines": count_lines, "times": to_meta_time})
 
     return output
 
@@ -118,7 +112,7 @@ def plot_and_save_result(
     ylabel: str,
     title: str,
     what: str,
-    trend: Literal['linear', 'n^2', 'n^3', 'n^4'] = 'linear'
+    trend: Literal["linear", "n^2", "n^3", "n^4"] = "linear",
 ) -> None:
     # Simple Moving average
     unique_count_lines = np.unique(df.count_lines)
@@ -135,75 +129,72 @@
     plt.figure(figsize=(12, 12), dpi=80)
     # plt.plot(unique_count_lines, mean_times, label='Среднее')
 
-    if trend == 'linear':
+    if trend == "linear":
         z = np.polyfit(unique_count_lines, mean_times, 1)
         p = np.poly1d(z)
         plt.plot(
-            unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.'
+            unique_count_lines, p(unique_count_lines), "r--", label="Линейный тренд."
         )
-    elif trend == 'n^2':
+    elif trend == "n^2":
         popt_cons, _ = curve_fit(
             square_func,
             unique_count_lines,
             mean_times,
-            bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100])
+            bounds=([-np.inf, 0.0, 0.0], [np.inf, 0.1**100, 0.1**100]),
         )
         p = np.poly1d(popt_cons)
         plt.plot(
             unique_count_lines,
             p(unique_count_lines),
-            "r--", label='Квадратичный тренд.'
+            "r--",
+            label="Квадратичный тренд.",
         )
-    elif trend == 'n^3':
+    elif trend == "n^3":
        popt_cons, _ = curve_fit(
            cube_func,
            unique_count_lines,
            mean_times,
-           bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100])
+           bounds=(
+               [-np.inf, 0.0, 0.0, 0.0],
+               [np.inf, 0.1**100, 0.1**100, 0.1**100],
+           ),
        )
        p = np.poly1d(popt_cons)
        plt.plot(
-           unique_count_lines,
-           p(unique_count_lines),
-           "r--",
-           label='Кубический тренд.'
+           unique_count_lines, p(unique_count_lines), "r--", label="Кубический тренд."
        )
-    elif trend == 'n^4':
+    elif trend == "n^4":
        popt_cons, _ = curve_fit(
            quart_func,
            unique_count_lines,
            mean_times,
            bounds=(
-               [-np.inf, 0., 0., 0., 0.],
-               [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]
-           )
+               [-np.inf, 0.0, 0.0, 0.0, 0.0],
+               [np.inf, 0.1**100, 0.1**100, 0.1**100, 0.1**100],
+           ),
        )
        p = np.poly1d(popt_cons)
-       plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='n^4.')
+       plt.plot(unique_count_lines, p(unique_count_lines), "r--", label="n^4.")
     else:
         raise Exception(f"Incorrect tred '{trend}'.")
 
     rolling = pd.DataFrame(
-        {
-            'unique_count_lines': unique_count_lines,
-            'mean_times': mean_times
-        }
+        {"unique_count_lines": unique_count_lines, "mean_times": mean_times}
     )
     num_window = 20
     plt.plot(
         rolling.unique_count_lines,
         rolling.mean_times.rolling(window=num_window).mean(),
-        label=f'Скользящее среднее по {num_window}ти замерам.'
+        label=f"Скользящее среднее по {num_window}ти замерам.",
     )
 
     plt.ylabel(ylabel, fontsize=15)
     plt.xlabel(xlabel, fontsize=15)
     plt.title(title, fontsize=17)
-    plt.legend(loc='upper left')
+    plt.legend(loc="upper left")
     plt.savefig(
-        './graphics/need_time_{}_{}.png'.format(
-            what,
-            datetime.now().strftime("%d%m%Y_%H%M%S")
+        "./graphics/need_time_{}_{}.png".format(
+            what, datetime.now().strftime("%d%m%Y_%H%M%S")
        )
    )
 
@@ -212,7 +203,7 @@ def get_time_algorithms(
     df: pd.DataFrame,
     work,
     iterations: int = 5,
-    metric: Literal['fast', 'gst', 'structure'] = 'fast'
+    metric: Literal["fast", "gst", "structure"] = "fast",
 ) -> pd.DataFrame:
     count_lines = []
     times = []
@@ -221,11 +212,11 @@
         raise Exception("Unexpected error when parsing first work.")
 
     features1 = get_features_from_ast(tree1, work.link)
-    for (index, content) in df[
-        ['content', 'link', 'count_lines_without_blank_lines']
+    for index, content in df[
+        ["content", "link", "count_lines_without_blank_lines"]
     ].iterrows():
         for _ in range(iterations):
-            print(index, " " * 20, end='\r')
+            print(index, " " * 20, end="\r")
             tree2 = get_ast_from_content(content[0], content[1])
             if tree2 is None:
                 continue
@@ -234,34 +225,29 @@
             except Exception:
                 continue
 
-            if metric == 'fast':
+            if metric == "fast":
                 start = perf_counter()
                 value_jakkar_coef(features1.tokens, features2.tokens)
                 counter_metric(features1.operators, features2.operators)
                 counter_metric(features1.keywords, features2.keywords)
                 counter_metric(features1.literals, features2.literals)
                 end = perf_counter() - start
                 times.append(end)
-            elif metric == 'gst':
+            elif metric == "gst":
                 start = perf_counter()
                 gst(features1.tokens, features2.tokens, 6)
                 end = perf_counter() - start
                 times.append(end)
-            elif metric == 'structure':
+            elif metric == "structure":
                 start = perf_counter()
                 struct_compare(features1.structure, features2.structure)
                 end = perf_counter() - start
                 times.append(end)
             else:
-                raise Exception('Incorrect metric!')
+                raise Exception("Incorrect metric!")
 
             count_lines.append(content[2])
 
-    output = pd.DataFrame(
-        {
-            'count_lines': count_lines,
-            'times': times
-        }
-    )
+    output = pd.DataFrame({"count_lines": count_lines, "times": times})
 
     return output

setup.py

Lines changed: 35 additions & 35 deletions
@@ -5,58 +5,58 @@
 from setuptools import find_packages, setup
 
 INSTALL_REQUIREMENTS = [
-    'argcomplete~=2.0.0',
-    'numpy~=1.23.5',
-    'pandas~=1.4.3',
-    'ccsyspath~=1.1.0',
-    'clang~=16.0.1.1',
-    'llvmlite~=0.40.1',
-    'libclang~=16.0.0',
-    'python-decouple~=3.6',
-    'requests~=2.31.0',
-    'typing-extensions~=4.3.0',
-    'aiohttp~=3.8.5',
-    'cachetools==5.3.1',
-    'gidgethub~=5.3.0',
+    "argcomplete~=2.0.0",
+    "numpy~=1.23.5",
+    "pandas~=1.4.3",
+    "ccsyspath~=1.1.0",
+    "clang~=16.0.1.1",
+    "llvmlite~=0.40.1",
+    "libclang~=16.0.0",
+    "python-decouple~=3.6",
+    "requests~=2.31.0",
+    "typing-extensions~=4.3.0",
+    "aiohttp~=3.8.5",
+    "cachetools==5.3.1",
+    "gidgethub~=5.3.0",
 ]
-UTIL_NAME = os.getenv('UTIL_NAME')
-UTIL_VERSION = os.getenv('UTIL_VERSION')
+UTIL_NAME = os.getenv("UTIL_NAME")
+UTIL_VERSION = os.getenv("UTIL_VERSION")
 
 
-if '--install-requirements' in sys.argv:
-    print(' '.join(INSTALL_REQUIREMENTS))
+if "--install-requirements" in sys.argv:
+    print(" ".join(INSTALL_REQUIREMENTS))
     sys.exit(0)
 elif UTIL_NAME is None or UTIL_VERSION is None:
-    print('Please provide UTIL_NAME and UTIL_VERSION environment variables.')
+    print("Please provide UTIL_NAME and UTIL_VERSION environment variables.")
     sys.exit(1)
 
 
 setup(
-    name=f'{UTIL_NAME}',
-    version=f'{UTIL_VERSION}',
-    description='Code plagiarism searching package',
-    author='Artyom Semidolin, Dmitry Nikolaev, Alexander Evsikov',
-    url='https://github.com/OSLL/code-plagiarism',
+    name=f"{UTIL_NAME}",
+    version=f"{UTIL_VERSION}",
+    description="Code plagiarism searching package",
+    author="Artyom Semidolin, Dmitry Nikolaev, Alexander Evsikov",
+    url="https://github.com/OSLL/code-plagiarism",
     long_description=Path("README.md").read_text(encoding="utf-8"),
     long_description_content_type="text/markdown",
-    license='MIT License',
+    license="MIT License",
     platforms=["linux"],
     classifiers=[
-        'Development Status :: 4 - Beta',
-        'Environment :: Console',
-        'Intended Audience :: Education',
-        'License :: OSI Approved :: MIT License',
-        'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.8',
-        'Topic :: Software Development :: Plagiarism Detection',
+        "Development Status :: 4 - Beta",
+        "Environment :: Console",
+        "Intended Audience :: Education",
+        "License :: OSI Approved :: MIT License",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.8",
+        "Topic :: Software Development :: Plagiarism Detection",
     ],
     package_dir={"": "src"},
     packages=find_packages("src"),
-    python_requires='>=3.8',
+    python_requires=">=3.8",
    install_requires=INSTALL_REQUIREMENTS,
    entry_points={
-        'console_scripts': [
-            'codeplag = codeplag:main',
+        "console_scripts": [
+            "codeplag = codeplag:main",
        ]
-    }
+    },
 )
