Skip to content

Commit 612b5ab

Browse files
authored
feat: unite reports in the csv file
- Save check reports in a CSV file; - Use ruff instead of flake8; - Bump library versions for the time-survey script; - Add more type annotations. Refs: #176, #158.
1 parent 8db7da1 commit 612b5ab

File tree

27 files changed

+559
-192
lines changed

27 files changed

+559
-192
lines changed

.flake8

Lines changed: 0 additions & 9 deletions
This file was deleted.

.github/workflows/check_n_push_image.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@ jobs:
2828
with:
2929
python-version: 3.8
3030

31-
- name: Lint with flake8, isort and pyright
31+
- name: Lint with ruff and pyright
3232
run: |
3333
make substitute-sources
3434
pip install $(python3 setup.py --install-requirements)
35-
pip install -r docs/notebooks/requirements.txt
36-
pip install pre-commit==2.20.0
35+
pip install --requirement docs/notebooks/requirements.txt
36+
pip install pre-commit==3.4.0
3737
make pre-commit
3838
3939
docker-build-test-autotest:

.pre-commit-config.yaml

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,10 @@
11
default_language_version:
22
python: python3.8
33
repos:
4-
- repo: https://github.com/PyCQA/isort
5-
rev: 5.12.0
4+
- repo: https://github.com/astral-sh/ruff-pre-commit
5+
rev: v0.0.287
66
hooks:
7-
- id: isort
8-
- repo: https://github.com/PyCQA/flake8
9-
rev: 5.0.4
10-
hooks:
11-
- id: flake8
12-
additional_dependencies:
13-
- flake8-bugbear==22.8.23
14-
- flake8-comprehensions==3.10.0
15-
- flake8-simplify==0.19.3
16-
- mccabe==0.7.0
7+
- id: ruff
178
- repo: local
189
hooks:
1910
- id: pyright
@@ -22,4 +13,4 @@ repos:
2213
language: node
2314
pass_filenames: false
2415
types: [ python ]
25-
additional_dependencies: [ 'pyright@1.1.274' ]
16+
additional_dependencies: [ 'pyright@1.1.305' ]

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
UTIL_VERSION := 0.3.5
1+
UTIL_VERSION := 0.3.6
22
UTIL_NAME := codeplag
33
PWD := $(shell pwd)
44

docs/notebooks/requirements.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
matplotlib~=3.6.1
2-
numpy~=1.23.1
3-
pandas~=1.4.3
4-
python-decouple~=3.6
5-
scipy~=1.9.3
1+
matplotlib~=3.7.3
2+
numpy~=1.23.5
3+
pandas~=2.0.3
4+
python-decouple~=3.8
5+
scipy~=1.10.1

docs/notebooks/utils.py

Lines changed: 90 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2,38 +2,41 @@
22
import re
33
from datetime import datetime
44
from time import perf_counter
5+
from typing import Literal, Optional
56

67
import matplotlib.pyplot as plt
78
import numpy as np
89
import pandas as pd
9-
from decouple import Config, RepositoryEnv
10-
from scipy.optimize import curve_fit
11-
1210
from codeplag.algorithms.featurebased import counter_metric, struct_compare
1311
from codeplag.algorithms.stringbased import gst
1412
from codeplag.algorithms.tokenbased import value_jakkar_coef
1513
from codeplag.pyplag.utils import get_ast_from_content, get_features_from_ast
14+
from decouple import Config, RepositoryEnv
15+
from scipy.optimize import curve_fit
1616
from webparsers.github_parser import GitHubParser
1717

1818

19-
def square_func(x, a, b, c):
19+
def square_func(x: float, a: float, b: float, c: float) -> float:
2020
return a * x**2 + b * x + c
2121

2222

23-
def cube_func(x, a, b, c, d):
23+
def cube_func(x: float, a: float, b: float, c: float, d: float) -> float:
2424
return a * x**3 + b * x**2 + c * x + d
2525

2626

27-
def quart_func(x, a, b, c, d, e):
27+
def quart_func(x: float, a: float, b: float, c: float, d: float, e: float) -> float:
2828
return a * x**4 + b * x**3 + c * x**2 + d * x + e
2929

3030

31-
def remove_unnecessary_blank_lines(source_code):
31+
def remove_unnecessary_blank_lines(source_code: str) -> str:
3232
pattern = r"\n+"
3333
return re.sub(pattern, "\n", source_code)
3434

3535

36-
def get_data_from_dir(path='./data', max_count_lines=None):
36+
def get_data_from_dir(
37+
path: str = './data',
38+
max_count_lines: Optional[int] = None
39+
) -> pd.DataFrame:
3740
df = pd.DataFrame()
3841
for filename in os.listdir(path):
3942
if not re.search(r'.csv$', filename):
@@ -48,7 +51,7 @@ def get_data_from_dir(path='./data', max_count_lines=None):
4851
return df
4952

5053

51-
def save_works_from_repo_url(url, check_policy=True):
54+
def save_works_from_repo_url(url: str, check_policy: bool = True) -> None:
5255
current_repo_name = url.split('/')[-1]
5356
env_config = Config(RepositoryEnv('../../.env'))
5457
gh = GitHubParser(
@@ -57,7 +60,7 @@ def save_works_from_repo_url(url, check_policy=True):
5760
access_token=env_config.get('ACCESS_TOKEN')
5861
)
5962
files = list(gh.get_files_generator_from_repo_url(url))
60-
files = [(remove_unnecessary_blank_lines(file[0]), file[1]) for file in files]
63+
files = [(remove_unnecessary_blank_lines(file.code), file.link) for file in files]
6164

6265
df = pd.DataFrame(
6366
{
@@ -66,18 +69,25 @@ def save_works_from_repo_url(url, check_policy=True):
6669
'extension': ['py'] * (len(files) - 1),
6770
'repo_name': [current_repo_name] * (len(files) - 1),
6871
'content_len': [len(file_[0]) for file_ in files[:-1]],
69-
'content_len_without_blank': [len(file_[0].replace(' ', '').replace('\n', '').replace('\t', '')) for file_ in files[:-1]],
70-
'count_lines_without_blank_lines': [len(file_[0].splitlines()) for file_ in files[:-1]]
72+
'content_len_without_blank': [
73+
len(file_[0].replace(' ', '').replace('\n', '').replace('\t', ''))
74+
for file_ in files[:-1]
75+
],
76+
'count_lines_without_blank_lines': [
77+
len(file_[0].splitlines()) for file_ in files[:-1]
78+
]
7179
}
7280
)
7381
df = df[df['count_lines_without_blank_lines'] > 5]
7482
df.to_csv(os.path.join('./data/', current_repo_name + '.csv'), sep=';')
7583

7684

77-
def get_time_to_meta(df, iterations=10):
85+
def get_time_to_meta(df: pd.DataFrame, iterations: int = 10) -> pd.DataFrame:
7886
count_lines = []
7987
to_meta_time = []
80-
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
88+
for (index, content) in df[
89+
['content', 'link', 'count_lines_without_blank_lines']
90+
].iterrows():
8191
print(index, " " * 20, end='\r')
8292
for _ in range(iterations):
8393
tree = get_ast_from_content(content[0], content[1])
@@ -102,8 +112,14 @@ def get_time_to_meta(df, iterations=10):
102112
return output
103113

104114

105-
def plot_and_save_result(df, xlabel, ylabel, title, what,
106-
trend='linear'):
115+
def plot_and_save_result(
116+
df: pd.DataFrame,
117+
xlabel: str,
118+
ylabel: str,
119+
title: str,
120+
what: str,
121+
trend: Literal['linear', 'n^2', 'n^3', 'n^4'] = 'linear'
122+
) -> None:
107123
# Simple Moving average
108124
unique_count_lines = np.unique(df.count_lines)
109125
mean_times = []
@@ -122,19 +138,50 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
122138
if trend == 'linear':
123139
z = np.polyfit(unique_count_lines, mean_times, 1)
124140
p = np.poly1d(z)
125-
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.')
141+
plt.plot(
142+
unique_count_lines, p(unique_count_lines), "r--", label='Линейный тренд.'
143+
)
126144
elif trend == 'n^2':
127-
popt_cons, _ = curve_fit(square_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100]))
145+
popt_cons, _ = curve_fit(
146+
square_func,
147+
unique_count_lines,
148+
mean_times,
149+
bounds=([-np.inf, 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100])
150+
)
128151
p = np.poly1d(popt_cons)
129-
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Квадратичный тренд.')
152+
plt.plot(
153+
unique_count_lines,
154+
p(unique_count_lines),
155+
"r--", label='Квадратичный тренд.'
156+
)
130157
elif trend == 'n^3':
131-
popt_cons, _ = curve_fit(cube_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
158+
popt_cons, _ = curve_fit(
159+
cube_func,
160+
unique_count_lines,
161+
mean_times,
162+
bounds=([-np.inf, 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100])
163+
)
132164
p = np.poly1d(popt_cons)
133-
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='Кубический тренд.')
165+
plt.plot(
166+
unique_count_lines,
167+
p(unique_count_lines),
168+
"r--",
169+
label='Кубический тренд.'
170+
)
134171
elif trend == 'n^4':
135-
popt_cons, _ = curve_fit(quart_func, unique_count_lines, mean_times, bounds=([-np.inf, 0., 0., 0., 0.], [np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]))
172+
popt_cons, _ = curve_fit(
173+
quart_func,
174+
unique_count_lines,
175+
mean_times,
176+
bounds=(
177+
[-np.inf, 0., 0., 0., 0.],
178+
[np.inf, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100, 0.1 ** 100]
179+
)
180+
)
136181
p = np.poly1d(popt_cons)
137182
plt.plot(unique_count_lines, p(unique_count_lines), "r--", label='n^4.')
183+
else:
184+
raise Exception(f"Incorrect tred '{trend}'.")
138185

139186
rolling = pd.DataFrame(
140187
{
@@ -143,24 +190,40 @@ def plot_and_save_result(df, xlabel, ylabel, title, what,
143190
}
144191
)
145192
num_window = 20
146-
plt.plot(rolling.unique_count_lines, rolling.mean_times.rolling(window=num_window).mean(), label=f'Скользящее среднее по {num_window}ти замерам.')
193+
plt.plot(
194+
rolling.unique_count_lines,
195+
rolling.mean_times.rolling(window=num_window).mean(),
196+
label=f'Скользящее среднее по {num_window}ти замерам.'
197+
)
147198

148199
plt.ylabel(ylabel, fontsize=15)
149200
plt.xlabel(xlabel, fontsize=15)
150201
plt.title(title, fontsize=17)
151202
plt.legend(loc='upper left')
152-
plt.savefig('./graphics/need_time_{}_{}.png'.format(what, datetime.now().strftime("%d%m%Y_%H%M%S")))
203+
plt.savefig(
204+
'./graphics/need_time_{}_{}.png'.format(
205+
what,
206+
datetime.now().strftime("%d%m%Y_%H%M%S")
207+
)
208+
)
153209

154210

155-
def get_time_algorithms(df, work, iterations=5, metric='fast'):
211+
def get_time_algorithms(
212+
df: pd.DataFrame,
213+
work,
214+
iterations: int = 5,
215+
metric: Literal['fast', 'gst', 'structure'] = 'fast'
216+
) -> pd.DataFrame:
156217
count_lines = []
157218
times = []
158219
tree1 = get_ast_from_content(work.content, work.link)
159220
if tree1 is None:
160221
raise Exception("Unexpected error when parsing first work.")
161222

162223
features1 = get_features_from_ast(tree1, work.link)
163-
for (index, content) in df[['content', 'link', 'count_lines_without_blank_lines']].iterrows():
224+
for (index, content) in df[
225+
['content', 'link', 'count_lines_without_blank_lines']
226+
].iterrows():
164227
for _ in range(iterations):
165228
print(index, " " * 20, end='\r')
166229
tree2 = get_ast_from_content(content[0], content[1])
@@ -190,8 +253,7 @@ def get_time_algorithms(df, work, iterations=5, metric='fast'):
190253
end = perf_counter() - start
191254
times.append(end)
192255
else:
193-
print('Incorrect metric!')
194-
return 1
256+
raise Exception('Incorrect metric!')
195257

196258
count_lines.append(content[2])
197259

pyproject.toml

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,22 @@
1-
[tool.isort]
2-
profile = "black"
1+
[project]
2+
requires-python = ">=3.8"
33

4-
[tool.black]
5-
target-version = ["py38"]
4+
[tool.ruff]
5+
line-length = 97
6+
select = [
7+
"F", # pyflakes
8+
"E", # pycodestyle Error
9+
"W", # pycodestyle Warning
10+
"I", # isort
11+
"B", # flake8-bugbear
12+
"C4", # flake8-comprehensions
13+
"SIM", # flake8-simplify
14+
# "ERA", # eradicate
15+
"C90", # mccabe
16+
]
17+
18+
[tool.ruff.mccabe]
19+
max-complexity = 13
620

721
[tool.pyright]
822
pythonVersion = "3.8"

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
INSTALL_REQUIREMENTS = [
88
'argcomplete~=2.0.0',
9-
'numpy~=1.23.1',
9+
'numpy~=1.23.5',
1010
'pandas~=1.4.3',
1111
'ccsyspath~=1.1.0',
1212
'clang~=16.0.1.1',

src/codeplag/algorithms/featurebased.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ def counter_metric(counter1: dict, counter2: dict) -> float:
1717
return 1.0
1818

1919
percent_of_same = [0, 0]
20-
for key in counter1.keys():
20+
for key in counter1:
2121
if key not in counter2:
2222
percent_of_same[1] += counter1[key]
2323
continue
2424
percent_of_same[0] += min(counter1[key],
2525
counter2[key])
2626
percent_of_same[1] += max(counter1[key],
2727
counter2[key])
28-
for key in counter2.keys():
28+
for key in counter2:
2929
if key not in counter1:
3030
percent_of_same[1] += counter2[key]
3131
continue

src/codeplag/codeplagcli.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@
66
from pathlib import Path
77
from typing import List, Optional
88

9-
from codeplag.consts import EXTENSION_CHOICE, MODE_CHOICE, UTIL_NAME, UTIL_VERSION
109
from webparsers.types import GitHubContentUrl
1110

11+
from codeplag.consts import (
12+
EXTENSION_CHOICE,
13+
MODE_CHOICE,
14+
REPORTS_EXTENSION_CHOICE,
15+
UTIL_NAME,
16+
UTIL_VERSION,
17+
)
18+
1219

1320
class CheckUniqueStore(argparse.Action):
1421
"""Checks that the list of arguments contains no duplicates, then stores"""
@@ -116,6 +123,13 @@ def __init__(self):
116123
metavar="DIRECTORY",
117124
type=DirPath,
118125
)
126+
settings_modify.add_argument(
127+
"-re",
128+
"--reports_extension",
129+
help="Extension of saved report files.",
130+
type=str,
131+
choices=REPORTS_EXTENSION_CHOICE
132+
)
119133
settings_modify.add_argument(
120134
"-sp",
121135
"--show_progress",

0 commit comments

Comments
 (0)