Skip to content

Commit 2765e51

Browse files
committed
feat: add the ability to create a report from MongoDB with provided paths. (#234)
1 parent a9a419d commit 2765e51

File tree

5 files changed

+106
-102
lines changed

5 files changed

+106
-102
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
UTIL_VERSION := 0.5.23
1+
UTIL_VERSION := 0.5.24
22
UTIL_NAME := codeplag
33
PWD := $(shell pwd)
44

src/codeplag/handlers/report.py

Lines changed: 62 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,10 @@
11
"""This module contains handlers for the report command of the CLI."""
22

3+
import re
34
from collections import defaultdict
45
from copy import deepcopy
56
from pathlib import Path
6-
from typing import Generator, Literal, TypedDict
7+
from typing import Callable, Generator, Literal, TypedDict
78

89
import jinja2
910
import numpy as np
@@ -94,9 +95,10 @@ def html_report_create(
9495
f"There is nothing in '{reports_path}' to create a basic html report from."
9596
)
9697
return ExitCode.EXIT_INVAL
97-
return __html_report_create_from_csv(
98+
df = read_df(reports_path)
99+
return __html_report_create(
98100
report_path,
99-
reports_path,
101+
df,
100102
report_type,
101103
settings_config["threshold"],
102104
settings_config["language"],
@@ -106,7 +108,7 @@ def html_report_create(
106108
elif reports_extension == "mongo":
107109
connection = MongoDBConnection.from_settings(settings_config)
108110
compare_info_repo = ReportRepository(connection)
109-
exit_code = __html_report_create_from_mongo(
111+
exit_code = __html_report_create(
110112
report_path,
111113
compare_info_repo,
112114
report_type,
@@ -126,7 +128,9 @@ def html_report_create(
126128

127129

128130
def calculate_general_total_similarity(
129-
df: pd.DataFrame, unique_first_paths: NDArray, unique_second_paths: NDArray
131+
compare_infos: pd.DataFrame | ReportRepository,
132+
unique_first_paths: NDArray,
133+
unique_second_paths: NDArray,
130134
) -> float:
131135
total_similarity = 0.0
132136
if unique_first_paths.size == 0:
@@ -135,13 +139,24 @@ def calculate_general_total_similarity(
135139
max_similarity = 0.0
136140
for second_path in unique_second_paths:
137141
sorted_paths = sorted([first_path, second_path])
138-
selected = df[
139-
(df["first_path"].str.startswith(sorted_paths[0])) # type: ignore
140-
& (df["second_path"].str.startswith(sorted_paths[1])) # type: ignore
141-
]
142-
if selected is None or selected.size == 0:
143-
continue
144-
module_similarity = float(selected.iloc[0]["weighted_average"])
142+
if isinstance(compare_infos, ReportRepository):
143+
selected = compare_infos.collection.find_one(
144+
{
145+
"first_path": re.compile(rf"{sorted_paths[0]}[/.\w]*"),
146+
"second_path": re.compile(rf"{sorted_paths[1]}[/.\w]*"),
147+
}
148+
)
149+
if selected is None:
150+
continue
151+
module_similarity = selected["compare_result"]["fast"]["weighted_average"]
152+
else:
153+
selected = compare_infos[
154+
(compare_infos["first_path"].str.startswith(sorted_paths[0])) # type: ignore
155+
& (compare_infos["second_path"].str.startswith(sorted_paths[1])) # type: ignore
156+
]
157+
if selected is None or selected.size == 0:
158+
continue
159+
module_similarity = float(selected.iloc[0]["weighted_average"])
145160
if module_similarity > max_similarity:
146161
max_similarity = module_similarity
147162
total_similarity += max_similarity
@@ -227,15 +242,14 @@ def _get_same_funcs(
227242

228243
def _get_parsed_line(
229244
compare_results: pd.DataFrame | ReportRepository,
245+
extract_func: Callable,
230246
threshold: int = DEFAULT_THRESHOLD,
231247
include_funcs_less_threshold: bool = True,
232248
) -> Generator[tuple[FullCompareInfo, SameFuncs, SameFuncs], None, None]:
233249
if isinstance(compare_results, ReportRepository):
234-
extract_func = lambda: compare_results.collection.find({}) # noqa: E731
235250
handle_result_func = lambda result: result # noqa: E731
236251
deserialize_func = deserialize_compare_result_from_dict
237252
else:
238-
extract_func = compare_results.iterrows
239253
handle_result_func = lambda result: result[1] # noqa: E731
240254
deserialize_func = deserialize_compare_result
241255
for result in extract_func():
@@ -280,12 +294,14 @@ def _get_resulting_same_percentages(
280294

281295

282296
def _search_sources(
283-
compare_results: pd.DataFrame | ReportRepository, threshold: int = DEFAULT_THRESHOLD
297+
compare_results: pd.DataFrame | ReportRepository,
298+
extract_func: Callable,
299+
threshold: int = DEFAULT_THRESHOLD,
284300
) -> tuple[SamePartsOfAll, CntHeadNodes]:
285301
same_parts_of_all: SamePartsOfAll = defaultdict(lambda: {})
286302
cnt_head_nodes: CntHeadNodes = {}
287303
for compare_info, same_parts_of_second, same_parts_of_first in _get_parsed_line(
288-
compare_results, threshold, include_funcs_less_threshold=False
304+
compare_results, extract_func, threshold, include_funcs_less_threshold=False
289305
):
290306
for path, heads in zip(
291307
(compare_info.first_path, compare_info.second_path),
@@ -323,6 +339,7 @@ def _search_sources(
323339

324340
def _create_general_report(
325341
compare_results: pd.DataFrame | ReportRepository,
342+
extract_func: Callable,
326343
save_path: Path,
327344
environment: jinja2.Environment,
328345
threshold: Threshold = DEFAULT_THRESHOLD,
@@ -331,13 +348,13 @@ def _create_general_report(
331348
) -> None:
332349
if paths is not None:
333350
if isinstance(compare_results, ReportRepository):
334-
raise NotImplementedError(
335-
"Creating general html report with MongoDB with provided paths is not implemented."
336-
)
337-
unique_first_paths = pd.unique(compare_results["first_path"])
338-
unique_second_paths = pd.unique(compare_results["second_path"])
339-
assert isinstance(unique_first_paths, np.ndarray)
340-
assert isinstance(unique_second_paths, np.ndarray)
351+
unique_first_paths = np.array(extract_func().distinct("first_path"))
352+
unique_second_paths = np.array(extract_func().distinct("second_path"))
353+
else:
354+
unique_first_paths = pd.unique(compare_results["first_path"])
355+
unique_second_paths = pd.unique(compare_results["second_path"])
356+
assert isinstance(unique_first_paths, np.ndarray)
357+
assert isinstance(unique_second_paths, np.ndarray)
341358
first_root_path_sim = calculate_general_total_similarity(
342359
compare_results, unique_first_paths, unique_second_paths
343360
)
@@ -352,7 +369,7 @@ def _create_general_report(
352369
save_path = save_path / DEFAULT_GENERAL_REPORT_NAME
353370
save_path.write_text(
354371
template.render(
355-
data=_get_parsed_line(compare_results),
372+
data=_get_parsed_line(compare_results, extract_func),
356373
list=list,
357374
len=len,
358375
round=round,
@@ -367,13 +384,14 @@ def _create_general_report(
367384

368385
def _create_sources_report(
369386
compare_results: pd.DataFrame | ReportRepository,
387+
extract_func: Callable,
370388
save_path: Path,
371389
environment: jinja2.Environment,
372390
threshold: Threshold = DEFAULT_THRESHOLD,
373391
language: Language = DEFAULT_LANGUAGE,
374392
paths: tuple[str, str] | None = None,
375393
) -> None:
376-
data, cnt_head_nodes = _search_sources(compare_results, threshold)
394+
data, cnt_head_nodes = _search_sources(compare_results, extract_func, threshold)
377395
same_percentages = _get_resulting_same_percentages(data, cnt_head_nodes)
378396
if paths is not None:
379397
first_root_path_sim = calculate_sources_total_similarity(same_percentages, paths[0])
@@ -402,9 +420,9 @@ def _create_sources_report(
402420
)
403421

404422

405-
def __html_report_create_from_mongo(
423+
def __html_report_create(
406424
report_path: Path,
407-
compare_info_repo: ReportRepository,
425+
compare_infos: pd.DataFrame | ReportRepository,
408426
report_type: ReportType,
409427
threshold: Threshold,
410428
language: Language,
@@ -421,48 +439,30 @@ def __html_report_create_from_mongo(
421439
if not all_paths_provided and any([first_root_path, second_root_path]):
422440
raise ValueError(_("All paths must be provided."))
423441

424-
environment = jinja2.Environment(extensions=["jinja2.ext.i18n"])
425-
environment.install_gettext_translations(get_translations()) # type: ignore
426-
create_report_function(
427-
compare_info_repo, # type:ignore
428-
report_path,
429-
environment,
430-
threshold,
431-
language,
432-
)
433-
return ExitCode.EXIT_SUCCESS
434-
435-
436-
def __html_report_create_from_csv(
437-
report_path: Path,
438-
reports_path: Path,
439-
report_type: ReportType,
440-
threshold: Threshold,
441-
language: Language,
442-
first_root_path: Path | str | None = None,
443-
second_root_path: Path | str | None = None,
444-
) -> Literal[ExitCode.EXIT_SUCCESS]:
445-
if report_type == "general":
446-
create_report_function = _create_general_report
447-
elif report_type == "sources":
448-
create_report_function = _create_sources_report
449-
else:
450-
raise ValueError(_("Invalid report type."))
451-
all_paths_provided = all([first_root_path, second_root_path])
452-
if not all_paths_provided and any([first_root_path, second_root_path]):
453-
raise ValueError(_("All paths must be provided."))
454-
455-
df = read_df(reports_path)
456442
if all_paths_provided:
457443
paths = tuple(sorted([str(first_root_path), str(second_root_path)]))
458-
df = df[df["first_path"].str.startswith(paths[0])] # type: ignore
459-
df = df[df["second_path"].str.startswith(paths[1])] # type: ignore
444+
if isinstance(compare_infos, pd.DataFrame):
445+
compare_infos = compare_infos[compare_infos["first_path"].str.startswith(paths[0])] # type: ignore
446+
compare_infos = compare_infos[compare_infos["second_path"].str.startswith(paths[1])] # type: ignore
447+
extract_func = compare_infos.iterrows # type: ignore
448+
else:
449+
extract_func = lambda: compare_infos.collection.find( # noqa: E731
450+
{
451+
"first_path": re.compile(rf"{paths[0]}[/.\w]*"),
452+
"second_path": re.compile(rf"{paths[1]}[/.\w]*"),
453+
}
454+
)
460455
else:
461456
paths = None
457+
if isinstance(compare_infos, ReportRepository):
458+
extract_func = lambda: compare_infos.collection.find({}) # noqa: E731
459+
else:
460+
extract_func = compare_infos.iterrows
462461
environment = jinja2.Environment(extensions=["jinja2.ext.i18n"])
463462
environment.install_gettext_translations(get_translations()) # type: ignore
464463
create_report_function(
465-
df, # type:ignore
464+
compare_infos,
465+
extract_func,
466466
report_path,
467467
environment,
468468
threshold,

test/auto/functional/test_mongo.py

Lines changed: 33 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -101,40 +101,6 @@ def test_correct_mongo_connection(
101101
assert b"Successfully connected to the MongoDB" in result.cmd_res.stdout
102102

103103

104-
@pytest.mark.parametrize(
105-
"cmd, files, extension, found_plag",
106-
[
107-
("--files", PY_FILES, "py", False),
108-
("--files", PY_SIM_FILES, "py", True),
109-
("--files", CPP_FILES, "cpp", False),
110-
("--files", CPP_SIM_FILES, "cpp", True),
111-
("--github-files", PY_GITHUB_FILES, "py", False),
112-
("--github-files", CPP_GITHUB_SIM_FILES, "cpp", True),
113-
],
114-
)
115-
def test_saving_metadata_and_reports(
116-
cmd: str,
117-
files: Tuple[Path, Path],
118-
extension: str,
119-
found_plag: bool,
120-
mongo_connection: MongoDBConnection,
121-
clear_db: None,
122-
):
123-
features_repo = FeaturesRepository(mongo_connection)
124-
compare_info_repo = ReportRepository(mongo_connection)
125-
126-
run_check([cmd, *files], extension=extension)
127-
128-
for file in files:
129-
assert features_repo.get_features(ASTFeatures(file)) is not None
130-
compare_info = compare_info_repo.get_compare_info(files[0], files[1])
131-
132-
if found_plag:
133-
assert compare_info
134-
else:
135-
assert compare_info is None
136-
137-
138104
@pytest.mark.parametrize(
139105
("cmd", "files", "extension", "found_plag"),
140106
[
@@ -235,3 +201,36 @@ def test_saving_after_file_significant_change(
235201
assert write_cmp
236202
else:
237203
assert not write_cmp
204+
205+
206+
@pytest.mark.parametrize(
207+
"cmd, files, extension, found_plag",
208+
[
209+
("--files", PY_FILES, "py", False),
210+
("--files", PY_SIM_FILES, "py", True),
211+
("--files", CPP_FILES, "cpp", False),
212+
("--files", CPP_SIM_FILES, "cpp", True),
213+
("--github-files", PY_GITHUB_FILES, "py", False),
214+
("--github-files", CPP_GITHUB_SIM_FILES, "cpp", True),
215+
],
216+
)
217+
def test_saving_metadata_and_reports(
218+
cmd: str,
219+
files: Tuple[Path, Path],
220+
extension: str,
221+
found_plag: bool,
222+
mongo_connection: MongoDBConnection,
223+
):
224+
features_repo = FeaturesRepository(mongo_connection)
225+
compare_info_repo = ReportRepository(mongo_connection)
226+
227+
run_check([cmd, *files], extension=extension)
228+
229+
for file in files:
230+
assert features_repo.get_features(ASTFeatures(file)) is not None
231+
compare_info = compare_info_repo.get_compare_info(files[0], files[1])
232+
233+
if found_plag:
234+
assert compare_info
235+
else:
236+
assert compare_info is None

test/auto/functional/test_report.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -131,14 +131,19 @@ def test_content_different_between_calls(
131131
assert first_report_path.read_text() != second_report_path.read_text()
132132

133133
@pytest.mark.parametrize(
134-
"report_type",
135-
["general", "sources"],
134+
("report_type", "reports_extension"),
135+
[
136+
("general", "csv"),
137+
("sources", "mongo"),
138+
],
136139
)
137-
def test_default_report_diff_with_provided_paths(self: Self, report_type: ReportType) -> None:
140+
def test_default_report_diff_with_provided_paths(
141+
self: Self, report_type: ReportType, reports_extension: ReportsExtension
142+
) -> None:
138143
first_report_path = REPORTS_FOLDER / "report1.html"
139144
second_report_path = REPORTS_FOLDER / "report2.html"
140145

141-
modify_settings(reports_extension="csv")
146+
modify_settings(reports_extension=reports_extension)
142147
create_report(first_report_path, report_type).assert_success()
143148
create_report(
144149
second_report_path,

test/unit/codeplag/handlers/test_report.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,7 @@ def test__get_parsed_line(first_compare_result: FullCompareInfo) -> None:
172172
compare_df.iloc[0].first_heads = str(compare_df.iloc[0].first_heads)
173173
compare_df.iloc[0].second_heads = str(compare_df.iloc[0].second_heads)
174174

175-
result = list(_get_parsed_line(compare_df))
175+
result = list(_get_parsed_line(compare_df, compare_df.iterrows))
176176

177177
assert result[0][0].fast == first_compare_result.fast
178178
assert result[0][0].structure

0 commit comments

Comments
 (0)