Skip to content

Commit c19acd3

Browse files
authored
feat: adds the possibility to create a report from MongoDB. (#233)
1 parent 55aba62 commit c19acd3

File tree

19 files changed

+509
-423
lines changed

19 files changed

+509
-423
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ default_language_version:
22
python: python3.10
33
repos:
44
- repo: https://github.com/astral-sh/ruff-pre-commit
5-
rev: v0.9.7
5+
rev: v0.12.1
66
hooks:
77
- id: ruff
88
args: [ --fix ]

docker/compose.yml

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,27 @@
1-
services:
2-
mongodb:
3-
image: mongo:8.0
4-
restart: unless-stopped
5-
container_name: mongodb
6-
environment:
7-
MONGO_INITDB_ROOT_USERNAME: root
8-
MONGO_INITDB_ROOT_PASSWORD: example
9-
volumes:
10-
- mongodb_data:/data/db
11-
ports:
12-
- "127.0.0.1:27017:27017"
13-
networks:
14-
- codeplag-network
15-
healthcheck:
16-
test: [ "CMD", "mongosh", "--eval", "db.adminCommand('ping')" ]
17-
interval: 5s
18-
timeout: 5s
19-
retries: 3
20-
start_period: 5s
21-
22-
volumes:
23-
mongodb_data: {}
24-
25-
networks:
26-
codeplag-network:
27-
driver: bridge
1+
services:
2+
mongodb:
3+
image: mongo:8.0
4+
restart: unless-stopped
5+
container_name: mongodb
6+
environment:
7+
MONGO_INITDB_ROOT_USERNAME: root
8+
MONGO_INITDB_ROOT_PASSWORD: example
9+
volumes:
10+
- mongodb_data:/data/db
11+
ports:
12+
- "127.0.0.1:27017:27017"
13+
networks:
14+
- codeplag-network
15+
healthcheck:
16+
test: [ "CMD", "mongosh", "--eval", "db.adminCommand('ping')" ]
17+
interval: 5s
18+
timeout: 5s
19+
retries: 3
20+
start_period: 5s
21+
22+
volumes:
23+
mongodb_data: {}
24+
25+
networks:
26+
codeplag-network:
27+
driver: bridge

src/codeplag/algorithms/compare.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""This module consists of complex algorithms for comparing two works."""
22

3+
from datetime import datetime
4+
35
import numpy as np
46

57
from codeplag.algorithms.featurebased import counter_metric, struct_compare
@@ -17,8 +19,8 @@
1719

1820

1921
def fast_compare(
20-
features_f: ASTFeatures,
21-
features_s: ASTFeatures,
22+
features1: ASTFeatures,
23+
features2: ASTFeatures,
2224
ngrams_length: NgramsLength = DEFAULT_NGRAMS_LENGTH,
2325
weights: tuple[float, float, float, float] = DEFAULT_WEIGHTS,
2426
) -> FastCompareInfo:
@@ -29,21 +31,21 @@ def fast_compare(
2931
3032
Args:
3133
----
32-
features_f: The features of the first source file.
33-
features_s: The features of the second source file.
34+
features1 (ASTFeatures): The features of the first source file.
35+
features2 (ASTFeatures): The features of the second source file.
3436
ngrams_length (NgramsLength): N-grams length.
3537
weights: Weights of fast metrics that participate in
3638
counting total similarity coefficient.
3739
3840
"""
3941
jakkar_coef = value_jakkar_coef(
40-
tokens_first=features_f.tokens,
41-
tokens_second=features_s.tokens,
42+
tokens_first=features1.tokens,
43+
tokens_second=features2.tokens,
4244
ngrams_length=ngrams_length,
4345
)
44-
ops_res = counter_metric(features_f.operators, features_s.operators)
45-
kw_res = counter_metric(features_f.keywords, features_s.keywords)
46-
lits_res = counter_metric(features_f.literals, features_s.literals)
46+
ops_res = counter_metric(features1.operators, features2.operators)
47+
kw_res = counter_metric(features1.keywords, features2.keywords)
48+
lits_res = counter_metric(features1.literals, features2.literals)
4749
weighted_average = np.average(
4850
np.array([jakkar_coef, ops_res, kw_res, lits_res]), weights=weights
4951
)
@@ -86,12 +88,11 @@ def compare_works(
8688
metric anywhere (FullCompareInfo).
8789
8890
"""
89-
fast_compare_info = fast_compare(
90-
features_f=features1, features_s=features2, ngrams_length=ngrams_length
91-
)
91+
fast_compare_info = fast_compare(features1, features2, ngrams_length)
9292
if threshold and (fast_compare_info.weighted_average * 100.0) < threshold:
9393
return fast_compare_info
9494

95+
features1, features2 = sorted([features1, features2])
9596
compliance_matrix = np.empty(
9697
(len(features1.head_nodes), len(features2.head_nodes), 2), dtype=np.int64
9798
)
@@ -106,4 +107,16 @@ def compare_works(
106107
similarity=struct_res, compliance_matrix=compliance_matrix
107108
)
108109

109-
return FullCompareInfo(fast=fast_compare_info, structure=structure_info)
110+
return FullCompareInfo(
111+
date=datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
112+
first_heads=features1.head_nodes,
113+
first_modify_date=features1.modify_date,
114+
first_sha256=features1.sha256,
115+
first_path=features1.filepath,
116+
second_heads=features2.head_nodes,
117+
second_modify_date=features2.modify_date,
118+
second_sha256=features2.sha256,
119+
second_path=features2.filepath,
120+
fast=fast_compare_info,
121+
structure=structure_info,
122+
)

src/codeplag/db/mongo.py

Lines changed: 43 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
"""MIT License.
22
3-
Written 2025 by Stepan Pahomov, Daniil Lokosov
3+
Written 2025 by Stepan Pahomov, Daniil Lokosov, Artyom Semidolin.
44
"""
55

66
import atexit
7-
from datetime import datetime
8-
from typing import Final, NamedTuple
7+
from pathlib import Path
8+
from typing import Final
99

1010
from pymongo import MongoClient
1111
from pymongo.collection import Collection
@@ -29,11 +29,11 @@
2929
deserialize_compare_result_from_dict,
3030
serialize_compare_result_to_dict,
3131
)
32-
from codeplag.types import ASTFeatures, FullCompareInfo
32+
from codeplag.types import ASTFeatures, FullCompareInfo, Settings
3333

3434

3535
class MongoDBConnection:
36-
DB_NAME: Final[str] = f"{UTIL_NAME}_cache"
36+
DB_NAME: Final = f"{UTIL_NAME}_cache"
3737

3838
def __init__(
3939
self: Self,
@@ -56,22 +56,34 @@ def __init__(
5656
self.password: str = password
5757
self.url: str = f"mongodb://{user}:{password}@{host}:{port}/"
5858

59-
# Connecting to MongoDB
6059
try:
6160
self.client = MongoClient(self.url, serverSelectionTimeoutMS=3000)
62-
self.client.admin.command("ping") # Checking the connection
61+
self.client.admin.command("ping")
6362
except ConnectionFailure as err:
6463
logger.error("Failed to connect to MongoDB: %s", err)
6564
raise Exception(
6665
"Can't connect to MongoDB with selected 'mongo'. Check your settings. "
6766
"Please note if the application is running in Docker, the host may change."
6867
) from err
69-
logger.debug("Successfully connected to MongoDB!")
68+
logger.debug("Successfully connected to the MongoDB.")
7069
self.db = self.client[self.DB_NAME]
7170

7271
# Registering the disconnect method for execution upon program termination
7372
atexit.register(self.disconnect)
7473

74+
@classmethod
75+
def from_settings(
76+
cls: type["MongoDBConnection"], settings_conf: Settings
77+
) -> "MongoDBConnection":
78+
host = settings_conf.get("mongo_host", DEFAULT_MONGO_HOST)
79+
port = settings_conf.get("mongo_port", DEFAULT_MONGO_PORT)
80+
user = settings_conf.get("mongo_user", DEFAULT_MONGO_USER)
81+
password = settings_conf.get("mongo_pass")
82+
if password is None:
83+
raise ValueError("'mongo' reports_exception provided, but 'mongo-pass' is missing")
84+
85+
return cls(host=host, port=port, user=user, password=password)
86+
7587
def disconnect(self: Self) -> None:
7688
"""Close the connection to MongoDB.
7789
@@ -99,16 +111,7 @@ def clear_db(self: Self) -> None:
99111

100112

101113
class ReportRepository:
102-
class CompareInfoDocument(NamedTuple):
103-
"""Compare Info Document structure."""
104-
105-
first_sha256: str
106-
second_sha256: str
107-
first_modify_date: datetime
108-
second_modify_date: datetime
109-
compare_info: FullCompareInfo
110-
111-
COLLECTION_NAME: str = "compare_info"
114+
COLLECTION_NAME: Final = "compare_info"
112115

113116
def __init__(self: Self, mongo_connection: MongoDBConnection) -> None:
114117
"""Initialization of the repository for the compare_info collection."""
@@ -119,83 +122,57 @@ def __init__(self: Self, mongo_connection: MongoDBConnection) -> None:
119122
self.collection: Collection = collection
120123

121124
def get_compare_info(
122-
self: Self, work1: ASTFeatures, work2: ASTFeatures
123-
) -> CompareInfoDocument | None:
125+
self: Self, first_filepath: str | Path, second_filepath: str | Path
126+
) -> FullCompareInfo | None:
124127
"""Retrieve comparison result between two files from the compare_info collection.
125128
126-
The document is identified by sorted file paths:
127-
_id = {"first": min(filepath), "second": max(filepath)}.
129+
The document is identified by sorted file paths.
130+
128131
Returns None if no document is stored for this pair of paths. SHA-256 hashes are not checked here; the caller must compare them against the current files.
129132
130133
Args:
131-
work1 (ASTFeatures): First file metadata.
132-
work2 (ASTFeatures): Second file metadata.
134+
first_filepath (str | Path): First filepath.
135+
second_filepath (str | Path): Second filepath.
133136
134137
Returns:
135-
ReportType | None: Deserialized comparison result if found and valid.
138+
FullCompareInfo | None: Deserialized comparison result if a document was found.
136139
"""
137140
# Sort works by filepath to form the unique key
138-
work1, work2 = sorted([work1, work2])
139-
first_path, second_path = [str(work1.filepath), str(work2.filepath)]
141+
first_path, second_path = sorted([str(first_filepath), str(second_filepath)])
140142
document_id = {"first": first_path, "second": second_path}
141-
142-
# Find document in collection
143143
document = self.collection.find_one({"_id": document_id})
144144
if not document:
145145
logger.trace("No compare_info found for file path: (%s, %s)", first_path, second_path) # type: ignore
146146
return None
147147
logger.trace("Compare_info found for file path: (%s, %s)", first_path, second_path) # type: ignore
148148

149-
# Deserialize and return compare_info
150-
compare_info = deserialize_compare_result_from_dict(document["compare_info"])
151-
return self.CompareInfoDocument(
152-
first_sha256=document["first_sha256"],
153-
second_sha256=document["second_sha256"],
154-
first_modify_date=document["first_modify_date"],
155-
second_modify_date=document["second_modify_date"],
156-
compare_info=compare_info,
157-
)
149+
return deserialize_compare_result_from_dict(document)
158150

159-
def write_compare_info(
160-
self: Self, work1: ASTFeatures, work2: ASTFeatures, compare_info: FullCompareInfo
161-
) -> None:
151+
def write_compare_info(self: Self, compare_info: FullCompareInfo) -> None:
162152
"""Insert or update a document in the compare_info collection.
163153
164154
The primary key (_id) is formed as a dictionary with sorted file paths.
165155
166156
Args:
167-
work1 (ASTFeatures): The first file for comparison.
168-
work2 (ASTFeatures): The second file for comparison.
169157
compare_info (FullCompareInfo): Information about the comparison results.
170158
"""
171-
# Sorting paths to create a unique primary key
172-
work1, work2 = sorted([work1, work2])
173-
first_path, second_path = [str(work1.filepath), str(work2.filepath)]
174-
175-
# Forming _id as a string of sorted paths
176-
document_id = {"first": first_path, "second": second_path}
177-
178-
# Using the serialize_compare_result_to_dict function to convert data
179-
serialized_compare_info = serialize_compare_result_to_dict(compare_info)
180-
181-
document = {
182-
"_id": document_id,
183-
"first_sha256": work1.sha256,
184-
"second_sha256": work2.sha256,
185-
"first_modify_date": work1.modify_date,
186-
"second_modify_date": work2.modify_date,
187-
"compare_info": serialized_compare_info,
159+
document_id = {
160+
"first": str(compare_info.first_path),
161+
"second": str(compare_info.second_path),
188162
}
163+
document = {"_id": document_id, **serialize_compare_result_to_dict(compare_info)}
189164

190165
# Insert or update the document
191166
self.collection.update_one({"_id": document_id}, {"$set": document}, upsert=True)
192167
logger.trace( # type: ignore
193-
"Document for (%s, %s) successfully inserted/updated.", first_path, second_path
168+
"Document for (%s, %s) successfully inserted/updated.",
169+
compare_info.first_path,
170+
compare_info.second_path,
194171
)
195172

196173

197174
class FeaturesRepository:
198-
COLLECTION_NAME: str = "features"
175+
COLLECTION_NAME: Final = "features"
199176

200177
def __init__(self: Self, mongo_connection: MongoDBConnection) -> None:
201178
"""Initialization of the repository for the features collection."""
@@ -260,21 +237,14 @@ class MongoReporter(AbstractReporter):
260237
def __init__(self: Self, repository: ReportRepository) -> None:
261238
self.repository = repository
262239

263-
def save_result(
264-
self: Self,
265-
first_work: ASTFeatures,
266-
second_work: ASTFeatures,
267-
compare_info: FullCompareInfo,
268-
) -> None:
240+
def save_result(self: Self, compare_info: FullCompareInfo) -> None:
269241
"""Updates the cache with new comparisons and writes it to the MongoDB.
270242
271243
Args:
272-
first_work (ASTFeatures): Contains the first work metadata.
273-
second_work (ASTFeatures): Contains the second work metadata.
274244
compare_info (FullCompareInfo): Contains information about comparisons
275245
between the first and second works.
276246
"""
277-
self.repository.write_compare_info(first_work, second_work, compare_info)
247+
self.repository.write_compare_info(compare_info)
278248

279249
def get_result(
280250
self: Self,
@@ -287,14 +257,14 @@ def get_result(
287257
work1 (ASTFeatures): Contains the first work metadata.
288258
work2 (ASTFeatures): Contains the second work metadata.
289259
"""
290-
cache_val = self.repository.get_compare_info(work1, work2)
260+
cache_val = self.repository.get_compare_info(work1.filepath, work2.filepath)
291261

292262
if (
293263
cache_val
294264
and cache_val.first_sha256 == work1.sha256
295265
and cache_val.second_sha256 == work2.sha256
296266
):
297-
return cache_val.compare_info
267+
return cache_val
298268
else:
299269
return None
300270

0 commit comments

Comments
 (0)