Skip to content

Commit 7c61679

Browse files
[Feature:Plagiarism] common code & individual ranking (#28)
* Refactor: fix warnings in C++, make code more readable * Merge adjacent regions * Nightly progress * Reimplement algorithm * Implement save matches * Fix bugs * Add rankings * Minor changes * Final touches * Add "common code" functionality * Debug output files * Fix merging * Fix bugs in writing the files * Initial draft for individual student ranking * Fix compiler errors * Update ranking directory structure * Minor fix to the directory path * Another minor fix Co-authored-by: williamjallen <[email protected]>
1 parent 4243ad3 commit 7c61679

File tree

1 file changed

+200
-29
lines changed

1 file changed

+200
-29
lines changed

compare_hashes/compare_hashes.cpp

Lines changed: 200 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -54,26 +54,56 @@ class Submission {
5454
// Read-only access to the (hash, location) pairs stored for this submission.
const std::vector<std::pair<hash, location_in_submission>> & getHashes() const {
  return hashes;
}
5555
// Records that the hash at `location` in this submission also appears at
// `matching_location`, and bumps the tally kept in students_matched for the
// (student, version) that matching_location belongs to.
void addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location) {
  // operator[] creates an empty set the first time a location is seen, so a
  // single lookup covers both the "new location" and "existing location" cases.
  suspicious_matches[location].insert(matching_location);

  // update the students_matched container
  // (incremented on every call, whether or not matching_location was already in the set)
  ++students_matched[matching_location.student][matching_location.version];
}
70+
// Records that the hash at `location` in this submission also appears at
// `matching_location`, filing it under common_matches rather than
// suspicious_matches. Does not touch students_matched.
void addCommonMatch(location_in_submission location, const HashLocation &matching_location) {
  // operator[] creates an empty set the first time a location is seen, so a
  // single lookup covers both the "new location" and "existing location" cases.
  common_matches[location].insert(matching_location);
}
6883
const std::map<location_in_submission, std::set<HashLocation> >& getSuspiciousMatches() const {
6984
return suspicious_matches;
7085
}
86+
const std::map<location_in_submission, std::set<HashLocation> >& getCommonMatches() const {
87+
return common_matches;
88+
}
89+
const std::unordered_map<std::string, std::unordered_map<int, int> >& getStudentsMatched() const {
90+
return students_matched;
91+
}
92+
// Number of hashes stored for this submission.
unsigned int getNumHashes() const {
  // size() returns size_t; the conversion to unsigned int is made explicit here
  return static_cast<unsigned int>(hashes.size());
}
93+
float getPercentage() const {
94+
return (100.0 * (suspicious_matches.size() + common_matches.size())) / hashes.size();
95+
}
7196

7297
private:
7398
std::string student_;
7499
int version_;
75100
std::vector<std::pair<hash, location_in_submission> > hashes;
76101
std::map<location_in_submission, std::set<HashLocation> > suspicious_matches;
102+
std::map<location_in_submission, std::set<HashLocation> > common_matches;
103+
104+
// a container to keep track of all the students this submission
105+
// matched and the number of matching hashes per submission
106+
std::unordered_map<std::string, std::unordered_map<int, int> > students_matched;
77107
};
78108

79109

@@ -116,13 +146,23 @@ bool matchingPositionsAreAdjacent(const nlohmann::json &first, const nlohmann::j
116146
}
117147

118148

119-
// increments the end position for each of the matches in the json provided
149+
// increments the end position for each of the matches in the json provided,
150+
// merging overlapping regions where necessary
120151
// For every element of `matches`, walks its "matchingpositions" array and
//   - folds an entry into its predecessor when the predecessor's current
//     "end" is >= the entry's "start" (the predecessor takes over the
//     entry's "end" and the entry is erased), and
//   - extends the "end" of every surviving entry by one.
// `matches` is modified in place. Elements whose "matchingpositions" is
// empty are left untouched.
void incrementEndPositionsForMatches(nlohmann::json &matches) {
  for (nlohmann::json::iterator match_itr = matches.begin(); match_itr != matches.end(); ++match_itr) {
    nlohmann::json &positions = (*match_itr)["matchingpositions"];

    // Nothing to extend, and advancing begin() on an empty array would step past end().
    if (positions.empty()) {
      continue;
    }

    nlohmann::json::iterator current = positions.begin();
    nlohmann::json::iterator next = positions.begin();
    ++next;

    while (next != positions.end()) {
      if ((*current)["end"].get<int>() >= (*next)["start"]) {
        // Overlap: absorb `next` into `current`. `current` stays where it is so
        // it can keep absorbing following entries; erase() hands back the
        // iterator to the element after the removed one.
        (*current)["end"] = (*next)["end"].get<int>();
        next = positions.erase(next);
      }
      else {
        (*current)["end"] = (*current)["end"].get<int>() + 1;
        ++current;
        ++next;
      }
    }

    // The loop only extends an entry at the moment it steps past it, so the
    // final surviving entry (or the sole entry) still needs its increment.
    (*current)["end"] = (*current)["end"].get<int>() + 1;
  }
}
@@ -174,8 +214,8 @@ int main(int argc, char* argv[]) {
174214
// ---------------------------------------------------------------------------
175215
// loop over all submissions and populate the all_hashes and all_submissions structures
176216

177-
// Stores all the hashes and their locations across all submissions
178-
std::unordered_map<hash, std::vector<HashLocation>> all_hashes;
217+
// Stores all the hashes and their locations across all submissions (sorted in "bins" of student names)
218+
std::unordered_map<hash, std::unordered_map<std::string, std::vector<HashLocation>>> all_hashes;
179219
// Stores all submissions
180220
std::vector<Submission> all_submissions;
181221

@@ -205,7 +245,7 @@ int main(int argc, char* argv[]) {
205245
int location = 0;
206246
while (istr >> input_hash) {
207247
location++;
208-
all_hashes[input_hash].push_back(HashLocation(username, version, location));
248+
all_hashes[input_hash][username].push_back(HashLocation(username, version, location));
209249
submission.addHash(input_hash, location);
210250
}
211251

@@ -230,12 +270,29 @@ int main(int argc, char* argv[]) {
230270
std::vector<std::pair<hash, location_in_submission>>::const_iterator hash_itr = submission_itr->getHashes().begin();
231271
for (; hash_itr != submission_itr->getHashes().end(); ++hash_itr) {
232272

233-
// look up that hash in the all_hashes table, and look for occurences of that hash in other submisions
234-
std::vector<HashLocation> occurences = all_hashes[hash_itr->first];
235-
std::vector<HashLocation>::iterator occurences_itr = occurences.begin();
273+
// look up that hash in the all_hashes table, loop over all other students that have the same hash
274+
std::unordered_map<std::string, std::vector<HashLocation>> occurences = all_hashes[hash_itr->first];
275+
std::unordered_map<std::string, std::vector<HashLocation>>::iterator occurences_itr = occurences.begin();
236276
for (; occurences_itr != occurences.end(); ++occurences_itr) {
237-
if (occurences_itr->student != submission_itr->student()) {
238-
submission_itr->addSuspiciousMatch(hash_itr->second, *occurences_itr);
277+
278+
// don't look for matches across submissions of the same student
279+
if (occurences_itr->first == submission_itr->student()) {
280+
continue;
281+
}
282+
283+
// save the locations of all other occurrences of the matching hash in other students' submissions
284+
std::vector<HashLocation>::iterator itr = occurences_itr->second.begin();
285+
for (; itr != occurences_itr->second.end(); ++itr) {
286+
287+
if (occurences.size() > (unsigned int)threshold) {
288+
// if the number of students with matching code is more
289+
// than the threshold, it is considered common code
290+
submission_itr->addCommonMatch(hash_itr->second, *itr);
291+
} else {
292+
// save the match as a suspicious match
293+
submission_itr->addSuspiciousMatch(hash_itr->second, *itr);
294+
}
295+
// TODO: insert provided match here
239296
}
240297
}
241298
}
@@ -255,7 +312,7 @@ int main(int argc, char* argv[]) {
255312

256313
my_counter = 0;
257314
my_percent = 0;
258-
std::cout << "merging regions and writing matches files..." << std::endl;
315+
std::cout << "writing matches files and merging regions..." << std::endl;
259316

260317
// Loop over all of the submissions, writing a JSON file for each one if it has suspicious matches
261318
for (std::vector<Submission>::iterator submission_itr = all_submissions.begin();
@@ -267,6 +324,8 @@ int main(int argc, char* argv[]) {
267324
// holds the JSON file to be written
268325
std::vector<nlohmann::json> result;
269326

327+
328+
// ******** WRITE THE SUSPICIOUS MATCHES ********
270329
// all of the suspicious matches for this submission
271330
std::map<location_in_submission, std::set<HashLocation> > suspicious_matches = submission_itr->getSuspiciousMatches();
272331

@@ -277,7 +336,7 @@ int main(int argc, char* argv[]) {
277336
// stores matches of hash locations across other submissions in the class
278337
std::vector<nlohmann::json> others;
279338

280-
{
339+
{
281340
// generate a specific element of the "others" vector
282341
// set the variables to their initial values
283342
std::set<HashLocation>::const_iterator matching_positions_itr = location_itr->second.begin();
@@ -287,46 +346,112 @@ int main(int argc, char* argv[]) {
287346
std::vector<nlohmann::json> matchingpositions;
288347
nlohmann::json position;
289348
position["start"] = matching_positions_itr->location;
290-
position["end"] = matching_positions_itr->location + sequence_length;
349+
position["end"] = matching_positions_itr->location + sequence_length - 1;
291350
matchingpositions.push_back(position);
292-
other["matchingpositions"] = matchingpositions;
293-
351+
294352
// search for all matching positions of the suspicious match in other submissions
295353
if (location_itr->second.size() > 1) {
296354
++matching_positions_itr;
355+
297356
// loop over all of the other matching positions
298357
for (; matching_positions_itr != location_itr->second.end(); ++matching_positions_itr) {
358+
299359
// keep iterating and editing the same object until we get to a different submission
300360
if (matching_positions_itr->student != other["username"] || matching_positions_itr->version != other["version"]) {
301361
// found a different one, we push the old one and start over
362+
other["matchingpositions"] = matchingpositions;
302363
others.push_back(other);
303364

304365
matchingpositions.clear();
305366
other["username"] = matching_positions_itr->student;
306367
other["version"] = matching_positions_itr->version;
307-
position["start"] = matching_positions_itr->location;
308-
position["end"] = matching_positions_itr->location + sequence_length;
309368
}
310369
position["start"] = matching_positions_itr->location;
311-
position["end"] = matching_positions_itr->location + sequence_length;
370+
position["end"] = matching_positions_itr->location + sequence_length - 1;
312371
matchingpositions.push_back(position);
313372
}
314373
}
374+
375+
other["matchingpositions"] = matchingpositions;
315376
others.push_back(other);
316377
}
317378

318379
nlohmann::json info;
319380
info["start"] = location_itr->first;
320-
info["end"] = location_itr->first + sequence_length;
381+
info["end"] = location_itr->first + sequence_length - 1;
321382
info["type"] = "match";
322383
info["others"] = others;
323384

324385
result.push_back(info);
325386
}
387+
// ********************************************
388+
389+
// ******** WRITE THE COMMON MATCHES ********
390+
// all of the common matches for this submission
391+
std::map<location_in_submission, std::set<HashLocation> > common_matches = submission_itr->getCommonMatches();
392+
393+
// loop over each of the common locations in the current submission
394+
for (std::map<location_in_submission, std::set<HashLocation> >::const_iterator location_itr
395+
=common_matches.begin(); location_itr != common_matches.end(); ++location_itr) {
396+
397+
// stores matches of hash locations across other submissions in the class
398+
std::vector<nlohmann::json> others;
399+
400+
{
401+
// generate a specific element of the "others" vector
402+
// set the variables to their initial values
403+
std::set<HashLocation>::const_iterator matching_positions_itr = location_itr->second.begin();
404+
nlohmann::json other;
405+
other["username"] = matching_positions_itr->student;
406+
other["version"] = matching_positions_itr->version;
407+
std::vector<nlohmann::json> matchingpositions;
408+
nlohmann::json position;
409+
position["start"] = matching_positions_itr->location;
410+
position["end"] = matching_positions_itr->location + sequence_length - 1;
411+
matchingpositions.push_back(position);
412+
413+
// search for all matching positions of the common match in other submissions
414+
if (location_itr->second.size() > 1) {
415+
++matching_positions_itr;
416+
417+
// loop over all of the other matching positions
418+
for (; matching_positions_itr != location_itr->second.end(); ++matching_positions_itr) {
419+
420+
// keep iterating and editing the same object until we get to a different submission
421+
if (matching_positions_itr->student != other["username"] || matching_positions_itr->version != other["version"]) {
422+
// found a different one, we push the old one and start over
423+
other["matchingpositions"] = matchingpositions;
424+
others.push_back(other);
425+
426+
matchingpositions.clear();
427+
other["username"] = matching_positions_itr->student;
428+
other["version"] = matching_positions_itr->version;
429+
}
430+
position["start"] = matching_positions_itr->location;
431+
position["end"] = matching_positions_itr->location + sequence_length - 1;
432+
matchingpositions.push_back(position);
433+
}
434+
}
435+
436+
other["matchingpositions"] = matchingpositions;
437+
others.push_back(other);
438+
}
439+
440+
nlohmann::json info;
441+
info["start"] = location_itr->first;
442+
info["end"] = location_itr->first + sequence_length - 1;
443+
info["type"] = "common";
444+
info["others"] = others;
445+
446+
result.push_back(info);
447+
}
448+
// ********************************************
449+
326450

327451
// ---------------------------------------------------------------------------
328452
// Done creating the JSON file/objects, now we merge them to shrink them in size
329453

454+
/*
330455
// Merge matching regions:
331456
if (result.size() > 0) { // check to make sure that there are more than 1 positions (if it's 1, we can't merge anyway)
332457
// loop through all positions
@@ -364,7 +489,7 @@ int main(int argc, char* argv[]) {
364489
}
365490
}
366491
}
367-
}
492+
}*/
368493

369494
// save the file with matches per user
370495
nlohmann::json match_data = result;
@@ -382,14 +507,16 @@ int main(int argc, char* argv[]) {
382507
my_percent = int((my_counter / float(all_submissions.size())) * 100);
383508
std::cout << "merging: " << my_percent << "% complete" << std::endl;
384509
}
510+
385511
}
386512
std::cout << "done merging and writing matches files" << std::endl;
387513

388514
// ---------------------------------------------------------------------------
389-
// Create a rankings of users by percentage match
515+
// Create a general summary of rankings of users by percentage match
390516

391-
std::string ranking_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/";
392-
std::string ranking_file = ranking_dir+gradeable+".txt";
517+
// create a single file of students ranked by highest percentage of code plagiarised
518+
std::string ranking_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/"+gradeable+"/";
519+
std::string ranking_file = ranking_dir+"overall_ranking.txt";
393520
boost::filesystem::create_directories(ranking_dir);
394521
std::ofstream ranking_ostr(ranking_file);
395522

@@ -400,7 +527,7 @@ int main(int argc, char* argv[]) {
400527
for (std::vector<Submission>::iterator submission_itr = all_submissions.begin();
401528
submission_itr != all_submissions.end(); ++submission_itr) {
402529

403-
float percentMatch = (100.0 * submission_itr->getSuspiciousMatches().size()) / submission_itr->getHashes().size();
530+
float percentMatch = submission_itr->getPercentage();
404531

405532
std::unordered_map<std::string, std::pair<int, float> >::iterator highest_matches_itr
406533
= highest_matches.find(submission_itr->student());
@@ -431,7 +558,51 @@ int main(int argc, char* argv[]) {
431558
<< std::setw(3) << std::right << ranking[i].version << std::endl;
432559
}
433560

561+
562+
// ---------------------------------------------------------------------------
563+
// create a rankings file for every submission. the file contains all the other
564+
// students it shares matches with, sorted in decreasing order of percent match
565+
566+
for (std::vector<Submission>::iterator submission_itr = all_submissions.begin();
567+
submission_itr != all_submissions.end(); ++submission_itr) {
568+
569+
// create the directory and a file to write into
570+
std::string ranking_student_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/"
571+
+gradeable+"/"+submission_itr->student()+"/"+std::to_string(submission_itr->version())+"/";
572+
std::string ranking_student_file = ranking_student_dir+submission_itr->student()+"_"+std::to_string(submission_itr->version())+".txt";
573+
boost::filesystem::create_directories(ranking_student_dir);
574+
std::ofstream ranking_student_ostr(ranking_student_file);
575+
576+
// find and sort the other submissions it matches with
577+
std::vector<StudentRanking> student_ranking;
578+
std::unordered_map<std::string, std::unordered_map<int, int> > matches = submission_itr->getStudentsMatched();
579+
for (std::unordered_map<std::string, std::unordered_map<int, int> >::const_iterator matches_itr = matches.begin();
580+
matches_itr != matches.end(); ++matches_itr) {
581+
582+
for (std::unordered_map<int, int>::const_iterator version_itr = matches_itr->second.begin();
583+
version_itr != matches_itr->second.end(); ++version_itr) {
584+
585+
// the percent match is currently calculated using the number of hashes that match between this
586+
// submission and the other submission, over the total number of hashes this submission has.
587+
// In other words, the percentage is how much of this submission's code was plagiarised from the other.
588+
float percent = 100.0 * float(version_itr->second) / submission_itr->getNumHashes();
589+
student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, percent));
590+
}
591+
}
592+
593+
std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter);
594+
595+
// finally, write the file of ranking for this submission
596+
for (unsigned int i = 0; i < student_ranking.size(); i++) {
597+
ranking_student_ostr
598+
<< std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].percent << "% "
599+
<< std::setw(15) << std::left << student_ranking[i].student << " "
600+
<< std::setw(3) << std::right << student_ranking[i].version << std::endl;
601+
}
602+
}
603+
604+
434605
// ---------------------------------------------------------------------------
435606
std::cout << "done" << std::endl;
436-
607+
437608
}

0 commit comments

Comments
 (0)