@@ -54,26 +54,56 @@ class Submission {
5454 const std::vector<std::pair<hash, location_in_submission>> & getHashes () const { return hashes; }
5555 void addSuspiciousMatch (location_in_submission location, const HashLocation &matching_location) {
5656 std::map<location_in_submission, std::set<HashLocation>>::iterator itr = suspicious_matches.find (location);
57-
57+ // TODO: is this if-else necessary? would this not work if we just did?: suspicious_matches[location].insert(matching_location);
5858 if (itr != suspicious_matches.end ()) {
59- // location already exists in the map, so we just append the location to the vector
59+ // location already exists in the map, so we just append the location to the set
6060 suspicious_matches[location].insert (matching_location);
6161 } else {
62- // intialize the vector and add the location
62+ // intialize the set and add the location
6363 std::set<HashLocation> s;
6464 s.insert (matching_location);
6565 suspicious_matches[location] = s;
6666 }
67+ // update the students_matched container
68+ students_matched[matching_location.student ][matching_location.version ]++;
69+ }
70+ void addCommonMatch (location_in_submission location, const HashLocation &matching_location) {
71+ std::map<location_in_submission, std::set<HashLocation>>::iterator itr = common_matches.find (location);
72+
73+ if (itr != common_matches.end ()) {
74+ // location already exists in the map, so we just append the location to the set
75+ common_matches[location].insert (matching_location);
76+ } else {
77+ // intialize the set and add the location
78+ std::set<HashLocation> s;
79+ s.insert (matching_location);
80+ common_matches[location] = s;
81+ }
6782 }
6883 const std::map<location_in_submission, std::set<HashLocation> >& getSuspiciousMatches () const {
6984 return suspicious_matches;
7085 }
86+ const std::map<location_in_submission, std::set<HashLocation> >& getCommonMatches () const {
87+ return common_matches;
88+ }
89+ const std::unordered_map<std::string, std::unordered_map<int , int > >& getStudentsMatched () const {
90+ return students_matched;
91+ }
92+ unsigned int getNumHashes () const { return hashes.size (); }
93+ float getPercentage () const {
94+ return (100.0 * (suspicious_matches.size () + common_matches.size ())) / hashes.size ();
95+ }
7196
7297private:
// Student this submission belongs to (presumably what student() returns; the accessor is not visible here).
7398 std::string student_;
// Version number of this submission (presumably what version() returns; the accessor is not visible here).
7499 int version_;
// (hash, location) pairs of this submission; exposed read-only by getHashes().
75100 std::vector<std::pair<hash, location_in_submission> > hashes;
// For a location in this submission, the set of matching locations recorded by addSuspiciousMatch().
76101 std::map<location_in_submission, std::set<HashLocation> > suspicious_matches;
// Same shape as suspicious_matches, but filled by addCommonMatch().
102+ std::map<location_in_submission, std::set<HashLocation> > common_matches;
103+
104+ // a container to keep track of all the students this submission
105+ // matched and the number of matching hashes per submission
// Outer key: matching_location.student; inner key: matching_location.version;
// value: tally incremented in addSuspiciousMatch().
106+ std::unordered_map<std::string, std::unordered_map<int , int > > students_matched;
77107};
78108
79109
@@ -116,13 +146,23 @@ bool matchingPositionsAreAdjacent(const nlohmann::json &first, const nlohmann::j
116146}
117147
118148
119- // increments the end position for each of the matches in the json provided
149+ // increments the end position for each of the matches in the json provided,
150+ // merging overlapping regions where necessary
120151void incrementEndPositionsForMatches (nlohmann::json &matches) {
121152 nlohmann::json::iterator itr = matches.begin ();
122153 for (; itr != matches.end (); itr++) {
123154 nlohmann::json::iterator itr2 = (*itr)[" matchingpositions" ].begin ();
124- for (; itr2 != (*itr)[" matchingpositions" ].end (); itr2++) {
125- (*itr2)[" end" ] = (*itr2)[" end" ].get <int >() + 1 ;
155+ nlohmann::json::iterator itr3 = ++((*itr)[" matchingpositions" ].begin ());
156+ for (; itr3 != (*itr)[" matchingpositions" ].end ();) {
157+ if ((*itr2)[" end" ].get <int >() >= (*itr3)[" start" ]) {
158+ (*itr2)[" end" ] = (*itr3)[" end" ].get <int >();
159+ itr3 = (*itr)[" matchingpositions" ].erase (itr3);
160+ }
161+ else {
162+ (*itr2)[" end" ] = (*itr2)[" end" ].get <int >() + 1 ;
163+ itr2++;
164+ itr3++;
165+ }
126166 }
127167 }
128168}
@@ -174,8 +214,8 @@ int main(int argc, char* argv[]) {
174214 // ---------------------------------------------------------------------------
175215 // loop over all submissions and populate the all_hashes and all_submissions structures
176216
177- // Stores all the hashes and their locations across all submissions
178- std::unordered_map<hash, std::vector<HashLocation>> all_hashes;
217+ // Stores all the hashes and their locations across all submissions (sorted in "bins" of student names)
218+ std::unordered_map<hash, std::unordered_map<std::string, std:: vector<HashLocation> >> all_hashes;
179219 // Stores all submissions
180220 std::vector<Submission> all_submissions;
181221
@@ -205,7 +245,7 @@ int main(int argc, char* argv[]) {
205245 int location = 0 ;
206246 while (istr >> input_hash) {
207247 location++;
208- all_hashes[input_hash].push_back (HashLocation (username, version, location));
248+ all_hashes[input_hash][username] .push_back (HashLocation (username, version, location));
209249 submission.addHash (input_hash, location);
210250 }
211251
@@ -230,12 +270,29 @@ int main(int argc, char* argv[]) {
230270 std::vector<std::pair<hash, location_in_submission>>::const_iterator hash_itr = submission_itr->getHashes ().begin ();
231271 for (; hash_itr != submission_itr->getHashes ().end (); ++hash_itr) {
232272
233- // look up that hash in the all_hashes table, and look for occurences of that hash in other submisions
234- std::vector<HashLocation> occurences = all_hashes[hash_itr->first ];
235- std::vector<HashLocation>::iterator occurences_itr = occurences.begin ();
273+ // look up that hash in the all_hashes table, loop over all other students that have the same hash
274+ std::unordered_map<std::string, std:: vector<HashLocation> > occurences = all_hashes[hash_itr->first ];
275+ std::unordered_map<std::string, std:: vector<HashLocation> >::iterator occurences_itr = occurences.begin ();
236276 for (; occurences_itr != occurences.end (); ++occurences_itr) {
237- if (occurences_itr->student != submission_itr->student ()) {
238- submission_itr->addSuspiciousMatch (hash_itr->second , *occurences_itr);
277+
278+ // don't look for matches across submissions of the same student
279+ if (occurences_itr->first == submission_itr->student ()) {
280+ continue ;
281+ }
282+
283+ // save the locations of all other occurrences of the matching hash in other students' submissions
284+ std::vector<HashLocation>::iterator itr = occurences_itr->second .begin ();
285+ for (; itr != occurences_itr->second .end (); ++itr) {
286+
287+ if (occurences.size () > (unsigned int )threshold) {
288+ // if the number of students with matching code is more
289+ // than the threshold, it is considered common code
290+ submission_itr->addCommonMatch (hash_itr->second , *itr);
291+ } else {
292+ // save the match as a suspicious match
293+ submission_itr->addSuspiciousMatch (hash_itr->second , *itr);
294+ }
295+ // TODO: insert provided match here
239296 }
240297 }
241298 }
@@ -255,7 +312,7 @@ int main(int argc, char* argv[]) {
255312
256313 my_counter = 0 ;
257314 my_percent = 0 ;
258- std::cout << " merging regions and writing matches files..." << std::endl;
315+ std::cout << " writing matches files and merging regions ..." << std::endl;
259316
260317 // Loop over all of the submissions, writing a JSON file for each one if it has suspicious matches
261318 for (std::vector<Submission>::iterator submission_itr = all_submissions.begin ();
@@ -267,6 +324,8 @@ int main(int argc, char* argv[]) {
267324 // holds the JSON file to be written
268325 std::vector<nlohmann::json> result;
269326
327+
328+ // ******** WRITE THE SUSPICIOUS MATCHES ********
270329 // all of the suspicious matches for this submission
271330 std::map<location_in_submission, std::set<HashLocation> > suspicious_matches = submission_itr->getSuspiciousMatches ();
272331
@@ -277,7 +336,7 @@ int main(int argc, char* argv[]) {
277336 // stores matches of hash locations across other submissions in the class
278337 std::vector<nlohmann::json> others;
279338
280- {
339+ {
281340 // generate a specific element of the "others" vector
282341 // set the variables to their initial values
283342 std::set<HashLocation>::const_iterator matching_positions_itr = location_itr->second .begin ();
@@ -287,46 +346,112 @@ int main(int argc, char* argv[]) {
287346 std::vector<nlohmann::json> matchingpositions;
288347 nlohmann::json position;
289348 position[" start" ] = matching_positions_itr->location ;
290- position[" end" ] = matching_positions_itr->location + sequence_length;
349+ position[" end" ] = matching_positions_itr->location + sequence_length - 1 ;
291350 matchingpositions.push_back (position);
292- other[" matchingpositions" ] = matchingpositions;
293-
351+
294352 // search for all matching positions of the suspicious match in other submissions
295353 if (location_itr->second .size () > 1 ) {
296354 ++matching_positions_itr;
355+
297356 // loop over all of the other matching positions
298357 for (; matching_positions_itr != location_itr->second .end (); ++matching_positions_itr) {
358+
299359 // keep iterating and editing the same object until we get to a different submission
300360 if (matching_positions_itr->student != other[" username" ] || matching_positions_itr->version != other[" version" ]) {
301361 // found a different one, we push the old one and start over
362+ other[" matchingpositions" ] = matchingpositions;
302363 others.push_back (other);
303364
304365 matchingpositions.clear ();
305366 other[" username" ] = matching_positions_itr->student ;
306367 other[" version" ] = matching_positions_itr->version ;
307- position[" start" ] = matching_positions_itr->location ;
308- position[" end" ] = matching_positions_itr->location + sequence_length;
309368 }
310369 position[" start" ] = matching_positions_itr->location ;
311- position[" end" ] = matching_positions_itr->location + sequence_length;
370+ position[" end" ] = matching_positions_itr->location + sequence_length - 1 ;
312371 matchingpositions.push_back (position);
313372 }
314373 }
374+
375+ other[" matchingpositions" ] = matchingpositions;
315376 others.push_back (other);
316377 }
317378
318379 nlohmann::json info;
319380 info[" start" ] = location_itr->first ;
320- info[" end" ] = location_itr->first + sequence_length;
381+ info[" end" ] = location_itr->first + sequence_length - 1 ;
321382 info[" type" ] = " match" ;
322383 info[" others" ] = others;
323384
324385 result.push_back (info);
325386 }
387+ // ********************************************
388+
389+ // ******** WRITE THE COMMON MATCHES ********
390+ // all of the common matches for this submission
391+ std::map<location_in_submission, std::set<HashLocation> > common_matches = submission_itr->getCommonMatches ();
392+
393+ // loop over each of the common locations in the current submission
394+ for (std::map<location_in_submission, std::set<HashLocation> >::const_iterator location_itr
395+ =common_matches.begin (); location_itr != common_matches.end (); ++location_itr) {
396+
397+ // stores matches of hash locations across other submissions in the class
398+ std::vector<nlohmann::json> others;
399+
400+ {
401+ // generate a specific element of the "others" vector
402+ // set the variables to their initial values
403+ std::set<HashLocation>::const_iterator matching_positions_itr = location_itr->second .begin ();
404+ nlohmann::json other;
405+ other[" username" ] = matching_positions_itr->student ;
406+ other[" version" ] = matching_positions_itr->version ;
407+ std::vector<nlohmann::json> matchingpositions;
408+ nlohmann::json position;
409+ position[" start" ] = matching_positions_itr->location ;
410+ position[" end" ] = matching_positions_itr->location + sequence_length - 1 ;
411+ matchingpositions.push_back (position);
412+
413+ // search for all matching positions of the common match in other submissions
414+ if (location_itr->second .size () > 1 ) {
415+ ++matching_positions_itr;
416+
417+ // loop over all of the other matching positions
418+ for (; matching_positions_itr != location_itr->second .end (); ++matching_positions_itr) {
419+
420+ // keep iterating and editing the same object until we get to a different submission
421+ if (matching_positions_itr->student != other[" username" ] || matching_positions_itr->version != other[" version" ]) {
422+ // found a different one, we push the old one and start over
423+ other[" matchingpositions" ] = matchingpositions;
424+ others.push_back (other);
425+
426+ matchingpositions.clear ();
427+ other[" username" ] = matching_positions_itr->student ;
428+ other[" version" ] = matching_positions_itr->version ;
429+ }
430+ position[" start" ] = matching_positions_itr->location ;
431+ position[" end" ] = matching_positions_itr->location + sequence_length - 1 ;
432+ matchingpositions.push_back (position);
433+ }
434+ }
435+
436+ other[" matchingpositions" ] = matchingpositions;
437+ others.push_back (other);
438+ }
439+
440+ nlohmann::json info;
441+ info[" start" ] = location_itr->first ;
442+ info[" end" ] = location_itr->first + sequence_length - 1 ;
443+ info[" type" ] = " common" ;
444+ info[" others" ] = others;
445+
446+ result.push_back (info);
447+ }
448+ // ********************************************
449+
326450
327451 // ---------------------------------------------------------------------------
328452 // Done creating the JSON file/objects, now we merge them to shrink them in size
329453
454+ /*
330455 // Merge matching regions:
331456 if (result.size() > 0) { // check to make sure that there are more than 1 positions (if it's 1, we can't merge anyway)
332457 // loop through all positions
@@ -364,7 +489,7 @@ int main(int argc, char* argv[]) {
364489 }
365490 }
366491 }
367- }
492+ }*/
368493
369494 // save the file with matches per user
370495 nlohmann::json match_data = result;
@@ -382,14 +507,16 @@ int main(int argc, char* argv[]) {
382507 my_percent = int ((my_counter / float (all_submissions.size ())) * 100 );
383508 std::cout << " merging: " << my_percent << " % complete" << std::endl;
384509 }
510+
385511 }
386512 std::cout << " done merging and writing matches files" << std::endl;
387513
388514 // ---------------------------------------------------------------------------
389- // Create a rankings of users by percentage match
515+ // Create a general summary of rankings of users by percentage match
390516
391- std::string ranking_dir = " /var/local/submitty/courses/" +semester+" /" +course+" /lichen/ranking/" ;
392- std::string ranking_file = ranking_dir+gradeable+" .txt" ;
517+ // create a single file of students ranked by highest percentage of code plagiarised
518+ std::string ranking_dir = " /var/local/submitty/courses/" +semester+" /" +course+" /lichen/ranking/" +gradeable+" /" ;
519+ std::string ranking_file = ranking_dir+" overall_ranking.txt" ;
393520 boost::filesystem::create_directories (ranking_dir);
394521 std::ofstream ranking_ostr (ranking_file);
395522
@@ -400,7 +527,7 @@ int main(int argc, char* argv[]) {
400527 for (std::vector<Submission>::iterator submission_itr = all_submissions.begin ();
401528 submission_itr != all_submissions.end (); ++submission_itr) {
402529
403- float percentMatch = ( 100.0 * submission_itr->getSuspiciousMatches (). size ()) / submission_itr-> getHashes (). size ();
530+ float percentMatch = submission_itr->getPercentage ();
404531
405532 std::unordered_map<std::string, std::pair<int , float > >::iterator highest_matches_itr
406533 = highest_matches.find (submission_itr->student ());
@@ -431,7 +558,51 @@ int main(int argc, char* argv[]) {
431558 << std::setw (3 ) << std::right << ranking[i].version << std::endl;
432559 }
433560
561+
562+ // ---------------------------------------------------------------------------
563+ // create a rankings file for every submission. the file contains all the other
564+ // submissions it shares matches with, sorted in decreasing order of percent match
565+
566+ for (std::vector<Submission>::iterator submission_itr = all_submissions.begin ();
567+ submission_itr != all_submissions.end (); ++submission_itr) {
568+
569+ // create the directory and a file to write into
570+ std::string ranking_student_dir = " /var/local/submitty/courses/" +semester+" /" +course+" /lichen/ranking/"
571+ +gradeable+" /" +submission_itr->student ()+" /" +std::to_string (submission_itr->version ())+" /" ;
572+ std::string ranking_student_file = ranking_student_dir+submission_itr->student ()+" _" +std::to_string (submission_itr->version ())+" .txt" ;
573+ boost::filesystem::create_directories (ranking_student_dir);
574+ std::ofstream ranking_student_ostr (ranking_student_file);
575+
576+ // find and sort the other submissions it matches with
577+ std::vector<StudentRanking> student_ranking;
578+ std::unordered_map<std::string, std::unordered_map<int , int > > matches = submission_itr->getStudentsMatched ();
579+ for (std::unordered_map<std::string, std::unordered_map<int , int > >::const_iterator matches_itr = matches.begin ();
580+ matches_itr != matches.end (); ++matches_itr) {
581+
582+ for (std::unordered_map<int , int >::const_iterator version_itr = matches_itr->second .begin ();
583+ version_itr != matches_itr->second .end (); ++version_itr) {
584+
585+ // the percent match is currently calculated using the number of hashes that match between this
586+ // submission and the other submission, over the total number of hashes this submission has.
587+ // In other words, the percentage is how much of this submission's code was plagiarised from the other.
588+ float percent = 100.0 * float (version_itr->second ) / submission_itr->getNumHashes ();
589+ student_ranking.push_back (StudentRanking (matches_itr->first , version_itr->first , percent));
590+ }
591+ }
592+
593+ std::sort (student_ranking.begin (), student_ranking.end (), ranking_sorter);
594+
595+ // finally, write the file of ranking for this submission
596+ for (unsigned int i = 0 ; i < student_ranking.size (); i++) {
597+ ranking_student_ostr
598+ << std::setw (6 ) << std::setprecision (2 ) << std::fixed << student_ranking[i].percent << " % "
599+ << std::setw (15 ) << std::left << student_ranking[i].student << " "
600+ << std::setw (3 ) << std::right << student_ranking[i].version << std::endl;
601+ }
602+ }
603+
604+
434605 // ---------------------------------------------------------------------------
435606 std::cout << " done" << std::endl;
436-
607+
437608}
0 commit comments