[Feature:Plagiarism] common code & individual ranking (#28)

sbelsk · williamjallen · web-flow · commit 7c6167903ee9 · 2021-06-07T13:47:05.000-04:00
* Refactor: fix warnings in C++, make code more readable

* Merge adjacent regions

* Nightly progress

* Reimplement algorithm

* Implement save matches

* Fix bugs

* Add rankings

* Minor changes

* Final touches

* Add "common code" functionality

* Debug output files

* Fix merging

* Fix bugs in writing the files

* Initial draft for individual student ranking

* Fix compiler errors

* Update ranking directory structure

* Minor fix to the directory path

* Another minor fix

Co-authored-by: williamjallen &lt;dalearn@icloud.com&gt;
diff --git a/compare_hashes/compare_hashes.cpp b/compare_hashes/compare_hashes.cpp
@@ -54,26 +54,56 @@ class Submission {
   const std::vector<std::pair<hash, location_in_submission>> & getHashes() const { return hashes; }
   void addSuspiciousMatch(location_in_submission location, const HashLocation &matching_location) {
     std::map<location_in_submission, std::set<HashLocation>>::iterator itr = suspicious_matches.find(location);
-
+    // TODO: is this if-else necessary? would this not work if we just did?: suspicious_matches[location].insert(matching_location);
     if (itr != suspicious_matches.end()) {
-      // location already exists in the map, so we just append the location to the vector
+      // location already exists in the map, so we just append the location to the set
       suspicious_matches[location].insert(matching_location);
     } else {
-      // intialize the vector and add the location
+      // intialize the set and add the location
       std::set<HashLocation> s;
       s.insert(matching_location);
       suspicious_matches[location] = s;
     }
+    // update the students_matched container
+    students_matched[matching_location.student][matching_location.version]++;
+  }
+  void addCommonMatch(location_in_submission location, const HashLocation &matching_location) {
+    std::map<location_in_submission, std::set<HashLocation>>::iterator itr = common_matches.find(location);
+
+    if (itr != common_matches.end()) {
+      // location already exists in the map, so we just append the location to the set
+      common_matches[location].insert(matching_location);
+    } else {
+      // intialize the set and add the location
+      std::set<HashLocation> s;
+      s.insert(matching_location);
+      common_matches[location] = s;
+    }
   }
   const std::map<location_in_submission, std::set<HashLocation> >& getSuspiciousMatches() const {
     return suspicious_matches;
   }
+  const std::map<location_in_submission, std::set<HashLocation> >& getCommonMatches() const {
+    return common_matches;
+  }
+  const std::unordered_map<std::string, std::unordered_map<int, int> >& getStudentsMatched() const {
+    return students_matched;
+  }
+  unsigned int getNumHashes() const { return hashes.size(); }
+  float getPercentage() const {
+    return (100.0 * (suspicious_matches.size() + common_matches.size())) / hashes.size();
+  }
 
 private:
   std::string student_;
   int version_;
   std::vector<std::pair<hash, location_in_submission> > hashes;
   std::map<location_in_submission, std::set<HashLocation> > suspicious_matches;
+  std::map<location_in_submission, std::set<HashLocation> > common_matches;
+
+  // a container to keep track of all the students this submission
+  // matched and the number of matching hashes per submission
+  std::unordered_map<std::string, std::unordered_map<int, int> > students_matched;
 };
 
 
@@ -116,13 +146,23 @@ bool matchingPositionsAreAdjacent(const nlohmann::json &first, const nlohmann::j
 }
 
 
-// increments the end position for each of the matches in the json provided
+// increments the end position for each of the matches in the json provided,
+// merging overlapping regions where necessary
 void incrementEndPositionsForMatches(nlohmann::json &matches) {
   nlohmann::json::iterator itr = matches.begin();
   for (; itr != matches.end(); itr++) {
     nlohmann::json::iterator itr2 = (*itr)["matchingpositions"].begin();
-    for (; itr2 != (*itr)["matchingpositions"].end(); itr2++) {
-      (*itr2)["end"] = (*itr2)["end"].get<int>() + 1;
+    nlohmann::json::iterator itr3 = ++((*itr)["matchingpositions"].begin());
+    for (; itr3 != (*itr)["matchingpositions"].end();) {
+      if ((*itr2)["end"].get<int>() >= (*itr3)["start"]) {
+        (*itr2)["end"] = (*itr3)["end"].get<int>();
+        itr3 = (*itr)["matchingpositions"].erase(itr3);
+      }
+      else {
+        (*itr2)["end"] = (*itr2)["end"].get<int>() + 1;
+        itr2++;
+        itr3++;
+      }
     }
   }
 }
@@ -174,8 +214,8 @@ int main(int argc, char* argv[]) {
   // ---------------------------------------------------------------------------
   // loop over all submissions and populate the all_hashes and all_submissions structures
 
-  // Stores all the hashes and their locations across all submissions
-  std::unordered_map<hash, std::vector<HashLocation>> all_hashes;
+  // Stores all the hashes and their locations across all submissions (sorted in "bins" of student names)
+  std::unordered_map<hash, std::unordered_map<std::string, std::vector<HashLocation>>> all_hashes;
   // Stores all submissions
   std::vector<Submission> all_submissions;
 
@@ -205,7 +245,7 @@ int main(int argc, char* argv[]) {
       int location = 0;
       while (istr >> input_hash) {
         location++;
-        all_hashes[input_hash].push_back(HashLocation(username, version, location));
+        all_hashes[input_hash][username].push_back(HashLocation(username, version, location));
         submission.addHash(input_hash, location);
       }
 
@@ -230,12 +270,29 @@ int main(int argc, char* argv[]) {
     std::vector<std::pair<hash, location_in_submission>>::const_iterator hash_itr = submission_itr->getHashes().begin();
     for (; hash_itr != submission_itr->getHashes().end(); ++hash_itr) {
 
-      // look up that hash in the all_hashes table, and look for occurences of that hash in other submisions
-      std::vector<HashLocation> occurences = all_hashes[hash_itr->first];
-      std::vector<HashLocation>::iterator occurences_itr = occurences.begin();
+      // look up that hash in the all_hashes table, loop over all other students that have the same hash
+      std::unordered_map<std::string, std::vector<HashLocation>> occurences = all_hashes[hash_itr->first];
+      std::unordered_map<std::string, std::vector<HashLocation>>::iterator occurences_itr = occurences.begin();
       for (; occurences_itr != occurences.end(); ++occurences_itr) {
-        if (occurences_itr->student != submission_itr->student()) {
-          submission_itr->addSuspiciousMatch(hash_itr->second, *occurences_itr);
+
+        // don't look for matches across submissions of the same student
+        if (occurences_itr->first == submission_itr->student()) {
+          continue;
+        }
+
+        // save the locations of all other occurences of the matching hash in other students' submissions
+        std::vector<HashLocation>::iterator itr = occurences_itr->second.begin();
+        for (; itr != occurences_itr->second.end(); ++itr) {
+
+          if (occurences.size() > (unsigned int)threshold) {
+            // if the number of students with matching code is more
+            // than the threshold, it is considered common code
+            submission_itr->addCommonMatch(hash_itr->second, *itr);
+          } else {
+            // save the match as a suspicous match
+            submission_itr->addSuspiciousMatch(hash_itr->second, *itr);
+          }
+          // TODO: insert provided match here
         }
       }
     }
@@ -255,7 +312,7 @@ int main(int argc, char* argv[]) {
 
   my_counter = 0;
   my_percent = 0;
-  std::cout << "merging regions and writing matches files..." << std::endl;
+  std::cout << "writing matches files and merging regions..." << std::endl;
 
   // Loop over all of the submissions, writing a JSON file for each one if it has suspicious matches
   for (std::vector<Submission>::iterator submission_itr = all_submissions.begin();
@@ -267,6 +324,8 @@ int main(int argc, char* argv[]) {
     // holds the JSON file to be written
     std::vector<nlohmann::json> result;
 
+
+    // ********  WRITE THE SUSPICIOUS MATCHES  ********
     // all of the suspicious matches for this submission
     std::map<location_in_submission, std::set<HashLocation> > suspicious_matches = submission_itr->getSuspiciousMatches();
 
@@ -277,7 +336,7 @@ int main(int argc, char* argv[]) {
       // stores matches of hash locations across other submssions in the class
       std::vector<nlohmann::json> others;
 
-      { 
+      {
         // generate a specific element of the "others" vector
         // set the variables to their initial values
         std::set<HashLocation>::const_iterator matching_positions_itr = location_itr->second.begin();
@@ -287,46 +346,112 @@ int main(int argc, char* argv[]) {
         std::vector<nlohmann::json> matchingpositions;
         nlohmann::json position;
         position["start"] = matching_positions_itr->location;
-        position["end"] = matching_positions_itr->location + sequence_length;
+        position["end"] = matching_positions_itr->location + sequence_length - 1;
         matchingpositions.push_back(position);
-        other["matchingpositions"] = matchingpositions;
-        
+
         // search for all matching positions of the suspicious match in other submissions
         if (location_itr->second.size() > 1) {
           ++matching_positions_itr;
+
           // loop over all of the other matching positions
           for (; matching_positions_itr != location_itr->second.end(); ++matching_positions_itr) {
+
             // keep iterating and editing the same object until a we get to a different submission
             if (matching_positions_itr->student != other["username"] || matching_positions_itr->version != other["version"]) {
               // found a different one, we push the old one and start over
+              other["matchingpositions"] = matchingpositions;
               others.push_back(other);
 
               matchingpositions.clear();
               other["username"] = matching_positions_itr->student;
               other["version"] = matching_positions_itr->version;
-              position["start"] = matching_positions_itr->location;
-              position["end"] = matching_positions_itr->location + sequence_length;
             }
             position["start"] = matching_positions_itr->location;
-            position["end"] = matching_positions_itr->location + sequence_length;
+            position["end"] = matching_positions_itr->location + sequence_length - 1;
             matchingpositions.push_back(position);
           }
         }
+
+        other["matchingpositions"] = matchingpositions;
         others.push_back(other);
       }
 
       nlohmann::json info;
       info["start"] = location_itr->first;
-      info["end"] = location_itr->first + sequence_length;
+      info["end"] = location_itr->first + sequence_length - 1;
       info["type"] = "match";
       info["others"] = others;
 
       result.push_back(info);
     }
+    // ********************************************
+
+    // ********  WRITE THE COMMON MATCHES  ********
+    // all of the common matches for this submission
+    std::map<location_in_submission, std::set<HashLocation> > common_matches = submission_itr->getCommonMatches();
+
+    // loop over each of the common locations in the current submission
+    for (std::map<location_in_submission, std::set<HashLocation> >::const_iterator location_itr
+         =common_matches.begin(); location_itr != common_matches.end(); ++location_itr) {
+
+      // stores matches of hash locations across other submssions in the class
+      std::vector<nlohmann::json> others;
+
+      {
+        // generate a specific element of the "others" vector
+        // set the variables to their initial values
+        std::set<HashLocation>::const_iterator matching_positions_itr = location_itr->second.begin();
+        nlohmann::json other;
+        other["username"] = matching_positions_itr->student;
+        other["version"] = matching_positions_itr->version;
+        std::vector<nlohmann::json> matchingpositions;
+        nlohmann::json position;
+        position["start"] = matching_positions_itr->location;
+        position["end"] = matching_positions_itr->location + sequence_length - 1;
+        matchingpositions.push_back(position);
+
+        // search for all matching positions of the suspicious match in other submissions
+        if (location_itr->second.size() > 1) {
+          ++matching_positions_itr;
+
+          // loop over all of the other matching positions
+          for (; matching_positions_itr != location_itr->second.end(); ++matching_positions_itr) {
+
+            // keep iterating and editing the same object until a we get to a different submission
+            if (matching_positions_itr->student != other["username"] || matching_positions_itr->version != other["version"]) {
+              // found a different one, we push the old one and start over
+              other["matchingpositions"] = matchingpositions;
+              others.push_back(other);
+
+              matchingpositions.clear();
+              other["username"] = matching_positions_itr->student;
+              other["version"] = matching_positions_itr->version;
+            }
+            position["start"] = matching_positions_itr->location;
+            position["end"] = matching_positions_itr->location + sequence_length - 1;
+            matchingpositions.push_back(position);
+          }
+        }
+
+        other["matchingpositions"] = matchingpositions;
+        others.push_back(other);
+      }
+
+      nlohmann::json info;
+      info["start"] = location_itr->first;
+      info["end"] = location_itr->first + sequence_length - 1;
+      info["type"] = "common";
+      info["others"] = others;
+
+      result.push_back(info);
+    }
+    // ********************************************
+
 
     // ---------------------------------------------------------------------------
     // Done creating the JSON file/objects, now we merge them to shrink them in size
 
+    /*
     // Merge matching regions:
     if (result.size() > 0) { // check to make sure that there are more than 1 positions (if it's 1, we can't merge anyway)
       // loop through all positions
@@ -364,7 +489,7 @@ int main(int argc, char* argv[]) {
           }
         }
       }
-    }
+    }*/
 
     // save the file with matches per user
     nlohmann::json match_data = result;
@@ -382,14 +507,16 @@ int main(int argc, char* argv[]) {
       my_percent = int((my_counter / float(all_submissions.size())) * 100);
       std::cout << "merging: " << my_percent << "% complete" << std::endl;
     }
+
   }
   std::cout << "done merging and writing matches files" << std::endl;
 
   // ---------------------------------------------------------------------------
-  // Create a rankings of users by percentage match
+  // Create a general summary of rankings of users by percentage match
 
-  std::string ranking_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/";
-  std::string ranking_file = ranking_dir+gradeable+".txt";
+  // create a single file of students ranked by highest percentage of code plagiarised
+  std::string ranking_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/"+gradeable+"/";
+  std::string ranking_file = ranking_dir+"overall_ranking.txt";
   boost::filesystem::create_directories(ranking_dir);
   std::ofstream ranking_ostr(ranking_file);
 
@@ -400,7 +527,7 @@ int main(int argc, char* argv[]) {
   for (std::vector<Submission>::iterator submission_itr = all_submissions.begin();
        submission_itr != all_submissions.end(); ++submission_itr) {
 
-    float percentMatch = (100.0 * submission_itr->getSuspiciousMatches().size()) / submission_itr->getHashes().size();
+    float percentMatch = submission_itr->getPercentage();
 
     std::unordered_map<std::string, std::pair<int, float> >::iterator highest_matches_itr
         = highest_matches.find(submission_itr->student());
@@ -431,7 +558,51 @@ int main(int argc, char* argv[]) {
       << std::setw(3) << std::right << ranking[i].version << std::endl;
   }
 
+
+  // ---------------------------------------------------------------------------
+  // create a rankings file for every submission. the file contains all the other
+  // students share matches, sorted by decreasing order of the percent match
+
+  for (std::vector<Submission>::iterator submission_itr = all_submissions.begin();
+       submission_itr != all_submissions.end(); ++submission_itr) {
+
+    // create the directory and a file to write into
+    std::string ranking_student_dir = "/var/local/submitty/courses/"+semester+"/"+course+"/lichen/ranking/"
+                                      +gradeable+"/"+submission_itr->student()+"/"+std::to_string(submission_itr->version())+"/";
+    std::string ranking_student_file = ranking_student_dir+submission_itr->student()+"_"+std::to_string(submission_itr->version())+".txt";
+    boost::filesystem::create_directories(ranking_student_dir);
+    std::ofstream ranking_student_ostr(ranking_student_file);
+
+    // find and sort the other submissions it matches with
+    std::vector<StudentRanking> student_ranking;
+    std::unordered_map<std::string, std::unordered_map<int, int> > matches = submission_itr->getStudentsMatched();
+    for (std::unordered_map<std::string, std::unordered_map<int, int> >::const_iterator matches_itr = matches.begin();
+         matches_itr != matches.end(); ++matches_itr) {
+
+      for (std::unordered_map<int, int>::const_iterator version_itr = matches_itr->second.begin();
+           version_itr != matches_itr->second.end(); ++version_itr) {
+
+        // the percent match is currently calculated using the number of hashes that match between this
+        // submission and the other submission, over the total number of hashes this submission has.
+        // In other words, the percentage is how much of this submission's code was plgairised from the other.
+        float percent = 100.0 * float(version_itr->second) / submission_itr->getNumHashes();
+        student_ranking.push_back(StudentRanking(matches_itr->first, version_itr->first, percent));
+      }
+    }
+
+    std::sort(student_ranking.begin(), student_ranking.end(), ranking_sorter);
+
+    // finally, write the file of ranking for this submission
+    for (unsigned int i = 0; i < student_ranking.size(); i++) {
+      ranking_student_ostr
+        << std::setw(6) << std::setprecision(2) << std::fixed << student_ranking[i].percent << "%   "
+        << std::setw(15) << std::left << student_ranking[i].student << " "
+        << std::setw(3) << std::right << student_ranking[i].version << std::endl;
+    }
+  }
+
+
   // ---------------------------------------------------------------------------
   std::cout << "done" << std::endl;
-  
+
 }