
Commit bf7e931
[Bugfix:Plagiarism] Fix buggy plaintext tokenization (#58)

* Rewrite plaintext tokenizer in Python instead of C++
* Update tests
* Forgot minor detail
* Update tokens.json
1 parent eb8259b commit bf7e931

8 files changed: +153 -222 lines

bin/concatenate_all.py

Lines changed: 2 additions & 1 deletion
@@ -14,7 +14,8 @@
 from pathlib import Path
 
 IGNORED_FILES = [
-    ".submit.timestamp"
+    ".submit.timestamp",
+    ".user_assignment_access.json"
 ]
 
 with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
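
For context, IGNORED_FILES is what keeps bookkeeping files out of the concatenated submissions. A minimal sketch of how such an ignore list is typically applied when walking a submission tree follows; the helper below is illustrative only, not the actual logic in bin/concatenate_all.py:

from pathlib import Path

IGNORED_FILES = [
    ".submit.timestamp",
    ".user_assignment_access.json"
]

def files_to_concatenate(submission_dir):
    # Yield every regular file in the submission, skipping bookkeeping
    # files such as .user_assignment_access.json so they are never
    # concatenated and tokenized as student work.
    for path in sorted(Path(submission_dir).rglob("*")):
        if path.is_file() and path.name not in IGNORED_FILES:
            yield path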

bin/tokenize_all.py

Lines changed: 8 additions & 4 deletions
@@ -29,11 +29,15 @@ def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
 
     if not language_token_data.get("input_as_argument"):
         my_concatenated_file = f'< {my_concatenated_file}'
-    cli_args = ' '.join(language_token_data["command_args"])\
-        if "command_args" in language_token_data else ''
 
-    command = f'{language_token_data["command_executable"]} {tokenizer} {cli_args}\
-        {my_concatenated_file} > {my_tokenized_file}'.strip()
+    if "command_args" in language_token_data:
+        cli_args = " ".join(language_token_data["command_args"])
+    else:
+        cli_args = ""
+
+    command = f"{language_token_data['command_executable']} {tokenizer} "\
+        f"{cli_args} {my_concatenated_file} > {my_tokenized_file}".strip()
+
     os.system(command)
 
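
Note the subtle bug in the old command assembly: the backslash sat inside the f-string literal, so the continuation line's leading indentation became part of the command string (and the trailing .strip() only trims the ends, not the middle). Splitting the command across two adjacent f-string literals, with the backslash outside the quotes, avoids that. A self-contained illustration of the difference:

# A backslash inside a string literal continues the string onto the next
# physical line, and that line's leading whitespace becomes part of the value:
broken = f'a\
    b'
print(repr(broken))   # 'a    b' -- the indentation leaked into the string

# Adjacent f-string literals are concatenated by the parser; the backslash
# now sits outside the quotes, so no stray whitespace is introduced:
fixed = f"a " \
        f"b"
print(repr(fixed))    # 'a b'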

install_lichen.sh

Lines changed: 1 addition & 10 deletions
@@ -49,16 +49,6 @@ fi
 mkdir -p ${lichen_installation_dir}/bin
 mkdir -p ${lichen_installation_dir}/tools/assignments
 
-#--------------------
-# plaintext tool
-pushd ${lichen_repository_dir} > /dev/null
-clang++ -I ${lichen_vendor_dir} -std=c++11 -Wall -O3 tokenizer/plaintext/plaintext_tokenizer.cpp -o ${lichen_installation_dir}/bin/plaintext_tokenizer.out
-if [ $? -ne 0 ]; then
-    echo -e "ERROR: FAILED TO BUILD PLAINTEXT TOKENIZER\n"
-    exit 1
-fi
-popd > /dev/null
-
 
 #-------------------------------------------
 # compile & install the hash comparison tool

@@ -75,6 +65,7 @@ popd > /dev/null
 
 cp ${lichen_repository_dir}/bin/* ${lichen_installation_dir}/bin/
 
+cp ${lichen_repository_dir}/tokenizer/plaintext/plaintext_tokenizer.py ${lichen_installation_dir}/bin/plaintext_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir}/bin/c_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/java/java_tokenizer.py ${lichen_installation_dir}/bin/java_tokenizer.py
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-null
+[]

tests/unittest/tests.py

Lines changed: 5 additions & 5 deletions
@@ -23,7 +23,7 @@ def testPlaintextTokenizer(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -41,7 +41,7 @@ def testPlaintextTokenizerIgnorePunctuation(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_ignore_punctuation.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --ignore_punctuation < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --ignore_punctuation {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -59,7 +59,7 @@ def testPlaintextTokenizerToLower(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_to_lower.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --to_lower < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --to_lower {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -77,7 +77,7 @@ def testPlaintextTokenizerIgnoreNewlines(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_ignore_newlines.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --ignore_newlines < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --ignore_newlines {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -95,7 +95,7 @@ def testPlaintextTokenizerIgnoreEverything(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_ignore_everything.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --ignore_punctuation --to_lower --ignore_numbers --ignore_newlines < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --ignore_punctuation --to_lower --ignore_numbers --ignore_newlines {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

tokenizer/data.json

Lines changed: 3 additions & 3 deletions
@@ -1,8 +1,8 @@
 {
     "plaintext": {
-        "tokenizer": "plaintext_tokenizer.out",
-        "command_executable": "",
-        "input_as_argument": false,
+        "tokenizer": "plaintext_tokenizer.py",
+        "command_executable": "python3",
+        "input_as_argument": true,
         "command_args": [
             "--ignore_newlines"
         ],
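
Given this entry and the updated loop in bin/tokenize_all.py, the assembled command for plaintext submissions now has roughly this shape (the file paths here are illustrative):

python3 ${lichen_installation_dir}/bin/plaintext_tokenizer.py --ignore_newlines concatenated.txt > tokens.json

Because input_as_argument is now true, the input file is passed as a positional argument rather than redirected into the process with <, matching how the updated tests invoke the new tokenizer.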

tokenizer/plaintext/plaintext_tokenizer.cpp

Lines changed: 0 additions & 198 deletions
This file was deleted.
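
The replacement Python tokenizer itself is not part of this diff, but based on the flags exercised by the tests above, a minimal sketch of a tokenizer with the same command-line shape might look like the following. The token fields and exact filtering rules here are assumptions for illustration, not the actual plaintext_tokenizer.py:

#!/usr/bin/env python3
# Sketch of a plaintext tokenizer: emits a JSON list of tokens with line,
# column, and value. Field names and tokenization rules are assumed.
import argparse
import json
import re
import sys

# Words, runs of digits, or single punctuation characters.
TOKEN = re.compile(r"[A-Za-z]+|[0-9]+|[^\sA-Za-z0-9]")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file")
    parser.add_argument("--ignore_punctuation", action="store_true")
    parser.add_argument("--to_lower", action="store_true")
    parser.add_argument("--ignore_numbers", action="store_true")
    parser.add_argument("--ignore_newlines", action="store_true")
    args = parser.parse_args()

    tokens = []
    with open(args.input_file) as f:
        for line_num, line in enumerate(f, start=1):
            for match in TOKEN.finditer(line):
                value = match.group()
                if value.isdigit() and args.ignore_numbers:
                    continue
                if not value[0].isalnum() and args.ignore_punctuation:
                    continue
                if args.to_lower:
                    value = value.lower()
                tokens.append({"line": line_num,
                               "char": match.start() + 1,
                               "value": value})
            if line.endswith("\n") and not args.ignore_newlines:
                tokens.append({"line": line_num,
                               "char": len(line),
                               "value": "\n"})
    # An empty token list serializes as [], consistent with the
    # null -> [] change in the expected output above.
    json.dump(tokens, sys.stdout, indent=4)

if __name__ == "__main__":
    main()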
