
Commit bf7e931
[Bugfix:Plagiarism] Fix buggy plaintext tokenization (#58)

* Rewrite plaintext tokenizer in Python instead of C++
* Update tests
* Forgot minor detail
* Update tokens.json
1 parent eb8259b commit bf7e931

8 files changed: +153 -222 lines

bin/concatenate_all.py

Lines changed: 2 additions & 1 deletion
@@ -14,7 +14,8 @@
 from pathlib import Path
 
 IGNORED_FILES = [
-    ".submit.timestamp"
+    ".submit.timestamp",
+    ".user_assignment_access.json"
 ]
 
 with open(Path(__file__).resolve().parent / "lichen_config.json") as lichen_config_file:
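
For context, IGNORED_FILES is what keeps bookkeeping files out of the concatenated submissions. A minimal sketch of how such an ignore list is typically applied when walking a submission tree follows; the helper below is illustrative only, not the actual logic in bin/concatenate_all.py:

from pathlib import Path

IGNORED_FILES = [
    ".submit.timestamp",
    ".user_assignment_access.json"
]

def files_to_concatenate(submission_dir):
    # Yield every regular file in the submission, skipping bookkeeping
    # files such as .user_assignment_access.json so they are never
    # concatenated and tokenized as student work.
    for path in sorted(Path(submission_dir).rglob("*")):
        if path.is_file() and path.name not in IGNORED_FILES:
            yield path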

bin/tokenize_all.py

Lines changed: 8 additions & 4 deletions
@@ -29,11 +29,15 @@ def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
 
     if not language_token_data.get("input_as_argument"):
         my_concatenated_file = f'< {my_concatenated_file}'
-    cli_args = ' '.join(language_token_data["command_args"])\
-        if "command_args" in language_token_data else ''
 
-    command = f'{language_token_data["command_executable"]} {tokenizer} {cli_args}\
-        {my_concatenated_file} > {my_tokenized_file}'.strip()
+    if "command_args" in language_token_data:
+        cli_args = " ".join(language_token_data["command_args"])
+    else:
+        cli_args = ""
+
+    command = f"{language_token_data['command_executable']} {tokenizer} "\
+        f"{cli_args} {my_concatenated_file} > {my_tokenized_file}".strip()
+
     os.system(command)
 
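
Note the subtle bug in the old command assembly: the backslash sat inside the f-string literal, so the continuation line's leading indentation became part of the command string (and the trailing .strip() only trims the ends, not the middle). Splitting the command across two adjacent f-string literals, with the backslash outside the quotes, avoids that. A self-contained illustration of the difference:

# A backslash inside a string literal continues the string onto the next
# physical line, and that line's leading whitespace becomes part of the value:
broken = f'a\
    b'
print(repr(broken))   # 'a    b' -- the indentation leaked into the string

# Adjacent f-string literals are concatenated by the parser; the backslash
# now sits outside the quotes, so no stray whitespace is introduced:
fixed = f"a " \
        f"b"
print(repr(fixed))    # 'a b'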

install_lichen.sh

Lines changed: 1 addition & 10 deletions
@@ -49,16 +49,6 @@ fi
 mkdir -p ${lichen_installation_dir}/bin
 mkdir -p ${lichen_installation_dir}/tools/assignments
 
-#--------------------
-# plaintext tool
-pushd ${lichen_repository_dir} > /dev/null
-clang++ -I ${lichen_vendor_dir} -std=c++11 -Wall -O3 tokenizer/plaintext/plaintext_tokenizer.cpp -o ${lichen_installation_dir}/bin/plaintext_tokenizer.out
-if [ $? -ne 0 ]; then
-    echo -e "ERROR: FAILED TO BUILD PLAINTEXT TOKENIZER\n"
-    exit 1
-fi
-popd > /dev/null
-
 
 #-------------------------------------------
 # compile & install the hash comparison tool

@@ -75,6 +65,7 @@ popd > /dev/null
 
 cp ${lichen_repository_dir}/bin/* ${lichen_installation_dir}/bin/
 
+cp ${lichen_repository_dir}/tokenizer/plaintext/plaintext_tokenizer.py ${lichen_installation_dir}/bin/plaintext_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/c/c_tokenizer.py ${lichen_installation_dir}/bin/c_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/python/python_tokenizer.py ${lichen_installation_dir}/bin/python_tokenizer.py
 cp ${lichen_repository_dir}/tokenizer/java/java_tokenizer.py ${lichen_installation_dir}/bin/java_tokenizer.py
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-null
+[]

tests/unittest/tests.py

Lines changed: 5 additions & 5 deletions
@@ -23,7 +23,7 @@ def testPlaintextTokenizer(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -41,7 +41,7 @@ def testPlaintextTokenizerIgnorePunctuation(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_ignore_punctuation.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --ignore_punctuation < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --ignore_punctuation {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -59,7 +59,7 @@ def testPlaintextTokenizerToLower(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_to_lower.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --to_lower < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --to_lower {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -77,7 +77,7 @@ def testPlaintextTokenizerIgnoreNewlines(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_ignore_newlines.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --ignore_newlines < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --ignore_newlines {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

@@ -95,7 +95,7 @@ def testPlaintextTokenizerIgnoreEverything(self):
         output_file = Path(temp_dir, "output.json")
         expected_output_file = Path(test_data_dir, "tokenizer", "plaintext", "expected_output", "output_ignore_everything.json")
 
-        subprocess.check_call(f"{str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.out'))} --ignore_punctuation --to_lower --ignore_numbers --ignore_newlines < {str(input_file)} > {str(output_file)}", shell=True)
+        subprocess.check_call(f"python3 {str(Path(lichen_installation_dir, 'bin', 'plaintext_tokenizer.py'))} --ignore_punctuation --to_lower --ignore_numbers --ignore_newlines {str(input_file)} > {str(output_file)}", shell=True)
 
         with open(output_file) as file:
             actual_output = json.load(file)

tokenizer/data.json

Lines changed: 3 additions & 3 deletions
@@ -1,8 +1,8 @@
 {
     "plaintext": {
-        "tokenizer": "plaintext_tokenizer.out",
-        "command_executable": "",
-        "input_as_argument": false,
+        "tokenizer": "plaintext_tokenizer.py",
+        "command_executable": "python3",
+        "input_as_argument": true,
         "command_args": [
             "--ignore_newlines"
         ],
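
Given this entry and the updated loop in bin/tokenize_all.py, the assembled command for plaintext submissions now has roughly this shape (the file paths here are illustrative):

python3 ${lichen_installation_dir}/bin/plaintext_tokenizer.py --ignore_newlines concatenated.txt > tokens.json

Because input_as_argument is now true, the input file is passed as a positional argument rather than redirected into the process with <, matching how the updated tests invoke the new tokenizer.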

tokenizer/plaintext/plaintext_tokenizer.cpp

Lines changed: 0 additions & 198 deletions
This file was deleted.
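
The replacement Python tokenizer itself is not part of this diff, but based on the flags exercised by the tests above, a minimal sketch of a tokenizer with the same command-line shape might look like the following. The token fields and exact filtering rules here are assumptions for illustration, not the actual plaintext_tokenizer.py:

#!/usr/bin/env python3
# Sketch of a plaintext tokenizer: emits a JSON list of tokens with line,
# column, and value. Field names and tokenization rules are assumed.
import argparse
import json
import re
import sys

# Words, runs of digits, or single punctuation characters.
TOKEN = re.compile(r"[A-Za-z]+|[0-9]+|[^\sA-Za-z0-9]")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file")
    parser.add_argument("--ignore_punctuation", action="store_true")
    parser.add_argument("--to_lower", action="store_true")
    parser.add_argument("--ignore_numbers", action="store_true")
    parser.add_argument("--ignore_newlines", action="store_true")
    args = parser.parse_args()

    tokens = []
    with open(args.input_file) as f:
        for line_num, line in enumerate(f, start=1):
            for match in TOKEN.finditer(line):
                value = match.group()
                if value.isdigit() and args.ignore_numbers:
                    continue
                if not value[0].isalnum() and args.ignore_punctuation:
                    continue
                if args.to_lower:
                    value = value.lower()
                tokens.append({"line": line_num,
                               "char": match.start() + 1,
                               "value": value})
            if line.endswith("\n") and not args.ignore_newlines:
                tokens.append({"line": line_num,
                               "char": len(line),
                               "value": "\n"})
    # An empty token list serializes as [], consistent with the
    # null -> [] change in the expected output above.
    json.dump(tokens, sys.stdout, indent=4)

if __name__ == "__main__":
    main()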
