1010import json
1111import time
1212import hashlib
13+ from pathlib import Path
1314
1415
1516def parse_args ():
@@ -18,9 +19,9 @@ def parse_args():
1819 return parser .parse_args ()
1920
2021
21- def hasher (lichen_config_data , my_tokenized_file , my_hashes_file ):
22- language = lichen_config_data ["language" ]
23- sequence_length = int (lichen_config_data ["sequence_length" ])
22+ def hasher (lichen_config , lichen_run_config , my_tokenized_file , my_hashes_file ):
23+ language = lichen_run_config ["language" ]
24+ sequence_length = int (lichen_run_config ["sequence_length" ])
2425
2526 data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
2627 with open (data_json_path ) as token_data_file :
@@ -39,69 +40,76 @@ def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
3940 token_values [x :x + sequence_length ]).encode ())
4041 .hexdigest ())[0 :8 ] for x in range (0 , num - sequence_length + 1 )]
4142
43+ if len (token_hashed_values ) > lichen_config ["max_sequences_per_file" ]:
44+ token_hashed_values = token_hashed_values [slice (0 , lichen_config ["max_sequences_per_file" ])] # noqa E501
45+ print (f"File { my_hashes_file } truncated after exceeding max sequence limit" )
46+
4247 my_hf .write ('\n ' .join (token_hashed_values ))
4348
4449
4550def main ():
4651 start_time = time .time ()
4752 args = parse_args ()
4853
49- with open (os .path .join (args .basepath , "config.json" )) as lichen_config :
50- lichen_config_data = json .load (lichen_config )
54+ with open (Path (args .basepath , "config.json" )) as lichen_run_config_file :
55+ lichen_run_config = json .load (lichen_run_config_file )
56+
57+ with open (Path (__file__ ).resolve ().parent / "lichen_config.json" ) as lichen_config_file :
58+ lichen_config = json .load (lichen_config_file )
5159
5260 print ("HASH ALL..." , end = "" )
5361
5462 # ==========================================================================
5563 # walk the subdirectories of this gradeable
56- users_dir = os . path . join (args .basepath , "users" )
64+ users_dir = Path (args .basepath , "users" )
5765 if not os .path .isdir (users_dir ):
5866 raise SystemExit ("ERROR! Unable to find users directory" )
5967
6068 for user in sorted (os .listdir (users_dir )):
61- user_dir = os . path . join (users_dir , user )
69+ user_dir = Path (users_dir , user )
6270 if not os .path .isdir (user_dir ):
6371 continue
6472
6573 for version in sorted (os .listdir (user_dir )):
66- my_dir = os . path . join (user_dir , version )
74+ my_dir = Path (user_dir , version )
6775 if not os .path .isdir (my_dir ):
6876 continue
6977
70- my_tokenized_file = os . path . join (my_dir , "tokens.json" )
71- my_hashes_file = os . path . join (my_dir , "hashes.txt" )
72- hasher (lichen_config_data , my_tokenized_file , my_hashes_file )
78+ my_tokenized_file = Path (my_dir , "tokens.json" )
79+ my_hashes_file = Path (my_dir , "hashes.txt" )
80+ hasher (lichen_config , lichen_run_config , my_tokenized_file , my_hashes_file )
7381
7482 # ==========================================================================
7583 # walk the subdirectories of the other gradeables
7684
77- other_gradeables_dir = os . path . join (args .basepath , "other_gradeables" )
85+ other_gradeables_dir = Path (args .basepath , "other_gradeables" )
7886 if not os .path .isdir (other_gradeables_dir ):
7987 raise SystemExit ("ERROR! Unable to find other gradeables directory" )
8088
8189 for other_gradeable in sorted (os .listdir (other_gradeables_dir )):
82- other_gradeable_dir = os . path . join (other_gradeables_dir , other_gradeable )
90+ other_gradeable_dir = Path (other_gradeables_dir , other_gradeable )
8391 if not os .path .isdir (other_gradeable_dir ):
8492 continue
8593
8694 for other_user in sorted (os .listdir (other_gradeable_dir )):
87- other_user_dir = os . path . join (other_gradeable_dir , other_user )
95+ other_user_dir = Path (other_gradeable_dir , other_user )
8896 if not os .path .isdir (other_user_dir ):
8997 continue
9098
9199 for other_version in sorted (os .listdir (other_user_dir )):
92- other_version_dir = os . path . join (other_user_dir , other_version )
100+ other_version_dir = Path (other_user_dir , other_version )
93101 if not os .path .isdir (other_version_dir ):
94102 continue
95103
96- other_tokenized_file = os . path . join (other_version_dir , "tokens.json" )
97- other_hashes_file = os . path . join (other_version_dir , "hashes.txt" )
98- hasher (lichen_config_data , other_tokenized_file , other_hashes_file )
104+ other_tokenized_file = Path (other_version_dir , "tokens.json" )
105+ other_hashes_file = Path (other_version_dir , "hashes.txt" )
106+ hasher (lichen_config , lichen_run_config , other_tokenized_file , other_hashes_file )
99107
100108 # ==========================================================================
101109 # hash the provided code
102- provided_code_tokenized = os . path . join (args .basepath , "provided_code" , "tokens.json" )
103- provided_code_hashed = os . path . join (args .basepath , "provided_code" , "hashes.txt" )
104- hasher (lichen_config_data , provided_code_tokenized , provided_code_hashed )
110+ provided_code_tokenized = Path (args .basepath , "provided_code" , "tokens.json" )
111+ provided_code_hashed = Path (args .basepath , "provided_code" , "hashes.txt" )
112+ hasher (lichen_config , lichen_run_config , provided_code_tokenized , provided_code_hashed )
105113
106114 # ==========================================================================
107115 end_time = time .time ()
0 commit comments