14 changes: 7 additions & 7 deletions scripts/preprocess.py
@@ -36,18 +36,18 @@
   train_size = total_size - val_size - test_size

   if not args.quiet:
-    print 'Total vocabulary size: %d' % len(token_to_idx)
-    print 'Total tokens in file: %d' % total_size
-    print '  Training size: %d' % train_size
-    print '  Val size: %d' % val_size
-    print '  Test size: %d' % test_size
+    print('Total vocabulary size: %d' % len(token_to_idx))
+    print('Total tokens in file: %d' % total_size)
+    print('  Training size: %d' % train_size)
+    print('  Val size: %d' % val_size)
+    print('  Test size: %d' % test_size)

   # Choose the datatype based on the vocabulary size
   dtype = np.uint8
   if len(token_to_idx) > 255:
     dtype = np.uint32
   if not args.quiet:
-    print 'Using dtype ', dtype
+    print('Using dtype ', dtype)

   # Just load data into memory ... we'll have to do something more clever
   # for huge datasets but this should be fine for now
@@ -87,7 +87,7 @@
   # Dump a JSON file for the vocab
   json_data = {
     'token_to_idx': token_to_idx,
-    'idx_to_token': {v: k for k, v in token_to_idx.iteritems()},
+    'idx_to_token': {v: k for k, v in token_to_idx.items()},
   }
   with open(args.output_json, 'w') as f:
     json.dump(json_data, f)
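
For context, here is a minimal standalone sketch (not part of the diff) of the Python 3 behaviours this change relies on, using a small hypothetical vocabulary: print() as a function, dict.items() replacing the removed dict.iteritems(), and the same uint8/uint32 dtype choice as the first hunk.

import numpy as np

token_to_idx = {'a': 1, 'b': 2, 'c': 3}  # hypothetical toy vocabulary

# dtype selection mirrors the hunk above: uint8 is enough for up to 255 distinct tokens
dtype = np.uint8
if len(token_to_idx) > 255:
    dtype = np.uint32
print('Using dtype ', dtype)

# dict.iteritems() no longer exists in Python 3; items() is the replacement
idx_to_token = {v: k for k, v in token_to_idx.items()}
print(idx_to_token)  # {1: 'a', 2: 'b', 3: 'c'}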