-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy path4-generate_grams.sh
More file actions
28 lines (24 loc) · 805 Bytes
/
4-generate_grams.sh
File metadata and controls
28 lines (24 loc) · 805 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/usr/bin/env bash
#
# Build the unigram, bigram and trigram CSV files under ./data/grams.
#
# Run this from the directory that contains generate_unigrams.py and
# generate_grams.py: every path below is derived from the current working
# directory.
#
# Reads:  data/cleaned/<name>.<label>[.<more>...]
# Writes: data/grams/unigrams.csv
#         data/grams/bigrams-<label>.csv
#         data/grams/trigrams-<label>.csv

# Catch misspelled variable names. `set -e` / `pipefail` are deliberately left
# off: grep exits 1 when nothing matches, and one failing input file should not
# prevent the remaining files from being processed.
set -u

WORKING_DIR=$(pwd)
DATA_DIR=$WORKING_DIR/data
GRAMS_DIR=$DATA_DIR/grams
EN_CLEAN_DATA_DIR=$DATA_DIR/cleaned

#######################################
# Run generate_grams.py once for every entry in the cleaned-data directory.
# Globals:
#   WORKING_DIR        (read) directory holding generate_grams.py
#   EN_CLEAN_DATA_DIR  (read) directory whose entries are the inputs
#   GRAMS_DIR          (read) directory receiving the CSV files
# Arguments:
#   $1 - output file prefix (e.g. "bigrams")
#   $2 - value handed to generate_grams.py via -n
# Outputs:
#   One file "$GRAMS_DIR/<prefix>-<label>.csv" per input, where <label> is the
#   second dot-separated component of the input's file name (empty when the
#   name contains no dot). A warning goes to stderr if the input directory
#   does not exist.
# Returns:
#   Exit status of the last generate_grams.py run; 0 if nothing was processed.
#######################################
generate_ngrams() {
  local prefix=$1
  local order=$2
  local input name label
  local -a parts

  if [[ ! -d "$EN_CLEAN_DATA_DIR" ]]; then
    printf 'warning: input directory not found: %s\n' "$EN_CLEAN_DATA_DIR" >&2
    return 0
  fi

  # Glob instead of parsing `ls`, so names with spaces or glob characters
  # arrive intact.
  for input in "$EN_CLEAN_DATA_DIR"/*; do
    # An unmatched glob stays literal; skip it (keeps dangling symlinks).
    [[ -e "$input" || -L "$input" ]] || continue

    name=${input##*/}
    # Split only the base name on '.', without word splitting or globbing.
    IFS=. read -r -a parts <<< "$name"
    label=${parts[1]-}

    python "$WORKING_DIR/generate_grams.py" \
      -i "$input" \
      -o "$GRAMS_DIR/$prefix-$label.csv" \
      -n "$order"
  done
}

# mkdir -p already succeeds when the directory exists; no pre-check needed.
mkdir -p -- "$GRAMS_DIR"

# Generate unigrams.
# grep keeps only lines that begin with at least two ASCII letters (\+ and \?
# are GNU basic-regex extensions); sort orders the lines by everything from
# the second comma-separated field onward.
python generate_unigrams.py \
  | grep "^[a-zA-Z][a-zA-Z]\+-\?[a-z]*" \
  | sort -k 2 -t ',' > "$GRAMS_DIR/unigrams.csv"

# Generate bigrams
generate_ngrams bigrams 2

# Generate trigrams
generate_ngrams trigrams 3