File tree Expand file tree Collapse file tree 9 files changed +1167
-0
lines changed Expand file tree Collapse file tree 9 files changed +1167
-0
lines changed Original file line number Diff line number Diff line change 1+ * ~
#!/usr/bin/env bash
#
# Build the plaintext tokenizer and regenerate its sample JSON outputs.
#
# Steps:
#   1. Shallow-clone nlohmann/json into ./GIT_NLOHMANN_JSON if it is missing.
#   2. Compile tokenizer/plaintext/plaintext_tokenizer.cpp with clang++.
#   3. Run the resulting binary over tokenizer/plaintext/input.txt once per
#      option set, writing each result to an output*.json file in the
#      current directory.
#
# All paths are relative, so run this from the repository root.

# Abort on the first failing command, on unset variables, and on failures
# inside pipelines, so a failed clone/compile never leads to running a stale
# or missing binary and clobbering the JSON outputs.
set -euo pipefail

readonly src_location="."
# shellcheck disable=SC2034  # kept for parity with the original; not used yet
readonly build_location="."
readonly bin_location="./bin"

readonly nlohmann_dir="${src_location}/GIT_NLOHMANN_JSON"
readonly tokenizer_src="tokenizer/plaintext/plaintext_tokenizer.cpp"
readonly tokenizer_input="tokenizer/plaintext/input.txt"
readonly tokenizer_bin="${bin_location}/plaintext_tokenizer.out"

#######################################
# Run the tokenizer over the sample input and store its output.
# Globals:   tokenizer_bin (read), tokenizer_input (read)
# Arguments: $1 - path of the JSON file to write
#            $2... - options forwarded verbatim to the tokenizer (may be none)
# Outputs:   writes the tokenizer's stdout to $1
# Returns:   the tokenizer's exit status
#######################################
run_tokenizer() {
  local -r output_file=$1
  shift
  "${tokenizer_bin}" "$@" < "${tokenizer_input}" > "${output_file}"
}

# Fetch the header-only JSON dependency only when it is not already present.
if [[ ! -d "${nlohmann_dir}" ]]; then
  echo 'should install'
  git clone --depth 1 https://github.com/nlohmann/json.git "${nlohmann_dir}"
fi

mkdir -p -- "${bin_location}"
clang++ -I "${nlohmann_dir}/include/" -std=c++11 -Wall \
  "${tokenizer_src}" -o "${tokenizer_bin}"

run_tokenizer output.json
run_tokenizer output_ignore_newlines.json --ignore_newlines
run_tokenizer output_to_lower.json --to_lower
run_tokenizer output_ignore_punctuation.json --ignore_punctuation
run_tokenizer output_ignore_everything.json \
  --ignore_punctuation --ignore_numbers --ignore_newlines --to_lower
23+
24+
25+
Original file line number Diff line number Diff line change 1+ [
2+ {
3+ "char" : 5 ,
4+ "line" : 1 ,
5+ "type" : " string" ,
6+ "value" : " A"
7+ },
8+ {
9+ "char" : 7 ,
10+ "line" : 1 ,
11+ "type" : " string" ,
12+ "value" : " Sample"
13+ },
14+ {
15+ "char" : 14 ,
16+ "line" : 1 ,
17+ "type" : " string" ,
18+ "value" : " File"
19+ },
20+ {
21+ "char" : 18 ,
22+ "line" : 1 ,
23+ "type" : " newline" ,
24+ "value" : " \n "
25+ },
26+ {
27+ "char" : 1 ,
28+ "line" : 2 ,
29+ "type" : " newline" ,
30+ "value" : " \n "
31+ },
32+ {
33+ "char" : 1 ,
34+ "line" : 3 ,
35+ "type" : " string" ,
36+ "value" : " This"
37+ },
38+ {
39+ "char" : 6 ,
40+ "line" : 3 ,
41+ "type" : " string" ,
42+ "value" : " file"
43+ },
44+ {
45+ "char" : 11 ,
46+ "line" : 3 ,
47+ "type" : " string" ,
48+ "value" : " contains"
49+ },
50+ {
51+ "char" : 20 ,
52+ "line" : 3 ,
53+ "type" : " number" ,
54+ "value" : " 1"
55+ },
56+ {
57+ "char" : 22 ,
58+ "line" : 3 ,
59+ "type" : " punctuation" ,
60+ "value" : " \" "
61+ },
62+ {
63+ "char" : 23 ,
64+ "line" : 3 ,
65+ "type" : " string" ,
66+ "value" : " sample"
67+ },
68+ {
69+ "char" : 30 ,
70+ "line" : 3 ,
71+ "type" : " string" ,
72+ "value" : " of"
73+ },
74+ {
75+ "char" : 33 ,
76+ "line" : 3 ,
77+ "type" : " string" ,
78+ "value" : " plaintext"
79+ },
80+ {
81+ "char" : 42 ,
82+ "line" : 3 ,
83+ "type" : " punctuation" ,
84+ "value" : " \" "
85+ },
86+ {
87+ "char" : 43 ,
88+ "line" : 3 ,
89+ "type" : " punctuation" ,
90+ "value" : " ."
91+ },
92+ {
93+ "char" : 46 ,
94+ "line" : 3 ,
95+ "type" : " string" ,
96+ "value" : " We"
97+ },
98+ {
99+ "char" : 48 ,
100+ "line" : 3 ,
101+ "type" : " newline" ,
102+ "value" : " \n "
103+ },
104+ {
105+ "char" : 1 ,
106+ "line" : 4 ,
107+ "type" : " string" ,
108+ "value" : " can"
109+ },
110+ {
111+ "char" : 5 ,
112+ "line" : 4 ,
113+ "type" : " string" ,
114+ "value" : " tokenize"
115+ },
116+ {
117+ "char" : 14 ,
118+ "line" : 4 ,
119+ "type" : " string" ,
120+ "value" : " THIS"
121+ },
122+ {
123+ "char" : 18 ,
124+ "line" : 4 ,
125+ "type" : " punctuation" ,
126+ "value" : " ."
127+ },
128+ {
129+ "char" : 21 ,
130+ "line" : 4 ,
131+ "type" : " string" ,
132+ "value" : " a"
133+ },
134+ {
135+ "char" : 22 ,
136+ "line" : 4 ,
137+ "type" : " punctuation" ,
138+ "value" : " ."
139+ },
140+ {
141+ "char" : 23 ,
142+ "line" : 4 ,
143+ "type" : " string" ,
144+ "value" : " b"
145+ },
146+ {
147+ "char" : 24 ,
148+ "line" : 4 ,
149+ "type" : " punctuation" ,
150+ "value" : " ."
151+ },
152+ {
153+ "char" : 25 ,
154+ "line" : 4 ,
155+ "type" : " string" ,
156+ "value" : " c"
157+ },
158+ {
159+ "char" : 26 ,
160+ "line" : 4 ,
161+ "type" : " number" ,
162+ "value" : " 1"
163+ },
164+ {
165+ "char" : 27 ,
166+ "line" : 4 ,
167+ "type" : " string" ,
168+ "value" : " d"
169+ },
170+ {
171+ "char" : 28 ,
172+ "line" : 4 ,
173+ "type" : " number" ,
174+ "value" : " 2"
175+ },
176+ {
177+ "char" : 29 ,
178+ "line" : 4 ,
179+ "type" : " string" ,
180+ "value" : " e"
181+ },
182+ {
183+ "char" : 30 ,
184+ "line" : 4 ,
185+ "type" : " punctuation" ,
186+ "value" : " !"
187+ },
188+ {
189+ "char" : 31 ,
190+ "line" : 4 ,
191+ "type" : " newline" ,
192+ "value" : " \n "
193+ },
194+ {
195+ "char" : 1 ,
196+ "line" : 5 ,
197+ "type" : " string" ,
198+ "value" : " Good"
199+ },
200+ {
201+ "char" : 5 ,
202+ "line" : 5 ,
203+ "type" : " punctuation" ,
204+ "value" : " -"
205+ },
206+ {
207+ "char" : 6 ,
208+ "line" : 5 ,
209+ "type" : " string" ,
210+ "value" : " bye"
211+ },
212+ {
213+ "char" : 9 ,
214+ "line" : 5 ,
215+ "type" : " punctuation" ,
216+ "value" : " ."
217+ },
218+ {
219+ "char" : 10 ,
220+ "line" : 5 ,
221+ "type" : " newline" ,
222+ "value" : " \n "
223+ },
224+ {
225+ "char" : 1 ,
226+ "line" : 6 ,
227+ "type" : " newline" ,
228+ "value" : " \n "
229+ }
230+ ]
Original file line number Diff line number Diff line change 1+ [
2+ {
3+ "char" : 5 ,
4+ "line" : 1 ,
5+ "type" : " string" ,
6+ "value" : " a"
7+ },
8+ {
9+ "char" : 7 ,
10+ "line" : 1 ,
11+ "type" : " string" ,
12+ "value" : " sample"
13+ },
14+ {
15+ "char" : 14 ,
16+ "line" : 1 ,
17+ "type" : " string" ,
18+ "value" : " file"
19+ },
20+ {
21+ "char" : 1 ,
22+ "line" : 3 ,
23+ "type" : " string" ,
24+ "value" : " this"
25+ },
26+ {
27+ "char" : 6 ,
28+ "line" : 3 ,
29+ "type" : " string" ,
30+ "value" : " file"
31+ },
32+ {
33+ "char" : 11 ,
34+ "line" : 3 ,
35+ "type" : " string" ,
36+ "value" : " contains"
37+ },
38+ {
39+ "char" : 23 ,
40+ "line" : 3 ,
41+ "type" : " string" ,
42+ "value" : " sample"
43+ },
44+ {
45+ "char" : 30 ,
46+ "line" : 3 ,
47+ "type" : " string" ,
48+ "value" : " of"
49+ },
50+ {
51+ "char" : 33 ,
52+ "line" : 3 ,
53+ "type" : " string" ,
54+ "value" : " plaintext"
55+ },
56+ {
57+ "char" : 46 ,
58+ "line" : 3 ,
59+ "type" : " string" ,
60+ "value" : " we"
61+ },
62+ {
63+ "char" : 1 ,
64+ "line" : 4 ,
65+ "type" : " string" ,
66+ "value" : " can"
67+ },
68+ {
69+ "char" : 5 ,
70+ "line" : 4 ,
71+ "type" : " string" ,
72+ "value" : " tokenize"
73+ },
74+ {
75+ "char" : 14 ,
76+ "line" : 4 ,
77+ "type" : " string" ,
78+ "value" : " this"
79+ },
80+ {
81+ "char" : 21 ,
82+ "line" : 4 ,
83+ "type" : " string" ,
84+ "value" : " a"
85+ },
86+ {
87+ "char" : 23 ,
88+ "line" : 4 ,
89+ "type" : " string" ,
90+ "value" : " b"
91+ },
92+ {
93+ "char" : 25 ,
94+ "line" : 4 ,
95+ "type" : " string" ,
96+ "value" : " c"
97+ },
98+ {
99+ "char" : 27 ,
100+ "line" : 4 ,
101+ "type" : " string" ,
102+ "value" : " d"
103+ },
104+ {
105+ "char" : 29 ,
106+ "line" : 4 ,
107+ "type" : " string" ,
108+ "value" : " e"
109+ },
110+ {
111+ "char" : 1 ,
112+ "line" : 5 ,
113+ "type" : " string" ,
114+ "value" : " good"
115+ },
116+ {
117+ "char" : 6 ,
118+ "line" : 5 ,
119+ "type" : " string" ,
120+ "value" : " bye"
121+ }
122+ ]
You can’t perform that action at this time.
0 commit comments