Skip to content

Commit eb8259b

Browse files
[Refactor:Plagiarism] Increase tokenizer specificity / performance (#57)
* Improve tokenizers by including token value in type for operator tokens
* Update tests
* Remove unnecessary debugging statement
1 parent 1f1927b commit eb8259b

File tree

5 files changed

+212
-208
lines changed

5 files changed

+212
-208
lines changed

bin/tokenize_all.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ def main():
4141
start_time = time.time()
4242
args = parse_args()
4343

44-
print("TOKENIZE ALL...", end="")
44+
print("TOKENIZE ALL...", end="", flush=True)
4545

4646
with open(os.path.join(args.basepath, "config.json")) as lichen_config:
4747
lichen_config_data = json.load(lichen_config)

tests/data/tokenizer/c/expected_output/output.json

Lines changed: 33 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
{
33
"char": 1,
44
"line": 1,
5-
"type": "PUNCTUATION",
5+
"type": "PUNCTUATION-#",
66
"value": "#"
77
},
88
{
@@ -14,7 +14,7 @@
1414
{
1515
"char": 10,
1616
"line": 1,
17-
"type": "PUNCTUATION",
17+
"type": "PUNCTUATION-<",
1818
"value": "<"
1919
},
2020
{
@@ -26,7 +26,7 @@
2626
{
2727
"char": 19,
2828
"line": 1,
29-
"type": "PUNCTUATION",
29+
"type": "PUNCTUATION->",
3030
"value": ">"
3131
},
3232
{
@@ -50,7 +50,7 @@
5050
{
5151
"char": 20,
5252
"line": 2,
53-
"type": "PUNCTUATION",
53+
"type": "PUNCTUATION-;",
5454
"value": ";"
5555
},
5656
{
@@ -68,19 +68,19 @@
6868
{
6969
"char": 9,
7070
"line": 4,
71-
"type": "PUNCTUATION",
71+
"type": "PUNCTUATION-(",
7272
"value": "("
7373
},
7474
{
7575
"char": 10,
7676
"line": 4,
77-
"type": "PUNCTUATION",
77+
"type": "PUNCTUATION-)",
7878
"value": ")"
7979
},
8080
{
8181
"char": 1,
8282
"line": 5,
83-
"type": "PUNCTUATION",
83+
"type": "PUNCTUATION-{",
8484
"value": "{"
8585
},
8686
{
@@ -104,7 +104,7 @@
104104
{
105105
"char": 19,
106106
"line": 6,
107-
"type": "PUNCTUATION",
107+
"type": "PUNCTUATION-;",
108108
"value": ";"
109109
},
110110
{
@@ -134,7 +134,7 @@
134134
{
135135
"char": 34,
136136
"line": 7,
137-
"type": "PUNCTUATION",
137+
"type": "PUNCTUATION-=",
138138
"value": "="
139139
},
140140
{
@@ -146,7 +146,7 @@
146146
{
147147
"char": 37,
148148
"line": 7,
149-
"type": "PUNCTUATION",
149+
"type": "PUNCTUATION-;",
150150
"value": ";"
151151
},
152152
{
@@ -158,7 +158,7 @@
158158
{
159159
"char": 10,
160160
"line": 9,
161-
"type": "PUNCTUATION",
161+
"type": "PUNCTUATION-<<",
162162
"value": "<<"
163163
},
164164
{
@@ -170,7 +170,7 @@
170170
{
171171
"char": 41,
172172
"line": 9,
173-
"type": "PUNCTUATION",
173+
"type": "PUNCTUATION-;",
174174
"value": ";"
175175
},
176176
{
@@ -182,7 +182,7 @@
182182
{
183183
"char": 9,
184184
"line": 10,
185-
"type": "PUNCTUATION",
185+
"type": "PUNCTUATION->>",
186186
"value": ">>"
187187
},
188188
{
@@ -194,7 +194,7 @@
194194
{
195195
"char": 13,
196196
"line": 10,
197-
"type": "PUNCTUATION",
197+
"type": "PUNCTUATION-;",
198198
"value": ";"
199199
},
200200
{
@@ -206,7 +206,7 @@
206206
{
207207
"char": 8,
208208
"line": 12,
209-
"type": "PUNCTUATION",
209+
"type": "PUNCTUATION-(",
210210
"value": "("
211211
},
212212
{
@@ -224,7 +224,7 @@
224224
{
225225
"char": 15,
226226
"line": 12,
227-
"type": "PUNCTUATION",
227+
"type": "PUNCTUATION-=",
228228
"value": "="
229229
},
230230
{
@@ -236,7 +236,7 @@
236236
{
237237
"char": 18,
238238
"line": 12,
239-
"type": "PUNCTUATION",
239+
"type": "PUNCTUATION-;",
240240
"value": ";"
241241
},
242242
{
@@ -248,7 +248,7 @@
248248
{
249249
"char": 22,
250250
"line": 12,
251-
"type": "PUNCTUATION",
251+
"type": "PUNCTUATION-<=",
252252
"value": "<="
253253
},
254254
{
@@ -260,13 +260,13 @@
260260
{
261261
"char": 25,
262262
"line": 12,
263-
"type": "PUNCTUATION",
263+
"type": "PUNCTUATION-;",
264264
"value": ";"
265265
},
266266
{
267267
"char": 27,
268268
"line": 12,
269-
"type": "PUNCTUATION",
269+
"type": "PUNCTUATION-++",
270270
"value": "++"
271271
},
272272
{
@@ -278,13 +278,13 @@
278278
{
279279
"char": 30,
280280
"line": 12,
281-
"type": "PUNCTUATION",
281+
"type": "PUNCTUATION-)",
282282
"value": ")"
283283
},
284284
{
285285
"char": 5,
286286
"line": 13,
287-
"type": "PUNCTUATION",
287+
"type": "PUNCTUATION-{",
288288
"value": "{"
289289
},
290290
{
@@ -296,7 +296,7 @@
296296
{
297297
"char": 19,
298298
"line": 14,
299-
"type": "PUNCTUATION",
299+
"type": "PUNCTUATION-*=",
300300
"value": "*="
301301
},
302302
{
@@ -308,13 +308,13 @@
308308
{
309309
"char": 23,
310310
"line": 14,
311-
"type": "PUNCTUATION",
311+
"type": "PUNCTUATION-;",
312312
"value": ";"
313313
},
314314
{
315315
"char": 5,
316316
"line": 15,
317-
"type": "PUNCTUATION",
317+
"type": "PUNCTUATION-}",
318318
"value": "}"
319319
},
320320
{
@@ -326,7 +326,7 @@
326326
{
327327
"char": 10,
328328
"line": 17,
329-
"type": "PUNCTUATION",
329+
"type": "PUNCTUATION-<<",
330330
"value": "<<"
331331
},
332332
{
@@ -338,7 +338,7 @@
338338
{
339339
"char": 29,
340340
"line": 17,
341-
"type": "PUNCTUATION",
341+
"type": "PUNCTUATION-<<",
342342
"value": "<<"
343343
},
344344
{
@@ -350,7 +350,7 @@
350350
{
351351
"char": 34,
352352
"line": 17,
353-
"type": "PUNCTUATION",
353+
"type": "PUNCTUATION-<<",
354354
"value": "<<"
355355
},
356356
{
@@ -362,7 +362,7 @@
362362
{
363363
"char": 43,
364364
"line": 17,
365-
"type": "PUNCTUATION",
365+
"type": "PUNCTUATION-<<",
366366
"value": "<<"
367367
},
368368
{
@@ -374,7 +374,7 @@
374374
{
375375
"char": 55,
376376
"line": 17,
377-
"type": "PUNCTUATION",
377+
"type": "PUNCTUATION-;",
378378
"value": ";"
379379
},
380380
{
@@ -392,13 +392,13 @@
392392
{
393393
"char": 13,
394394
"line": 18,
395-
"type": "PUNCTUATION",
395+
"type": "PUNCTUATION-;",
396396
"value": ";"
397397
},
398398
{
399399
"char": 1,
400400
"line": 19,
401-
"type": "PUNCTUATION",
401+
"type": "PUNCTUATION-}",
402402
"value": "}"
403403
}
404-
]
404+
]

0 commit comments

Comments
 (0)