Skip to content

Commit 69f2f11

Browse files
committed
Update 5-26-2021
1 parent cda2e62 commit 69f2f11

File tree

11 files changed

+1912
-606
lines changed

11 files changed

+1912
-606
lines changed

notebooks/Project_CodeNet_LangClass.ipynb

Lines changed: 872 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/Project_CodeNet_MLM.ipynb

Lines changed: 815 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Example notebooks for Project CodeNet
2+
3+
This directory contains Jupyter notebooks that show how to use the Project
4+
CodeNet dataset in several example applications.

tools/spt-generator/examples/demos/c/helloworld.json

Lines changed: 77 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,122 +2,193 @@
22
{
33
"graph":{
44
"version":"1.0",
5+
"src-file":"./helloworld.c",
56
"type":"tree",
67
"directed":true,
78
"order":"bfs",
9+
"num-of-nodes":17,
10+
"num-of-edges":16,
811
"root":0,
912
"nodes":[
1013
{
1114
"id":0,
1215
"label":"##",
1316
"node-type":"Rule",
14-
"type-rule-name":"compilationUnit"
17+
"type-rule-name":"compilationUnit",
18+
"type-rule-index":82,
19+
"reserved-word-flag":false,
20+
"dfs-index":0,
21+
"depth":0
1522
},
1623
{
1724
"id":1,
1825
"label":"##",
1926
"node-type":"Rule",
20-
"type-rule-name":"functionDefinition"
27+
"type-rule-name":"functionDefinition",
28+
"type-rule-index":85,
29+
"reserved-word-flag":false,
30+
"dfs-index":1,
31+
"depth":1
2132
},
2233
{
2334
"id":2,
2435
"label":"<EOF>",
2536
"node-type":"Token",
2637
"type-rule-name":"EOF",
38+
"type-rule-index":-1,
39+
"reserved-word-flag":false,
40+
"dfs-index":16,
41+
"depth":1,
2742
"token-id":11
2843
},
2944
{
3045
"id":3,
3146
"label":"#()",
3247
"node-type":"Rule",
33-
"type-rule-name":"directDeclarator"
48+
"type-rule-name":"directDeclarator",
49+
"type-rule-index":48,
50+
"reserved-word-flag":false,
51+
"dfs-index":2,
52+
"depth":2
3453
},
3554
{
3655
"id":4,
3756
"label":"{#}",
3857
"node-type":"Rule",
39-
"type-rule-name":"compoundStatement"
58+
"type-rule-name":"compoundStatement",
59+
"type-rule-index":72,
60+
"reserved-word-flag":false,
61+
"dfs-index":6,
62+
"depth":2
4063
},
4164
{
4265
"id":5,
4366
"label":"main",
4467
"node-type":"Token",
4568
"type-rule-name":"Identifier",
69+
"type-rule-index":105,
70+
"reserved-word-flag":false,
71+
"dfs-index":3,
72+
"depth":3,
4673
"token-id":1
4774
},
4875
{
4976
"id":6,
5077
"label":"(",
5178
"node-type":"Token",
5279
"type-rule-name":"'('",
80+
"type-rule-index":59,
81+
"reserved-word-flag":true,
82+
"dfs-index":4,
83+
"depth":3,
5384
"token-id":2
5485
},
5586
{
5687
"id":7,
5788
"label":")",
5889
"node-type":"Token",
5990
"type-rule-name":"')'",
91+
"type-rule-index":60,
92+
"reserved-word-flag":true,
93+
"dfs-index":5,
94+
"depth":3,
6095
"token-id":3
6196
},
6297
{
6398
"id":8,
6499
"label":"{",
65100
"node-type":"Token",
66101
"type-rule-name":"'{'",
102+
"type-rule-index":63,
103+
"reserved-word-flag":true,
104+
"dfs-index":7,
105+
"depth":3,
67106
"token-id":4
68107
},
69108
{
70109
"id":9,
71110
"label":"#;",
72111
"node-type":"Rule",
73-
"type-rule-name":"expressionStatement"
112+
"type-rule-name":"expressionStatement",
113+
"type-rule-index":75,
114+
"reserved-word-flag":false,
115+
"dfs-index":8,
116+
"depth":3
74117
},
75118
{
76119
"id":10,
77120
"label":"}",
78121
"node-type":"Token",
79122
"type-rule-name":"'}'",
123+
"type-rule-index":64,
124+
"reserved-word-flag":true,
125+
"dfs-index":15,
126+
"depth":3,
80127
"token-id":10
81128
},
82129
{
83130
"id":11,
84131
"label":"#(#)",
85132
"node-type":"Rule",
86-
"type-rule-name":"postfixExpression"
133+
"type-rule-name":"postfixExpression",
134+
"type-rule-index":4,
135+
"reserved-word-flag":false,
136+
"dfs-index":9,
137+
"depth":4
87138
},
88139
{
89140
"id":12,
90141
"label":";",
91142
"node-type":"Token",
92143
"type-rule-name":"';'",
144+
"type-rule-index":87,
145+
"reserved-word-flag":true,
146+
"dfs-index":14,
147+
"depth":4,
93148
"token-id":9
94149
},
95150
{
96151
"id":13,
97152
"label":"printf",
98153
"node-type":"Token",
99154
"type-rule-name":"Identifier",
155+
"type-rule-index":105,
156+
"reserved-word-flag":false,
157+
"dfs-index":10,
158+
"depth":5,
100159
"token-id":5
101160
},
102161
{
103162
"id":14,
104163
"label":"(",
105164
"node-type":"Token",
106165
"type-rule-name":"'('",
166+
"type-rule-index":59,
167+
"reserved-word-flag":true,
168+
"dfs-index":11,
169+
"depth":5,
107170
"token-id":6
108171
},
109172
{
110173
"id":15,
111174
"label":"\"Hello World\"",
112175
"node-type":"Token",
113176
"type-rule-name":"StringLiteral",
177+
"type-rule-index":108,
178+
"reserved-word-flag":false,
179+
"dfs-index":12,
180+
"depth":5,
114181
"token-id":7
115182
},
116183
{
117184
"id":16,
118185
"label":")",
119186
"node-type":"Token",
120187
"type-rule-name":"')'",
188+
"type-rule-index":60,
189+
"reserved-word-flag":true,
190+
"dfs-index":13,
191+
"depth":5,
121192
"token-id":8
122193
}
123194
],

tools/tokenizer/Makefile

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,29 +12,19 @@ PROGS = tokenize antlr4tojson pytokenize jstokenize
1212
all: $(PROGS)
1313

1414
tokenize: tokenize.o
15-
tokenize.o: tokenize.c cpp_keywords.h java_keywords.h
15+
tokenize.o: tokenize.c
16+
1617
antlr4tojson: antlr4tojson.o
1718
antlr4tojson.o: antlr4tojson.c
19+
1820
pytokenize: pytokenize.o token_common.o
1921
pytokenize.o: pytokenize.c token_common.h
22+
2023
jstokenize: jstokenize.o token_common.o
2124
jstokenize.o: jstokenize.c token_common.h
2225

2326
token_common.o: token_common.c token_common.h
2427

25-
cpp_keywords.h: c++20.kw
26-
gperf -LC -Nis_cpp_keyword -Hcpp_hash -c -C -I -m1 --output-file=$@ $<
27-
28-
java_keywords.h: java.kw
29-
gperf -LC -Nis_java_keyword -Hjava_hash -c -C -I -m1 --output-file=$@ $<
30-
# must change some global names:
31-
sed -i \
32-
-e 's/TOTAL_KEYWORDS/JAVA_TOTAL_KEYWORDS/g' \
33-
-e 's/MIN_WORD_LENGTH/JAVA_MIN_WORD_LENGTH/g' \
34-
-e 's/MAX_WORD_LENGTH/JAVA_MAX_WORD_LENGTH/g' \
35-
-e 's/MIN_HASH_VALUE/JAVA_MIN_HASH_VALUE/g' \
36-
-e 's/MAX_HASH_VALUE/JAVA_MAX_HASH_VALUE/g' $@
37-
3828
.PHONY: clean
3929
clean:
4030
@-rm -f *.o

tools/tokenizer/README.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,17 +68,18 @@ A tokenizer for C/C++ (and Java) source code with output in 6 formats.
6868
Recognizes the following token classes: keyword, identifier, integer,
6969
floating, string, character, operator, and preprocessor.
7070

71-
usage: tokenize [ -1cdhjl:m:no:rsvw ] [ FILES ]
71+
usage: tokenize [ -1acdhjl:m:no:rsvw ] [ FILES ]
7272

7373
Command line options are:
74+
-a : append to output file instead of create or overwrite.
7475
-c : treat a # character as the start of a line comment.
7576
-d : print debug info to stderr; implies -v.
7677
-h : print just this text to stderr and stop.
7778
-j : assume input is Java (deprecated: use -l Java or .java).
7879
-l<lang> : specify language explicitly (C, C++, Java).
7980
-m<mode> : output mode either plain (default), csv, json, jsonl, xml, or raw.
8081
-n : output newlines as a special pseudo token.
81-
-o<file> : name for output file (instead of stdout).
82+
-o<file> : write output to this file (instead of stdout).
8283
-s : enable a special start token specifying the filename.
8384
-1 : treat all filename arguments as a continuous single input.
8485
-v : print action summary to stderr.

tools/tokenizer/c++20.kw

Lines changed: 0 additions & 95 deletions
This file was deleted.

0 commit comments

Comments
 (0)