Skip to content

Commit 077bb06

Browse files
committed
names: latest notebook used for workshop
Signed-off-by: Alexander Bezzubov <[email protected]>
1 parent 7e92d83 commit 077bb06

File tree

1 file changed

+42
-15
lines changed

1 file changed

+42
-15
lines changed

notebooks/Name suggestion.ipynb

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,15 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# Content\n",
7+
"# Function Names suggestion\n",
88
"\n",
9+
"Today we are going to show how to:\n",
910
"* Extract function definitions\n",
1011
"* Highlight names and identifiers in function\n",
11-
"* Features and labels extraction\n",
12-
"* Train BPE\n",
13-
"* Prepare train & validation dataset for training seq2seq\n",
14-
"* Train seq2seq model\n",
12+
"* Extract features and labels\n",
13+
"* Train a tokenizer (BPE)\n",
14+
"* Prepare train & validation dataset for training a seq2seq model\n",
15+
"* Train seq2seq NMT model\n",
1516
"* Prediction"
1617
]
1718
},
@@ -39,6 +40,11 @@
3940
"from os.path import join as path_join\n",
4041
"from typing import Union\n",
4142
"\n",
43+
"coloredlogs.install(level=\"WARNING\")\n",
44+
"logging.getLogger(\"matplotlib.axes._base\").setLevel(logging.INFO)\n",
45+
"warnings.filterwarnings(\"ignore\")\n",
46+
"\n",
47+
"\n",
4248
"class Files(FilesABC, Enum):\n",
4349
" FUNCTIONS = [\"functions.jsonl.bz2\"]\n",
4450
" FUNC_ID_NAME = [\"functions_identifers_names.pkl.bz2\"]\n",
@@ -60,16 +66,15 @@
6066
" SAMPLE_ENC_VAL_BODIES = [\"sample_val.bpe.src\"]\n",
6167
" SAMPLE_ENC_VAL_NAMES = [\"sample_val.bpe.tgt\"]\n",
6268
"\n",
63-
" \n",
6469
"class Dirs(DirsABC, Enum):\n",
6570
" TF_MODELS = [\"tf\", \"models\"]\n",
6671
" MODEL_RUN = [\"model\", \"run\"]\n",
6772
"\n",
68-
"run = Run(\"name-suggestion\", \"java-full\")\n",
73+
" \n",
74+
"# Uncomment this at the end to play with the larger pre-processed data\n",
75+
"# run = Run(\"name-suggestion\", \"java-full\")\n",
6976
"\n",
70-
"coloredlogs.install(level=\"WARNING\")\n",
71-
"logging.getLogger(\"matplotlib.axes._base\").setLevel(logging.INFO)\n",
72-
"warnings.filterwarnings(\"ignore\")"
77+
"run = Run(\"name-suggestion\", \"java-small\")"
7378
]
7479
},
7580
{
@@ -268,7 +273,7 @@
268273
" del(df)\n",
269274
"\n",
270275
"\n",
271-
"extract_functions_parallel(run.path(Files.FUNCTIONS))"
276+
"extract_functions_parallel(run.path(Files.FUNCTIONS), 3)"
272277
]
273278
},
274279
{
@@ -578,6 +583,15 @@
578583
" ! {cmd_gpu}"
579584
]
580585
},
586+
{
587+
"cell_type": "code",
588+
"execution_count": null,
589+
"metadata": {},
590+
"outputs": [],
591+
"source": [
592+
"!ls -la {model_dir}"
593+
]
594+
},
581595
{
582596
"cell_type": "markdown",
583597
"metadata": {},
@@ -594,8 +608,12 @@
594608
"metadata": {},
595609
"outputs": [],
596610
"source": [
597-
"# you have to specify location of pretrained model\n",
598611
"pretrained_model = None\n",
612+
"\n",
613+
"# Put your checkpoint number instead of the 0 in ckpt-0\n",
614+
"# Comment this out in order to use an already pre-trained model instead\n",
615+
"pretrained_model = \"{}/ckpt-0\".format(model_dir)\n",
616+
"\n",
599617
"if pretrained_model is None:\n",
600618
" pretrained_model = run.path(Files.MODEL_PRETRAINED)"
601619
]
@@ -630,6 +648,15 @@
630648
"! {predict_cmd}"
631649
]
632650
},
651+
{
652+
"cell_type": "code",
653+
"execution_count": null,
654+
"metadata": {},
655+
"outputs": [],
656+
"source": [
657+
"!cat {run.path(Files.ENC_VAL_NAMES_PRED)}"
658+
]
659+
},
633660
{
634661
"cell_type": "code",
635662
"execution_count": null,
@@ -681,13 +708,13 @@
681708
"# Quality\n",
682709
"\n",
683710
"This is a very simplistic baseline model, which misses a lot of context information needed to make a decision:\n",
684-
"* roles of identifiers ()\n",
711+
"* roles of identifiers\n",
685712
"* structural information \n",
686713
"* arguments to function\n",
687714
"\n",
688-
"Many more improvements were proposed recently [code2vec](https://github.com/tech-srl/code2vec), [GGNNs]().\n",
715+
"Many more improvements were proposed recently: [code2vec](https://github.com/tech-srl/code2vec), [GGNNs](), etc.\n",
689716
"\n",
690-
"For"
717+
"Check [github.com/src-d/awesome-machine-learning-on-source-code](https://github.com/src-d/awesome-machine-learning-on-source-code) to learn about state-of-the-art (SOTA) models."
691718
]
692719
}
693720
],

0 commit comments

Comments
 (0)