|
4 | 4 | "cell_type": "markdown", |
5 | 5 | "metadata": {}, |
6 | 6 | "source": [ |
7 | | - "# Content\n", |
| 7 | + "# Function Names suggestion\n", |
8 | 8 | "\n", |
| 9 | + "Today we are going to show how to:\n", |
9 | 10 | "* Extract function definitions\n", |
10 | 11 | "* Highlight names and identifiers in function\n", |
11 | | - "* Features and labels extraction\n", |
12 | | - "* Train BPE\n", |
13 | | - "* Prepare train & validation dataset for training seq2seq\n", |
14 | | - "* Train seq2seq model\n", |
| 12 | + "* Extract features and labels\n", |
| 13 | + "* Train a tokenizer (BPE)\n", |
| 14 | + "* Prepare train & validation dataset for training a seq2seq model\n", |
| 15 | + "* Train seq2seq NMT model\n", |
15 | 16 | "* Prediction" |
16 | 17 | ] |
17 | 18 | }, |
|
39 | 40 | "from os.path import join as path_join\n", |
40 | 41 | "from typing import Union\n", |
41 | 42 | "\n", |
| 43 | + "coloredlogs.install(level=\"WARNING\")\n", |
| 44 | + "logging.getLogger(\"matplotlib.axes._base\").setLevel(logging.INFO)\n", |
| 45 | + "warnings.filterwarnings(\"ignore\")\n", |
| 46 | + "\n", |
| 47 | + "\n", |
42 | 48 | "class Files(FilesABC, Enum):\n", |
43 | 49 | " FUNCTIONS = [\"functions.jsonl.bz2\"]\n", |
44 | 50 | " FUNC_ID_NAME = [\"functions_identifers_names.pkl.bz2\"]\n", |
|
60 | 66 | " SAMPLE_ENC_VAL_BODIES = [\"sample_val.bpe.src\"]\n", |
61 | 67 | " SAMPLE_ENC_VAL_NAMES = [\"sample_val.bpe.tgt\"]\n", |
62 | 68 | "\n", |
63 | | - " \n", |
64 | 69 | "class Dirs(DirsABC, Enum):\n", |
65 | 70 | " TF_MODELS = [\"tf\", \"models\"]\n", |
66 | 71 | " MODEL_RUN = [\"model\", \"run\"]\n", |
67 | 72 | "\n", |
68 | | - "run = Run(\"name-suggestion\", \"java-full\")\n", |
| 73 | + " \n", |
| 74 | + "# At the end, to play with the larger pre-processed data, uncomment this and comment out the java-small line below\n", |
| 75 | + "# run = Run(\"name-suggestion\", \"java-full\")\n", |
69 | 76 | "\n", |
70 | | - "coloredlogs.install(level=\"WARNING\")\n", |
71 | | - "logging.getLogger(\"matplotlib.axes._base\").setLevel(logging.INFO)\n", |
72 | | - "warnings.filterwarnings(\"ignore\")" |
| 77 | + "run = Run(\"name-suggestion\", \"java-small\")" |
73 | 78 | ] |
74 | 79 | }, |
75 | 80 | { |
|
268 | 273 | " del(df)\n", |
269 | 274 | "\n", |
270 | 275 | "\n", |
271 | | - "extract_functions_parallel(run.path(Files.FUNCTIONS))" |
| 276 | + "extract_functions_parallel(run.path(Files.FUNCTIONS), 3)" |
272 | 277 | ] |
273 | 278 | }, |
274 | 279 | { |
|
578 | 583 | " ! {cmd_gpu}" |
579 | 584 | ] |
580 | 585 | }, |
| 586 | + { |
| 587 | + "cell_type": "code", |
| 588 | + "execution_count": null, |
| 589 | + "metadata": {}, |
| 590 | + "outputs": [], |
| 591 | + "source": [ |
| 592 | + "!ls -la {model_dir}" |
| 593 | + ] |
| 594 | + }, |
581 | 595 | { |
582 | 596 | "cell_type": "markdown", |
583 | 597 | "metadata": {}, |
|
594 | 608 | "metadata": {}, |
595 | 609 | "outputs": [], |
596 | 610 | "source": [ |
597 | | - "# you have to specify location of pretrained model\n", |
598 | 611 | "pretrained_model = None\n", |
| 612 | + "\n", |
| 613 | + "# Put your checkpoint number instead of 0 in ckpt-0\n", |
| 614 | + "# Comment this out in order to use an already pre-trained model instead\n", |
| 615 | + "pretrained_model = \"{}/ckpt-0\".format(model_dir)\n", |
| 616 | + "\n", |
599 | 617 | "if pretrained_model is None:\n", |
600 | 618 | " pretrained_model = run.path(Files.MODEL_PRETRAINED)" |
601 | 619 | ] |
|
630 | 648 | "! {predict_cmd}" |
631 | 649 | ] |
632 | 650 | }, |
| 651 | + { |
| 652 | + "cell_type": "code", |
| 653 | + "execution_count": null, |
| 654 | + "metadata": {}, |
| 655 | + "outputs": [], |
| 656 | + "source": [ |
| 657 | + "!cat {run.path(Files.ENC_VAL_NAMES_PRED)}" |
| 658 | + ] |
| 659 | + }, |
633 | 660 | { |
634 | 661 | "cell_type": "code", |
635 | 662 | "execution_count": null, |
|
681 | 708 | "# Quality\n", |
682 | 709 | "\n", |
683 | 710 | "This is a very simplistic baseline model, which misses a lot of context information needed to make a decision:\n", |
684 | | - "* roles of identifiers ()\n", |
| 711 | + "* roles of identifiers\n", |
685 | 712 | "* structural information \n", |
686 | 713 | "* arguments to function\n", |
687 | 714 | "\n", |
688 | | - "Many more improvements were proposed recently [code2vec](https://github.com/tech-srl/code2vec), [GGNNs]().\n", |
| 715 | + "Many more improvements were proposed recently: [code2vec](https://github.com/tech-srl/code2vec), GGNNs, etc.\n", |
689 | 716 | "\n", |
690 | | - "For" |
| 717 | + "Check [github.com/src-d/awesome-machine-learning-on-source-code](https://github.com/src-d/awesome-machine-learning-on-source-code) to learn about State Of the Art (SOtA) models." |
691 | 718 | ] |
692 | 719 | } |
693 | 720 | ], |
|
0 commit comments