Skip to content

Commit e03e8ad

Browse files
authored
Add files via upload
1 parent fecde44 commit e03e8ad

31 files changed

+9494
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python
2+
# coding=utf-8
3+
#
4+
# @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou
5+
# @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved.
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
import json
19+
import os
20+
21+
def get_api_name_4_api_sign_and_desps(library_name: str, base_dir: str):
    """
    Collect every API's signature and description for one library.

    Reads ``{base_dir}/PrivateLibrary/Train/{library_name}/{library_name}_apis_doc_details.jsonl``
    (one JSON object per line) and groups the entries by API name.

    Args:
        library_name: Name of the library whose API docs should be loaded.
        base_dir: Root data directory; the train split is expected under
            ``PrivateLibrary/Train`` inside it.

    Returns:
        dict in the format:
        {
            "api_name": {
                api_path1: [api_sign1, api_desp1],
                api_path2: [api_sign2, api_desp2],
                ...
            }
            ...
        }
    """
    train_dir = os.path.join(base_dir, "PrivateLibrary", "Train")
    library_path = os.path.join(train_dir, library_name, f"{library_name}_apis_doc_details.jsonl")

    api_name_4_api_sign_and_desps = {}
    # Context manager guarantees the file is closed even if a line fails to
    # parse (the original left the handle open on any exception).
    with open(library_path, "r", encoding="utf-8") as library_apis_reader:
        for line in library_apis_reader:
            api_info = json.loads(line)
            # Each record also carries 'api_doc', 'api_parameters', 'api_returns',
            # 'api_see_also', 'api_notes', 'api_examples', ... — only
            # path/name/signature/description are needed here.
            api_path = api_info["api_path"]
            api_name = api_info["api_name"]
            api_signature = api_info["api_signature"]
            api_description = api_info["api_description"]
            # setdefault + item assignment replaces the original
            # dict(old, **new) merge; a later record with the same
            # (api_name, api_path) still wins, matching the old behavior.
            api_name_4_api_sign_and_desps.setdefault(api_name, {})[api_path] = [
                api_signature,
                api_description,
            ]
    return api_name_4_api_sign_and_desps
56+
57+
def get_all_api_info_prompt_list_by_api_name(api_name_4_api_sign_and_desps, API_NAME):
    """
    Get a dictionary of {api_path: [api_signature, api_description]} for one API name.

    The description is normalized and truncated to its first sentence so it
    can be used as a short prompt.

    Args:
        api_name_4_api_sign_and_desps: Mapping produced by
            ``get_api_name_4_api_sign_and_desps``.
        API_NAME: The API name to look up.

    Returns:
        dict mapping api_path -> [api_signature, first_sentence_of_description];
        empty dict if API_NAME is not present.
    """
    # Local import keeps the module importable without the scripts package.
    from scripts.get_libs_info_from_code import (
        normalizer_api_desp,
        get_first_sentence_from_api_desp
    )

    result_api_path_info_dict = dict()
    # Direct O(1) dict lookup replaces the original O(n) scan over all keys.
    api_path_info_dict = api_name_4_api_sign_and_desps.get(API_NAME, {})
    for api_path, api_info_list in api_path_info_dict.items():
        api_signature = api_info_list[0]
        # Keep only the first sentence of the normalized description.
        api_description = get_first_sentence_from_api_desp(
            normalizer_api_desp(api_info_list[1])
        )
        result_api_path_info_dict[api_path] = [api_signature, api_description]
    return result_api_path_info_dict
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env python
2+
# coding=utf-8
3+
#
4+
# @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou
5+
# @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved.
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
from typing import List
19+
import json
20+
import gzip
21+
import os
22+
import sys
23+
sys.path.append("..")
24+
from scripts.get_comments_from_evallibs import get_comments_from_code
25+
# remove the sys path ".." to avoid the conflict with the other scripts
26+
sys.path.remove("..")
27+
28+
def get_one_instance_by_lib_name(library_name: str, base_dir: str):
    """
    Yield evaluation instances one at a time for the given library.

    Streams ``{base_dir}/eval_datas/real_{library_name}_eval_v2.jsonl.gz``
    (one JSON object per line).

    Args:
        library_name: Name of the library whose eval file should be read.
        base_dir: Root data directory containing ``eval_datas``.

    Yields:
        dict: one JSON record per line of the gzipped jsonl file.
    """
    eval_dir = os.path.join(base_dir, "eval_datas")
    library_path = os.path.join(eval_dir, f"real_{library_name}_eval_v2.jsonl.gz")

    # "rt" mode decodes utf-8 for us; the context manager guarantees the
    # gzip handle is closed (the original generator never closed it).
    with gzip.open(library_path, "rt", encoding="utf-8") as library_reader:
        for line in library_reader:
            yield json.loads(line)
40+
41+
def get_code_and_comment_by_lib_name_and_task_id(
    library_name: str,
    query_task_id: str,
    base_dir: str
):
    """
    Get code, comments and solution for one task, identified by library and task id.

    Scans ``{base_dir}/eval_datas/real_{library_name}_eval_v3.jsonl.gz`` for the
    record whose ``task_id`` equals ``query_task_id``.

    Args:
        library_name: Name of the library whose eval file should be read.
        query_task_id: The task_id to search for.
        base_dir: Root data directory containing ``eval_datas``.

    Returns:
        [code, code_comment, solution] for the first matching record, or
        ["", "", ""] if no record matches.
    """
    eval_dir = os.path.join(base_dir, "eval_datas")
    # NOTE(review): this reads the v3 eval file while get_one_instance_by_lib_name
    # reads v2 — presumably intentional; confirm against the data layout.
    library_path = os.path.join(eval_dir, f"real_{library_name}_eval_v3.jsonl.gz")

    # The with-statement replaces the two duplicated manual close() calls in
    # the original and also closes the file if json.loads raises.
    with gzip.open(library_path, "rt", encoding="utf-8") as library_reader:
        for line in library_reader:
            line_dict = json.loads(line)
            if line_dict["task_id"] == query_task_id:
                code = line_dict["prompt"]
                solution = line_dict["canonical_solution"][0]
                code_comment = get_comments_from_code(code)
                return [code, code_comment, solution]
    return ["", "", ""]
67+
68+
69+
if __name__ == "__main__":
    # Smoke message printed only when the module is executed directly.
    print("Passed!")

apicoder/CodeGenAPI/README.md

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# APICoder - CodeGenAPI
2+
3+
Official repository for our paper ["When Language Model Meets Private Library"]().
4+
5+
---
6+
7+
## Overview
8+
9+
APIRetriever finds useful APIs for a programming problem, and then APICoder aims to generate code that solves the problem with these APIs. We use the most straightforward approach for APICoder: prompting the API information set in front of the context. Each piece of API information is in the form of `name(signature):description`. This mimics programmers properly learning the APIs before writing code that uses them.
10+
11+
<img src=https://s3.bmp.ovh/imgs/2022/09/27/3691aaf9d0421991.png width=650 />
12+
13+
Figure1: The training process of CodeGenAPI
14+
15+
## Project Directory
16+
```shell
17+
├── CodeGenAPI
18+
│   ├── APICoder
19+
│   │   ├── get_api_info_by_name.py
20+
│   │   ├── get_lib_comment_for_eval.py
21+
│   ├── apex
22+
│   ├── eval_baseline.py
23+
│   ├── eval_private.py
24+
│   ├── nl2code
25+
│   ├── requirements.txt
26+
│   ├── run_generating_codes.sh # The entry script for CodeGenAPI inference, which can generate a lot of code snippets for each programming problem.
27+
│   ├── run_evaluating_codes.sh # The entry script for evaluating the generated code snippets, and outputting the final results (pass@k).
28+
│   ├── run_private.py
29+
│   ├── run_private.sh # Implementation of CodeGenAPI training.
30+
│   └── scripts
31+
│   ├── encode_private_data.py
32+
│   ├── extract_api.py
33+
│   ├── file_utils.py
34+
│   ├── get_comments_from_evallibs.py
35+
│   ├── get_libs_info_from_code.py
36+
│   ├── make_human_in_the_loop_test_corpus.py
37+
│   ├── multiprocessing_utils.py
38+
│   ├── pycode_visitor.py
39+
│   ├── requirements.txt
40+
│   ├── run_details_apis.sh # Extracting all kinds of API information (API name, signature, description and so on) from the crawled API documentations of 35 libraries.
41+
│   ├── run_encode_private_data.sh # Encoding the private data
42+
│   ├── run_extract_apis.sh # Crawling the API documentation for 31 off-the-shelf public libraries.
43+
│   └── run_extract_details_from_apis.py
44+
```
45+
46+
## Quickstart
47+
48+
This section covers environment, data preparation, model inference, and model training.
49+
50+
### Preparation
51+
52+
1、Configuring your runtime environment
53+
54+
```
55+
$ cd PrivateLibrary/CodeGenAPI
56+
$ pip install -r requirements.txt
57+
```
58+
Besides, if you would like to use mixed precision FP16 to speed up the training, it is necessary for you to install the apex library.
59+
```
60+
git clone https://github.com/NVIDIA/apex
61+
cd apex
62+
pip install -v --no-cache-dir ./
63+
```
64+
65+
2、Preparation of pre-trained models
66+
67+
Download the pre-trained checkpoint (e.g., `CodeGenAPI-110M`) from Google Drive and place it in the corresponding folder (e.g., `CodeGenAPI/models/CodeGenAPI-110M`).
68+
69+
3、Updating the scripts according to your local path
70+
71+
- Update `run_private.sh`.
72+
- Update `run_generating_codes.sh`.
73+
- Update `run_evaluating_codes.sh`.
74+
75+
### Use CodeGenAPI or other models
76+
77+
Firstly, multiple code snippets are generated for each programming problem (`run_generating_codes.sh`). Then, the code snippets are evaluated (`run_evaluating_codes.sh`).
78+
79+
```
80+
$ bash run_generating_codes.sh
81+
$ bash run_evaluating_codes.sh
82+
```
83+
84+
### Train CodeGenAPI
85+
86+
Train CodeGenAPI by the following command based on the large-scale code corpus.
87+
88+
```
89+
$ bash run_private.sh
90+
```
91+
92+
## Experiments
93+
94+
In inference phase, we set the `temperature` to one of `[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]`, the number of samples (`NUM_SAMPLES`) to `200`, the max number of generated tokens (`MAX_TOKNES`) to `100`, and the `top_p` to `0.9`. The best number is reported across the above hyper-parameters.
95+
96+
Here are the main results:
97+
98+
![](https://s3.bmp.ovh/imgs/2022/09/27/1f28c06f5cc05bcc.png)
99+
100+
After running these numerous experiments, we drew some plausible observations and valuable insights as follows.
101+
102+
> (1) Prompting API information set is useful on private-library oriented code generation task.
103+
104+
> (2) Which is the best of the API prompt ways among Perfect, Top-$N$, and Human? As a general matter, Perfect, Human, and Top-$N$ produce progressively decreasing benefits. However, Top-$N$ is on occasion superior to Perfect, as noise exists when training the model. Also, we observe that Top-$1$,$2$ usually works better than Top-$3$,$5$ because the latter introduces more noise APIs.
105+
106+
> (3) Our continual pre-trained model does better at invoking APIs than its base model, and thus can further elevate the performance of code generation for private libraries in the majority of scenarios.
107+
108+
> (4) APIRetriever has the capability to retrieve useful APIs.
109+
110+
> (5) Involving human in the loop can further boost the performance.
111+
112+
> (6) As the $k$ in pass@$k$ grows larger, the gain brought by adding API information becomes larger.
113+
114+
> (7) It is far more challenging to generate code invoking private libraries than public ones; even large models fail to do so if we do not prompt any APIs.
115+
116+
For more explanation, please see our raw paper.
117+
118+
## Citation
119+
If you find our work useful, please cite the paper:
120+
```
121+
@inproceedings{APICoder,
122+
title={When Language Model Meets Private Library},
123+
author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang},
124+
booktitle={EMNLP findings},
125+
year={2022}
126+
}
127+
```

0 commit comments

Comments
 (0)