Skip to content

Commit e03e8ad

Browse files
authored
Add files via upload
1 parent fecde44 commit e03e8ad

31 files changed

+9494
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python
2+
# coding=utf-8
3+
#
4+
# @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou
5+
# @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved.
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
import json
19+
import os
20+
21+
def get_api_name_4_api_sign_and_desps(library_name: str, base_dir: str):
    """
    Collect every API's signature and description for one library.

    Reads ``{base_dir}/PrivateLibrary/Train/{library_name}/{library_name}_apis_doc_details.jsonl``
    (one JSON object per line) and groups the entries by API name.

    Args:
        library_name: Name of the library whose API docs should be loaded.
        base_dir: Root data directory; the train split is expected under
            ``PrivateLibrary/Train`` inside it.

    Returns:
        dict in the format:
        {
            "api_name": {
                api_path1: [api_sign1, api_desp1],
                api_path2: [api_sign2, api_desp2],
                ...
            }
            ...
        }
    """
    train_dir = os.path.join(base_dir, "PrivateLibrary", "Train")
    library_path = os.path.join(train_dir, library_name, f"{library_name}_apis_doc_details.jsonl")

    api_name_4_api_sign_and_desps = {}
    # Context manager guarantees the file is closed even if a line fails to
    # parse (the original left the handle open on any exception).
    with open(library_path, "r", encoding="utf-8") as library_apis_reader:
        for line in library_apis_reader:
            api_info = json.loads(line)
            # Each record also carries 'api_doc', 'api_parameters', 'api_returns',
            # 'api_see_also', 'api_notes', 'api_examples', ... — only
            # path/name/signature/description are needed here.
            api_path = api_info["api_path"]
            api_name = api_info["api_name"]
            api_signature = api_info["api_signature"]
            api_description = api_info["api_description"]
            # setdefault + item assignment replaces the original
            # dict(old, **new) merge; a later record with the same
            # (api_name, api_path) still wins, matching the old behavior.
            api_name_4_api_sign_and_desps.setdefault(api_name, {})[api_path] = [
                api_signature,
                api_description,
            ]
    return api_name_4_api_sign_and_desps
56+
57+
def get_all_api_info_prompt_list_by_api_name(api_name_4_api_sign_and_desps, API_NAME):
    """
    Get a dictionary of {api_path: [api_signature, api_description]} for one API name.

    The description is normalized and truncated to its first sentence so it
    can be used as a short prompt.

    Args:
        api_name_4_api_sign_and_desps: Mapping produced by
            ``get_api_name_4_api_sign_and_desps``.
        API_NAME: The API name to look up.

    Returns:
        dict mapping api_path -> [api_signature, first_sentence_of_description];
        empty dict if API_NAME is not present.
    """
    # Local import keeps the module importable without the scripts package.
    from scripts.get_libs_info_from_code import (
        normalizer_api_desp,
        get_first_sentence_from_api_desp
    )

    result_api_path_info_dict = dict()
    # Direct O(1) dict lookup replaces the original O(n) scan over all keys.
    api_path_info_dict = api_name_4_api_sign_and_desps.get(API_NAME, {})
    for api_path, api_info_list in api_path_info_dict.items():
        api_signature = api_info_list[0]
        # Keep only the first sentence of the normalized description.
        api_description = get_first_sentence_from_api_desp(
            normalizer_api_desp(api_info_list[1])
        )
        result_api_path_info_dict[api_path] = [api_signature, api_description]
    return result_api_path_info_dict
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
#!/usr/bin/env python
2+
# coding=utf-8
3+
#
4+
# @Author: Daoguang Zan, @Mentor: Bei Chen, Jian-Guang Lou
5+
# @Copyright 2022 The Microsoft Research Asia (DKI Group). All rights reserved.
6+
#
7+
# Licensed under the Apache License, Version 2.0 (the "License");
8+
# you may not use this file except in compliance with the License.
9+
# You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing, software
14+
# distributed under the License is distributed on an "AS IS" BASIS,
15+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16+
# See the License for the specific language governing permissions and
17+
# limitations under the License.
18+
from typing import List
19+
import json
20+
import gzip
21+
import os
22+
import sys
23+
sys.path.append("..")
24+
from scripts.get_comments_from_evallibs import get_comments_from_code
25+
# remove the sys path ".." to avoid the conflict with the other scripts
26+
sys.path.remove("..")
27+
28+
def get_one_instance_by_lib_name(library_name: str, base_dir: str):
    """
    Yield evaluation instances one at a time for the given library.

    Streams ``{base_dir}/eval_datas/real_{library_name}_eval_v2.jsonl.gz``
    (one JSON object per line).

    Args:
        library_name: Name of the library whose eval file should be read.
        base_dir: Root data directory containing ``eval_datas``.

    Yields:
        dict: one JSON record per line of the gzipped jsonl file.
    """
    eval_dir = os.path.join(base_dir, "eval_datas")
    library_path = os.path.join(eval_dir, f"real_{library_name}_eval_v2.jsonl.gz")

    # "rt" mode decodes utf-8 for us; the context manager guarantees the
    # gzip handle is closed (the original generator never closed it).
    with gzip.open(library_path, "rt", encoding="utf-8") as library_reader:
        for line in library_reader:
            yield json.loads(line)
40+
41+
def get_code_and_comment_by_lib_name_and_task_id(
    library_name: str,
    query_task_id: str,
    base_dir: str
):
    """
    Get code, comments and solution for one task, identified by library and task id.

    Scans ``{base_dir}/eval_datas/real_{library_name}_eval_v3.jsonl.gz`` for the
    record whose ``task_id`` equals ``query_task_id``.

    Args:
        library_name: Name of the library whose eval file should be read.
        query_task_id: The task_id to search for.
        base_dir: Root data directory containing ``eval_datas``.

    Returns:
        [code, code_comment, solution] for the first matching record, or
        ["", "", ""] if no record matches.
    """
    eval_dir = os.path.join(base_dir, "eval_datas")
    # NOTE(review): this reads the v3 eval file while get_one_instance_by_lib_name
    # reads v2 — presumably intentional; confirm against the data layout.
    library_path = os.path.join(eval_dir, f"real_{library_name}_eval_v3.jsonl.gz")

    # The with-statement replaces the two duplicated manual close() calls in
    # the original and also closes the file if json.loads raises.
    with gzip.open(library_path, "rt", encoding="utf-8") as library_reader:
        for line in library_reader:
            line_dict = json.loads(line)
            if line_dict["task_id"] == query_task_id:
                code = line_dict["prompt"]
                solution = line_dict["canonical_solution"][0]
                code_comment = get_comments_from_code(code)
                return [code, code_comment, solution]
    return ["", "", ""]
67+
68+
69+
if __name__ == "__main__":
    # Smoke message printed only when the module is executed directly.
    print("Passed!")

apicoder/CodeGenAPI/README.md

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# APICoder - CodeGenAPI
2+
3+
Official repository for our paper ["When Language Model Meets Private Library"]().
4+
5+
---
6+
7+
## Overview
8+
9+
APIRetriever finds useful APIs for a programming problem, and then APICoder aims to generate code that solves the problem with these APIs. We use the most straightforward approach for APICoder: prompting the API information set in front of the context. Each piece of API information is in the form of `name(signature):description`. This mimics programmers properly learning the APIs before writing code that uses them.
10+
11+
<img src=https://s3.bmp.ovh/imgs/2022/09/27/3691aaf9d0421991.png width=650 />
12+
13+
Figure1: The training process of CodeGenAPI
14+
15+
## Project Directory
16+
```shell
17+
├── CodeGenAPI
18+
│   ├── APICoder
19+
│   │   ├── get_api_info_by_name.py
20+
│   │   ├── get_lib_comment_for_eval.py
21+
│   ├── apex
22+
│   ├── eval_baseline.py
23+
│   ├── eval_private.py
24+
│   ├── nl2code
25+
│   ├── requirements.txt
26+
│   ├── run_generating_codes.sh # The entry script for CodeGenAPI inference, which can generate a lot of code snippets for each programming problem.
27+
│   ├── run_evaluating_codes.sh # The entry script for evaluating the generated code snippets, and outputting the final results (pass@k).
28+
│   ├── run_private.py
29+
│   ├── run_private.sh # Implementation of CodeGenAPI training.
30+
│   └── scripts
31+
│   ├── encode_private_data.py
32+
│   ├── extract_api.py
33+
│   ├── file_utils.py
34+
│   ├── get_comments_from_evallibs.py
35+
│   ├── get_libs_info_from_code.py
36+
│   ├── make_human_in_the_loop_test_corpus.py
37+
│   ├── multiprocessing_utils.py
38+
│   ├── pycode_visitor.py
39+
│   ├── requirements.txt
40+
│   ├── run_details_apis.sh # Extracting all kinds of API information (API name, signature, description and so on) from the crawled API documentations of 35 libraries.
41+
│   ├── run_encode_private_data.sh # Encoding the private data
42+
│   ├── run_extract_apis.sh # Crawling the API documentation for 31 off-the-shelf public libraries.
43+
│   └── run_extract_details_from_apis.py
44+
```
45+
46+
## Quickstart
47+
48+
This section covers environment, data preparation, model inference, and model training.
49+
50+
### Preparation
51+
52+
1、Configuring your runtime environment
53+
54+
```
55+
$ cd PrivateLibrary/CodeGenAPI
56+
$ pip install -r requirements.txt
57+
```
58+
Besides, if you would like to use mixed precision FP16 to speed up the training, it is necessary for you to install the apex library.
59+
```
60+
git clone https://github.com/NVIDIA/apex
61+
cd apex
62+
pip install -v --no-cache-dir ./
63+
```
64+
65+
2、Preparation of pre-trained models
66+
67+
Download the pre-trained checkpoint (e.g., `CodeGenAPI-110M`) from Google Drive and place it in the corresponding folder (e.g., `CodeGenAPI/models/CodeGenAPI-110M`).
68+
69+
3、Updating the scripts according to your local path
70+
71+
- Update `run_private.sh`.
72+
- Update `run_generating_codes.sh`.
73+
- Update `run_evaluating_codes.sh`.
74+
75+
### Use CodeGenAPI or other models
76+
77+
Firstly, multiple code snippets are generated for each programming problem (`run_generating_codes.sh`). Then, the code snippets are evaluated (`run_evaluating_codes.sh`).
78+
79+
```
80+
$ bash run_generating_codes.sh
81+
$ bash run_evaluating_codes.sh
82+
```
83+
84+
### Train CodeGenAPI
85+
86+
Train CodeGenAPI by the following command based on the large-scale code corpus.
87+
88+
```
89+
$ bash run_private.sh
90+
```
91+
92+
## Experiments
93+
94+
In inference phase, we set the `temperature` to one of `[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]`, the number of samples (`NUM_SAMPLES`) to `200`, the max number of generated tokens (`MAX_TOKNES`) to `100`, and the `top_p` to `0.9`. The best number is reported across the above hyper-parameters.
95+
96+
Here are the main results:
97+
98+
![](https://s3.bmp.ovh/imgs/2022/09/27/1f28c06f5cc05bcc.png)
99+
100+
After running these numerous experiments, we drew some plausible observations and valuable insights as follows.
101+
102+
> (1) Prompting API information set is useful on private-library oriented code generation task.
103+
104+
> (2) Which is the best of the API prompt ways among Perfect, Top-$N$, and Human? As a general matter, Perfect, Human, and Top-$N$ produce progressively decreasing benefits. However, Top-$N$ is on occasion superior to Perfect, as noise exists when training the model. Also, we observe that Top-$1$,$2$ usually works better than Top-$3$,$5$ because the latter introduces more noise APIs.
105+
106+
> (3) Our continual pre-trained model does better at invoking APIs than its base model, and thus can further elevate the performance of code generation for private libraries in the majority of scenarios.
107+
108+
> (4) APIRetriever has the capability to retrieve useful APIs.
109+
110+
> (5) Involving human in the loop can further boost the performance.
111+
112+
> (6) As the $k$ in pass@$k$ grows larger, the gain brought by adding API information becomes larger.
113+
114+
> (7) It is far more challenging to generate code invoking private libraries than public ones; even large models fail to do so if we do not prompt any APIs.
115+
116+
For more explanation, please see our raw paper.
117+
118+
## Citation
119+
If you find our work useful, please cite the paper:
120+
```
121+
@inproceedings{APICoder,
122+
title={When Language Model Meets Private Library},
123+
author={Zan, Daoguang and Chen, Bei and Lin, Zeqi and Guan, Bei and Wang, Yongji and Lou, Jian-Guang},
124+
booktitle={EMNLP findings},
125+
year={2022}
126+
}
127+
```

0 commit comments

Comments
 (0)