From dcd7b2212a67ac1cccdb4061b8a691847e92ba31 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Fri, 31 Jan 2020 15:29:44 +0100 Subject: [PATCH 1/7] CLI: drop PT dependency, add stdout output flag Signed-off-by: Alexander Bezzubov --- .gitignore | 2 ++ notebooks/codesearchnet-opennmt.py | 36 +++++++++++++++++++----------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index a7e73f1..641e8a2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ env.sh .mypy_cache notebooks/output notebooks/repos +.venv/ +.vscode/ diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 321f301..ab7039d 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -1,3 +1,14 @@ +""" +CLI tool for converting CodeSearchNet dataset to OpenNMT format for +function name suggestion task. + +Usage example: + wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' + unzip java.zip + python notebooks/codesearchnet-opennmt.py \ --data_dir='java/final/jsonl/valid' \ --newline='\\n' +""" from argparse import ArgumentParser, Namespace import logging from pathlib import Path @@ -5,22 +16,13 @@ from typing import List, Tuple import pandas as pd -from torch.utils.data import Dataset logging.basicConfig(level=logging.INFO) -class CodeSearchNetRAM(Dataset): - """Stores one split of CodeSearchNet data in memory - - Usage example: - wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' - unzip java.zip - python notebooks/codesearchnet-opennmt.py \ --data_dir='java/final/jsonl/valid' \ --newline='\\n' - """ +class CodeSearchNetRAM(object): + """Stores one split of CodeSearchNet data in memory""" def __init__(self, split_path: Path, newline_repl: str): super().__init__() @@ -79,8 +81,12 @@ def main(args: Namespace) -> None: for fn_name, fn_body in dataset: if not fn_name or not fn_body: continue - print(fn_body, file=s) - print(fn_name if 
args.word_level_targets else " ".join(fn_name), file=t) + tgt = fn_name if args.word_level_targets else " ".join(fn_name) + if args.print: + print(f"'{fn_name[:40]:40}' - '{tgt[:40]:40}'") + else: + print(fn_body, file=s) + print(tgt, file=t) if __name__ == "__main__": @@ -110,5 +116,9 @@ def main(args: Namespace) -> None: "--tgt_file", type=str, default="tgt-%s.txt", help="File with function texts" ) + parser.add_argument( + "--print", action="store_true", help="Print data preview to the STDOUT" + ) + args = parser.parse_args() main(args) From 16063f1d509aee11292d25aefcd4f3dc0ae9fdea Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Fri, 31 Jan 2020 16:11:03 +0100 Subject: [PATCH 2/7] CLI: add --token-level-sources option Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 40 +++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index ab7039d..17c6bef 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -27,12 +27,13 @@ class CodeSearchNetRAM(object): def __init__(self, split_path: Path, newline_repl: str): super().__init__() self.pd = pd + self.newline_repl = newline_repl files = sorted(split_path.glob("**/*.gz")) logging.info(f"Total number of files: {len(files):,}") assert files, "could not find files under %s" % split_path - columns_list = ["code", "func_name"] + columns_list = ["code", "func_name", "code_tokens"] start = time() self.pd = self._jsonl_list_to_dataframe(files, columns_list) @@ -63,10 +64,21 @@ def __getitem__(self, idx: int) -> Tuple[str, str]: # drop fn signature code = row["code"] - fn_body = code[code.find("{") + 1 : code.rfind("}")].lstrip().rstrip() - fn_body = fn_body.replace("\n", "\\n") + fn_body = ( + code[ + code.find("{", code.find(fn_name) + len(fn_name)) + 1 : code.rfind("}") + ] + .lstrip() + .rstrip() + ) + fn_body = fn_body.replace("\n", 
self.newline_repl) # fn_body_enc = self.enc.encode(fn_body) - return (fn_name, fn_body) + + tokens = row["code_tokens"] + body_tokens = tokens[tokens.index(fn_name) + 2 :] + fn_body_tokens = body_tokens[body_tokens.index("{") + 1 : len(body_tokens) - 1] + + return (fn_name, fn_body, fn_body_tokens) def __len__(self) -> int: return len(self.pd) @@ -78,14 +90,15 @@ def main(args: Namespace) -> None: with open(args.src_file % split_name, mode="w", encoding="utf8") as s, open( args.tgt_file % split_name, mode="w", encoding="utf8" ) as t: - for fn_name, fn_body in dataset: + for fn_name, fn_body, fn_body_tokens in dataset: if not fn_name or not fn_body: continue + src = " ".join(fn_body_tokens) if args.token_level_sources else fn_body tgt = fn_name if args.word_level_targets else " ".join(fn_name) if args.print: - print(f"'{fn_name[:40]:40}' - '{tgt[:40]:40}'") + print(f"'{tgt[:40]:40}' - '{src[:40]:40}'") else: - print(fn_body, file=s) + print(src, file=s) print(tgt, file=t) @@ -102,6 +115,12 @@ def main(args: Namespace) -> None: "--newline", type=str, default="\\n", help="Replace newline with this" ) + parser.add_argument( + "--token-level-sources", + action="store_true", + help="Use language-specific token sources instead of word level ones", + ) + parser.add_argument( "--word-level-targets", action="store_true", @@ -109,11 +128,14 @@ def main(args: Namespace) -> None: ) parser.add_argument( - "--src_file", type=str, default="src-%s.txt", help="File with function bodies", + "--src_file", + type=str, + default="src-%s.token", + help="File with function bodies", ) parser.add_argument( - "--tgt_file", type=str, default="tgt-%s.txt", help="File with function texts" + "--tgt_file", type=str, default="tgt-%s.token", help="File with function texts" ) parser.add_argument( From aa65cc4ad72333788e9dcc175e3fabb79bf2db64 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 12:54:44 +0100 Subject: [PATCH 3/7] cli: address code review Signed-off-by: Alexander 
Bezzubov --- .gitignore | 1 - notebooks/codesearchnet-opennmt.py | 31 ++++++++++++++---------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 641e8a2..c5ef002 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,4 @@ env.sh .mypy_cache notebooks/output notebooks/repos -.venv/ .vscode/ diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 17c6bef..c111479 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -6,7 +6,7 @@ wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' unzip java.zip python notebooks/codesearchnet-opennmt.py \ - --data_dir='java/final/jsonl/valid' \ + --data-dir='java/final/jsonl/valid' \ --newline='\\n' """ from argparse import ArgumentParser, Namespace @@ -20,8 +20,13 @@ logging.basicConfig(level=logging.INFO) +# catch SIGPIPE to make it nix CLI friendly e.g. | head +from signal import signal, SIGPIPE, SIG_DFL -class CodeSearchNetRAM(object): +signal(SIGPIPE, SIG_DFL) + + +class CodeSearchNetRAM: """Stores one split of CodeSearchNet data in memory""" def __init__(self, split_path: Path, newline_repl: str): @@ -64,13 +69,10 @@ def __getitem__(self, idx: int) -> Tuple[str, str]: # drop fn signature code = row["code"] - fn_body = ( - code[ - code.find("{", code.find(fn_name) + len(fn_name)) + 1 : code.rfind("}") - ] - .lstrip() - .rstrip() - ) + fn_body = code[ + code.find("{", code.find(fn_name) + len(fn_name)) + 1 : code.rfind("}") + ] + fn_body = fn_body.strip() fn_body = fn_body.replace("\n", self.newline_repl) # fn_body_enc = self.enc.encode(fn_body) @@ -111,9 +113,7 @@ def main(args: Namespace) -> None: help="Path to the unziped input data (CodeSearchNet)", ) - parser.add_argument( - "--newline", type=str, default="\\n", help="Replace newline with this" - ) + parser.add_argument("--newline", default="\\n", help="Replace newline with this") parser.add_argument( "--token-level-sources", @@ -128,14 
+128,11 @@ def main(args: Namespace) -> None: ) parser.add_argument( - "--src_file", - type=str, - default="src-%s.token", - help="File with function bodies", + "--src-file", default="src-%s.token", help="File with function bodies", ) parser.add_argument( - "--tgt_file", type=str, default="tgt-%s.token", help="File with function texts" + "--tgt-file", default="tgt-%s.token", help="File with function texts" ) parser.add_argument( From 6cfc883af370150fe74e9099e45be7acb7493352 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:11:03 +0100 Subject: [PATCH 4/7] cli: add --token-level-targets with dpu_utils.codeutils.identifiersplitting Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 77 +++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index c111479..fec6e32 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -86,6 +86,69 @@ def __len__(self) -> int: return len(self.pd) +# id splitting from +# https://github.com/microsoft/dpu-utils/blob/dfc44e354b57a4e2617828bdf4d76c1c4d81c021/python/dpu_utils/codeutils/identifiersplitting.py +from functools import lru_cache +from typing import List + +def split_camelcase(camel_case_identifier: str) -> List[str]: + """ + Split camelCase identifiers. 
+ """ + if not len(camel_case_identifier): + return [] + + # split into words based on adjacent cases being the same + result = [] + current = str(camel_case_identifier[0]) + prev_upper = camel_case_identifier[0].isupper() + prev_digit = camel_case_identifier[0].isdigit() + prev_special = not camel_case_identifier[0].isalnum() + for c in camel_case_identifier[1:]: + upper = c.isupper() + digit = c.isdigit() + special = not c.isalnum() + new_upper_word = upper and not prev_upper + new_digit_word = digit and not prev_digit + new_special_word = special and not prev_special + if new_digit_word or new_upper_word or new_special_word: + result.append(current) + current = c + elif not upper and prev_upper and len(current) > 1: + result.append(current[:-1]) + current = current[-1] + c + elif not digit and prev_digit: + result.append(current) + current = c + elif not special and prev_special: + result.append(current) + current = c + else: + current += c + prev_digit = digit + prev_upper = upper + prev_special = special + result.append(current) + return result + + +@lru_cache(maxsize=5000) +def split_identifier_into_parts(identifier: str) -> List[str]: + """ + Split a single identifier into parts on snake_case and camelCase + """ + snake_case = identifier.split("_") + + identifier_parts = [] # type: List[str] + for i in range(len(snake_case)): + part = snake_case[i] + if len(part) > 0: + identifier_parts.extend(s.lower() for s in split_camelcase(part)) + if len(identifier_parts) == 0: + return [identifier] + return identifier_parts + + def main(args: Namespace) -> None: dataset = CodeSearchNetRAM(Path(args.data_dir), args.newline) split_name = Path(args.data_dir).name @@ -96,7 +159,13 @@ def main(args: Namespace) -> None: if not fn_name or not fn_body: continue src = " ".join(fn_body_tokens) if args.token_level_sources else fn_body - tgt = fn_name if args.word_level_targets else " ".join(fn_name) + + if args.word_level_targets: + tgt = fn_name + elif args.token_level_targets: 
+ tgt = " ".join(split_identifier_into_parts(fn_name)) + else: + tgt = " ".join(fn_name) if args.print: print(f"'{tgt[:40]:40}' - '{src[:40]:40}'") else: @@ -121,6 +190,12 @@ def main(args: Namespace) -> None: help="Use language-specific token sources instead of word level ones", ) + parser.add_argument( + "--token-level-targets", + action="store_true", + help="Use camlCase and snake_case split token sources instead of word or char level ones", + ) + parser.add_argument( "--word-level-targets", action="store_true", From a1da1f591961b3b586f0ea8d2c482863dde13ef4 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:26:07 +0100 Subject: [PATCH 5/7] cli: switch output to .txt Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index fec6e32..89aac97 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -203,11 +203,11 @@ def main(args: Namespace) -> None: ) parser.add_argument( - "--src-file", default="src-%s.token", help="File with function bodies", + "--src-file", default="src-%s.txt", help="File with function bodies", ) parser.add_argument( - "--tgt-file", default="tgt-%s.token", help="File with function texts" + "--tgt-file", default="tgt-%s.txt", help="File with function texts" ) parser.add_argument( From 96ceb0d90f8b55c2ea8c02fe563e2d4c10a01a2f Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:27:31 +0100 Subject: [PATCH 6/7] cli: fix a typo Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 89aac97..248cd5d 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -176,7 +176,7 @@ def main(args: Namespace) -> None: if 
__name__ == "__main__": parser = ArgumentParser(add_help=False) parser.add_argument( - "--data_dir", + "--data-dir", type=str, default="java/final/jsonl/test", help="Path to the unziped input data (CodeSearchNet)", From b20d69b565b140b9f9e9aba0c2aafbd9faa90f2f Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:37:43 +0100 Subject: [PATCH 7/7] cli: error handling for tokens Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 248cd5d..75e4556 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -78,7 +78,13 @@ def __getitem__(self, idx: int) -> Tuple[str, str]: tokens = row["code_tokens"] body_tokens = tokens[tokens.index(fn_name) + 2 :] - fn_body_tokens = body_tokens[body_tokens.index("{") + 1 : len(body_tokens) - 1] + try: + fn_body_tokens = body_tokens[ + body_tokens.index("{") + 1 : len(body_tokens) - 1 + ] + except ValueError as ve: # '{' might be missing + logging.error("'%s' fn body extraction failed: %s", body_tokens, ve) + fn_body_tokens = None return (fn_name, fn_body, fn_body_tokens) @@ -91,6 +97,7 @@ def __len__(self) -> int: from functools import lru_cache from typing import List + def split_camelcase(camel_case_identifier: str) -> List[str]: """ Split camelCase identifiers. @@ -158,7 +165,13 @@ def main(args: Namespace) -> None: for fn_name, fn_body, fn_body_tokens in dataset: if not fn_name or not fn_body: continue - src = " ".join(fn_body_tokens) if args.token_level_sources else fn_body + + if args.token_level_sources: + if not fn_body_tokens: + continue + src = " ".join(fn_body_tokens).replace("\n", args.newline) + else: + src = fn_body if args.word_level_targets: tgt = fn_name