From dcd7b2212a67ac1cccdb4061b8a691847e92ba31 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Fri, 31 Jan 2020 15:29:44 +0100 Subject: [PATCH 1/7] CLI: drop PT dependency, add stdout output flag Signed-off-by: Alexander Bezzubov --- .gitignore | 2 ++ notebooks/codesearchnet-opennmt.py | 36 +++++++++++++++++++----------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index a7e73f1..641e8a2 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ env.sh .mypy_cache notebooks/output notebooks/repos +.venv/ +.vscode/ diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 321f301..ab7039d 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -1,3 +1,14 @@ +""" +CLI tool for converting CodeSearchNet dataset to OpenNMT format for +function name suggestion task. + +Usage example: + wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' + unzip java.zip + python notebooks/codesearchnet-opennmt.py \ --data_dir='java/final/jsonl/valid' \ --newline='\\n' +""" from argparse import ArgumentParser, Namespace import logging from pathlib import Path @@ -5,22 +16,13 @@ from typing import List, Tuple import pandas as pd -from torch.utils.data import Dataset logging.basicConfig(level=logging.INFO) -class CodeSearchNetRAM(Dataset): - """Stores one split of CodeSearchNet data in memory - - Usage example: - wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' - unzip java.zip - python notebooks/codesearchnet-opennmt.py \ --data_dir='java/final/jsonl/valid' \ --newline='\\n' - """ +class CodeSearchNetRAM(object): + """Stores one split of CodeSearchNet data in memory""" def __init__(self, split_path: Path, newline_repl: str): super().__init__() @@ -79,8 +81,12 @@ def main(args: Namespace) -> None: for fn_name, fn_body in dataset: if not fn_name or not fn_body: continue - print(fn_body, file=s) - print(fn_name if 
args.word_level_targets else " ".join(fn_name), file=t) + tgt = fn_name if args.word_level_targets else " ".join(fn_name) + if args.print: + print(f"'{fn_name[:40]:40}' - '{tgt[:40]:40}'") + else: + print(fn_body, file=s) + print(tgt, file=t) if __name__ == "__main__": @@ -110,5 +116,9 @@ def main(args: Namespace) -> None: "--tgt_file", type=str, default="tgt-%s.txt", help="File with function texts" ) + parser.add_argument( + "--print", action="store_true", help="Print data preview to the STDOUT" + ) + args = parser.parse_args() main(args) From 16063f1d509aee11292d25aefcd4f3dc0ae9fdea Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Fri, 31 Jan 2020 16:11:03 +0100 Subject: [PATCH 2/7] CLI: add --token-level-sources option Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 40 +++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index ab7039d..17c6bef 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -27,12 +27,13 @@ class CodeSearchNetRAM(object): def __init__(self, split_path: Path, newline_repl: str): super().__init__() self.pd = pd + self.newline_repl = newline_repl files = sorted(split_path.glob("**/*.gz")) logging.info(f"Total number of files: {len(files):,}") assert files, "could not find files under %s" % split_path - columns_list = ["code", "func_name"] + columns_list = ["code", "func_name", "code_tokens"] start = time() self.pd = self._jsonl_list_to_dataframe(files, columns_list) @@ -63,10 +64,21 @@ def __getitem__(self, idx: int) -> Tuple[str, str]: # drop fn signature code = row["code"] - fn_body = code[code.find("{") + 1 : code.rfind("}")].lstrip().rstrip() - fn_body = fn_body.replace("\n", "\\n") + fn_body = ( + code[ + code.find("{", code.find(fn_name) + len(fn_name)) + 1 : code.rfind("}") + ] + .lstrip() + .rstrip() + ) + fn_body = fn_body.replace("\n", 
self.newline_repl) # fn_body_enc = self.enc.encode(fn_body) - return (fn_name, fn_body) + + tokens = row["code_tokens"] + body_tokens = tokens[tokens.index(fn_name) + 2 :] + fn_body_tokens = body_tokens[body_tokens.index("{") + 1 : len(body_tokens) - 1] + + return (fn_name, fn_body, fn_body_tokens) def __len__(self) -> int: return len(self.pd) @@ -78,14 +90,15 @@ def main(args: Namespace) -> None: with open(args.src_file % split_name, mode="w", encoding="utf8") as s, open( args.tgt_file % split_name, mode="w", encoding="utf8" ) as t: - for fn_name, fn_body in dataset: + for fn_name, fn_body, fn_body_tokens in dataset: if not fn_name or not fn_body: continue + src = " ".join(fn_body_tokens) if args.token_level_sources else fn_body tgt = fn_name if args.word_level_targets else " ".join(fn_name) if args.print: - print(f"'{fn_name[:40]:40}' - '{tgt[:40]:40}'") + print(f"'{tgt[:40]:40}' - '{src[:40]:40}'") else: - print(fn_body, file=s) + print(src, file=s) print(tgt, file=t) @@ -102,6 +115,12 @@ def main(args: Namespace) -> None: "--newline", type=str, default="\\n", help="Replace newline with this" ) + parser.add_argument( + "--token-level-sources", + action="store_true", + help="Use language-specific token sources instead of word level ones", + ) + parser.add_argument( "--word-level-targets", action="store_true", @@ -109,11 +128,14 @@ def main(args: Namespace) -> None: ) parser.add_argument( - "--src_file", type=str, default="src-%s.txt", help="File with function bodies", + "--src_file", + type=str, + default="src-%s.token", + help="File with function bodies", ) parser.add_argument( - "--tgt_file", type=str, default="tgt-%s.txt", help="File with function texts" + "--tgt_file", type=str, default="tgt-%s.token", help="File with function texts" ) parser.add_argument( From aa65cc4ad72333788e9dcc175e3fabb79bf2db64 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 12:54:44 +0100 Subject: [PATCH 3/7] cli: address code review Signed-off-by: Alexander 
Bezzubov --- .gitignore | 1 - notebooks/codesearchnet-opennmt.py | 31 ++++++++++++++---------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/.gitignore b/.gitignore index 641e8a2..c5ef002 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,4 @@ env.sh .mypy_cache notebooks/output notebooks/repos -.venv/ .vscode/ diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 17c6bef..c111479 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -6,7 +6,7 @@ wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' unzip java.zip python notebooks/codesearchnet-opennmt.py \ - --data_dir='java/final/jsonl/valid' \ + --data-dir='java/final/jsonl/valid' \ --newline='\\n' """ from argparse import ArgumentParser, Namespace @@ -20,8 +20,13 @@ logging.basicConfig(level=logging.INFO) +# catch SIGPIPE to make it nix CLI friendly e.g. | head +from signal import signal, SIGPIPE, SIG_DFL -class CodeSearchNetRAM(object): +signal(SIGPIPE, SIG_DFL) + + +class CodeSearchNetRAM: """Stores one split of CodeSearchNet data in memory""" def __init__(self, split_path: Path, newline_repl: str): @@ -64,13 +69,10 @@ def __getitem__(self, idx: int) -> Tuple[str, str]: # drop fn signature code = row["code"] - fn_body = ( - code[ - code.find("{", code.find(fn_name) + len(fn_name)) + 1 : code.rfind("}") - ] - .lstrip() - .rstrip() - ) + fn_body = code[ + code.find("{", code.find(fn_name) + len(fn_name)) + 1 : code.rfind("}") + ] + fn_body = fn_body.strip() fn_body = fn_body.replace("\n", self.newline_repl) # fn_body_enc = self.enc.encode(fn_body) @@ -111,9 +113,7 @@ def main(args: Namespace) -> None: help="Path to the unziped input data (CodeSearchNet)", ) - parser.add_argument( - "--newline", type=str, default="\\n", help="Replace newline with this" - ) + parser.add_argument("--newline", default="\\n", help="Replace newline with this") parser.add_argument( "--token-level-sources", @@ -128,14 
+128,11 @@ def main(args: Namespace) -> None: ) parser.add_argument( - "--src_file", - type=str, - default="src-%s.token", - help="File with function bodies", + "--src-file", default="src-%s.token", help="File with function bodies", ) parser.add_argument( - "--tgt_file", type=str, default="tgt-%s.token", help="File with function texts" + "--tgt-file", default="tgt-%s.token", help="File with function texts" ) parser.add_argument( From 6cfc883af370150fe74e9099e45be7acb7493352 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:11:03 +0100 Subject: [PATCH 4/7] cli: add --token-level-targets with dpu_utils.codeutils.identifiersplitting Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 77 +++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index c111479..fec6e32 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -86,6 +86,69 @@ def __len__(self) -> int: return len(self.pd) +# id splitting from +# https://github.com/microsoft/dpu-utils/blob/dfc44e354b57a4e2617828bdf4d76c1c4d81c021/python/dpu_utils/codeutils/identifiersplitting.py +from functools import lru_cache +from typing import List + +def split_camelcase(camel_case_identifier: str) -> List[str]: + """ + Split camelCase identifiers. 
+ """ + if not len(camel_case_identifier): + return [] + + # split into words based on adjacent cases being the same + result = [] + current = str(camel_case_identifier[0]) + prev_upper = camel_case_identifier[0].isupper() + prev_digit = camel_case_identifier[0].isdigit() + prev_special = not camel_case_identifier[0].isalnum() + for c in camel_case_identifier[1:]: + upper = c.isupper() + digit = c.isdigit() + special = not c.isalnum() + new_upper_word = upper and not prev_upper + new_digit_word = digit and not prev_digit + new_special_word = special and not prev_special + if new_digit_word or new_upper_word or new_special_word: + result.append(current) + current = c + elif not upper and prev_upper and len(current) > 1: + result.append(current[:-1]) + current = current[-1] + c + elif not digit and prev_digit: + result.append(current) + current = c + elif not special and prev_special: + result.append(current) + current = c + else: + current += c + prev_digit = digit + prev_upper = upper + prev_special = special + result.append(current) + return result + + +@lru_cache(maxsize=5000) +def split_identifier_into_parts(identifier: str) -> List[str]: + """ + Split a single identifier into parts on snake_case and camelCase + """ + snake_case = identifier.split("_") + + identifier_parts = [] # type: List[str] + for i in range(len(snake_case)): + part = snake_case[i] + if len(part) > 0: + identifier_parts.extend(s.lower() for s in split_camelcase(part)) + if len(identifier_parts) == 0: + return [identifier] + return identifier_parts + + def main(args: Namespace) -> None: dataset = CodeSearchNetRAM(Path(args.data_dir), args.newline) split_name = Path(args.data_dir).name @@ -96,7 +159,13 @@ def main(args: Namespace) -> None: if not fn_name or not fn_body: continue src = " ".join(fn_body_tokens) if args.token_level_sources else fn_body - tgt = fn_name if args.word_level_targets else " ".join(fn_name) + + if args.word_level_targets: + tgt = fn_name + elif args.token_level_targets: 
+ tgt = " ".join(split_identifier_into_parts(fn_name)) + else: + tgt = " ".join(fn_name) if args.print: print(f"'{tgt[:40]:40}' - '{src[:40]:40}'") else: @@ -121,6 +190,12 @@ def main(args: Namespace) -> None: help="Use language-specific token sources instead of word level ones", ) + parser.add_argument( + "--token-level-targets", + action="store_true", + help="Use camlCase and snake_case split token sources instead of word or char level ones", + ) + parser.add_argument( "--word-level-targets", action="store_true", From a1da1f591961b3b586f0ea8d2c482863dde13ef4 Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:26:07 +0100 Subject: [PATCH 5/7] cli: switch output to .txt Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index fec6e32..89aac97 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -203,11 +203,11 @@ def main(args: Namespace) -> None: ) parser.add_argument( - "--src-file", default="src-%s.token", help="File with function bodies", + "--src-file", default="src-%s.txt", help="File with function bodies", ) parser.add_argument( - "--tgt-file", default="tgt-%s.token", help="File with function texts" + "--tgt-file", default="tgt-%s.txt", help="File with function texts" ) parser.add_argument( From 96ceb0d90f8b55c2ea8c02fe563e2d4c10a01a2f Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:27:31 +0100 Subject: [PATCH 6/7] cli: fix a typo Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 89aac97..248cd5d 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -176,7 +176,7 @@ def main(args: Namespace) -> None: if 
__name__ == "__main__": parser = ArgumentParser(add_help=False) parser.add_argument( - "--data_dir", + "--data-dir", type=str, default="java/final/jsonl/test", help="Path to the unziped input data (CodeSearchNet)", From b20d69b565b140b9f9e9aba0c2aafbd9faa90f2f Mon Sep 17 00:00:00 2001 From: Alexander Bezzubov Date: Mon, 3 Feb 2020 13:37:43 +0100 Subject: [PATCH 7/7] cli: error handling for tokens Signed-off-by: Alexander Bezzubov --- notebooks/codesearchnet-opennmt.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/notebooks/codesearchnet-opennmt.py b/notebooks/codesearchnet-opennmt.py index 248cd5d..75e4556 100644 --- a/notebooks/codesearchnet-opennmt.py +++ b/notebooks/codesearchnet-opennmt.py @@ -78,7 +78,13 @@ def __getitem__(self, idx: int) -> Tuple[str, str]: tokens = row["code_tokens"] body_tokens = tokens[tokens.index(fn_name) + 2 :] - fn_body_tokens = body_tokens[body_tokens.index("{") + 1 : len(body_tokens) - 1] + try: + fn_body_tokens = body_tokens[ + body_tokens.index("{") + 1 : len(body_tokens) - 1 + ] + except ValueError as ve: # '{' might be missing + logging.error("'%s' fn body extraction failed: %s", body_tokens, ve) + fn_body_tokens = None return (fn_name, fn_body, fn_body_tokens) @@ -91,6 +97,7 @@ def __len__(self) -> int: from functools import lru_cache from typing import List + def split_camelcase(camel_case_identifier: str) -> List[str]: """ Split camelCase identifiers. @@ -158,7 +165,13 @@ def main(args: Namespace) -> None: for fn_name, fn_body, fn_body_tokens in dataset: if not fn_name or not fn_body: continue - src = " ".join(fn_body_tokens) if args.token_level_sources else fn_body + + if args.token_level_sources: + if not fn_body_tokens: + continue + src = " ".join(fn_body_tokens).replace("\n", args.newline) + else: + src = fn_body if args.word_level_targets: tgt = fn_name