|
| 1 | +""" |
| 2 | +CLI tool for converting the CodeSearchNet dataset to OpenNMT format for |
| 3 | +the function name suggestion task. |
| 4 | +
|
| 5 | +Usage example: |
| 6 | + wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' |
| 7 | + unzip java.zip |
| 8 | + python notebooks/codesearchnet-opennmt.py \ |
| 9 | + --data_dir='java/final/jsonl/valid' \ |
| 10 | + --newline='\\n' |
| 11 | +""" |
1 | 12 | from argparse import ArgumentParser, Namespace |
2 | 13 | import logging |
3 | 14 | from pathlib import Path |
4 | 15 | from time import time |
5 | 16 | from typing import List, Tuple |
6 | 17 |
|
7 | 18 | import pandas as pd |
8 | | -from torch.utils.data import Dataset |
9 | 19 |
|
10 | 20 |
|
11 | 21 | logging.basicConfig(level=logging.INFO) |
12 | 22 |
|
13 | 23 |
|
14 | | -class CodeSearchNetRAM(Dataset): |
15 | | - """Stores one split of CodeSearchNet data in memory |
16 | | -
|
17 | | - Usage example: |
18 | | - wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip' |
19 | | - unzip java.zip |
20 | | - python notebooks/codesearchnet-opennmt.py \ |
21 | | - --data_dir='java/final/jsonl/valid' \ |
22 | | - --newline='\\n' |
23 | | - """ |
| 24 | +class CodeSearchNetRAM(object): |
| 25 | + """Stores one split of CodeSearchNet data in memory""" |
24 | 26 |
|
25 | 27 | def __init__(self, split_path: Path, newline_repl: str): |
26 | 28 | super().__init__() |
@@ -79,8 +81,12 @@ def main(args: Namespace) -> None: |
79 | 81 | for fn_name, fn_body in dataset: |
80 | 82 | if not fn_name or not fn_body: |
81 | 83 | continue |
82 | | - print(fn_body, file=s) |
83 | | - print(fn_name if args.word_level_targets else " ".join(fn_name), file=t) |
| 84 | + tgt = fn_name if args.word_level_targets else " ".join(fn_name) |
| 85 | + if args.print: |
| 86 | + print(f"'{fn_name[:40]:40}' - '{tgt[:40]:40}'") |
| 87 | + else: |
| 88 | + print(fn_body, file=s) |
| 89 | + print(tgt, file=t) |
84 | 90 |
|
85 | 91 |
|
86 | 92 | if __name__ == "__main__": |
@@ -110,5 +116,9 @@ def main(args: Namespace) -> None: |
110 | 116 | "--tgt_file", type=str, default="tgt-%s.txt", help="File with function texts" |
111 | 117 | ) |
112 | 118 |
|
| 119 | + parser.add_argument( |
| 120 | + "--print", action="store_true", help="Print data preview to the STDOUT" |
| 121 | + ) |
| 122 | + |
113 | 123 | args = parser.parse_args() |
114 | 124 | main(args) |
0 commit comments