66 wget 'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip'
77 unzip java.zip
88 python notebooks/codesearchnet-opennmt.py \
9- --data_dir ='java/final/jsonl/valid' \
9+ --data-dir ='java/final/jsonl/valid' \
1010 --newline='\\ n'
1111"""
1212from argparse import ArgumentParser , Namespace
2020
2121logging .basicConfig (level = logging .INFO )
2222
23+ # Restore the default SIGPIPE handler so the script is *nix CLI friendly, e.g. when piped into `head`
24+ from signal import signal , SIGPIPE , SIG_DFL
2325
24- class CodeSearchNetRAM (object ):
26+ signal (SIGPIPE , SIG_DFL )
27+
28+
29+ class CodeSearchNetRAM :
2530 """Stores one split of CodeSearchNet data in memory"""
2631
2732 def __init__ (self , split_path : Path , newline_repl : str ):
@@ -64,13 +69,10 @@ def __getitem__(self, idx: int) -> Tuple[str, str]:
6469
6570 # drop fn signature
6671 code = row ["code" ]
67- fn_body = (
68- code [
69- code .find ("{" , code .find (fn_name ) + len (fn_name )) + 1 : code .rfind ("}" )
70- ]
71- .lstrip ()
72- .rstrip ()
73- )
72+ fn_body = code [
73+ code .find ("{" , code .find (fn_name ) + len (fn_name )) + 1 : code .rfind ("}" )
74+ ]
75+ fn_body = fn_body .strip ()
7476 fn_body = fn_body .replace ("\n " , self .newline_repl )
7577 # fn_body_enc = self.enc.encode(fn_body)
7678
@@ -111,9 +113,7 @@ def main(args: Namespace) -> None:
111113 help = "Path to the unziped input data (CodeSearchNet)" ,
112114 )
113115
114- parser .add_argument (
115- "--newline" , type = str , default = "\\ n" , help = "Replace newline with this"
116- )
116+ parser .add_argument ("--newline" , default = "\\ n" , help = "Replace newline with this" )
117117
118118 parser .add_argument (
119119 "--token-level-sources" ,
@@ -128,14 +128,11 @@ def main(args: Namespace) -> None:
128128 )
129129
130130 parser .add_argument (
131- "--src_file" ,
132- type = str ,
133- default = "src-%s.token" ,
134- help = "File with function bodies" ,
131+ "--src-file" , default = "src-%s.token" , help = "File with function bodies" ,
135132 )
136133
137134 parser .add_argument (
138- "--tgt_file" , type = str , default = "tgt-%s.token" , help = "File with function texts"
135+ "--tgt-file" , default = "tgt-%s.token" , help = "File with function texts"
139136 )
140137
141138 parser .add_argument (
0 commit comments