     CodeLineCountFilter,
     TextLineCountFilter,
     clean_and_unify,
-    dedupe,
+    exact_dedupe,
     filter_code,
     filter_text,
+    fuzzy_dedupe,
     redact_code,
+    rm_dir,
+    semantic_dedupe,
 )

 import nemo_curator as nc
@@ -48,6 +51,7 @@

 SCRIPT_DIR_PATH = os.path.dirname(os.path.abspath(__file__))
 DATA_DIR = os.path.join(SCRIPT_DIR_PATH, "data")
+CONFIG_DIR = os.path.join(SCRIPT_DIR_PATH, "configs")


 def download_sources(
@@ -117,7 +121,6 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
         args (Any): Command-line arguments.
         jsonl_dir (str): Directory path where the JSONL files are stored.
     """
-    print("Running the curation pipeline...")
     # Initialize the Dask cluster.
     client = get_client(**ArgumentHelper.parse_client_args(args))

@@ -129,7 +132,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
                 TextLineCountFilter(), text_field="file_type_count", score_type=bool
             ),
             filter_text,
-            dedupe,
+            exact_dedupe,
         ]
     )

@@ -141,7 +144,7 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
                 CodeLineCountFilter(), text_field="file_type_count", score_type=bool
             ),
             filter_code,
-            dedupe,
+            exact_dedupe,
             redact_code,
         ]
     )
@@ -167,17 +170,54 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
         + orig_dataset_code.df["line_count"].astype(str)
     )

+    print("Executing the curation pipeline...")
     dataset_text = curation_steps_text(orig_dataset_text)
-    dataset_text = dataset_text.persist()
-
-    print(f"Original dataset length for text files: {len(orig_dataset_text.df)}")
-    print(f"After dataprep: {len(dataset_text.df)}")
-
     dataset_code = curation_steps_code(orig_dataset_code)
-    dataset_code = dataset_code.persist()

+    print(f"Original dataset length for text files: {len(orig_dataset_text.df)}")
+    print(f"After dataprep for text files: {len(dataset_text.df)}")
     print(f"Original dataset length for code files: {len(orig_dataset_code.df)}")
-    print(f"After dataprep: {len(dataset_code.df)}")
+    print(f"After dataprep length for code files: {len(dataset_code.df)}")
+
+    if args.device == "gpu":
+        print("Executing the semantic dedupe pipeline...")
+        gpu_dataset_text = DocumentDataset(dataset_text.df.to_backend("cudf"))
+        gpu_dataset_code = DocumentDataset(dataset_code.df.to_backend("cudf"))
+        sem_dedupe_config_yaml_path = os.path.join(
+            CONFIG_DIR, "text_semantic_dedupe_config.yaml"
+        )
+        CACHE_DIR = os.path.join(SCRIPT_DIR_PATH, "cache", "semantic_dedupe", "text")
+        rm_dir(CACHE_DIR)
+        duplicates = semantic_dedupe(
+            dataset=gpu_dataset_text,
+            sem_dedupe_config_yaml_path=sem_dedupe_config_yaml_path,
+            cache=CACHE_DIR,
+        )
+        unique_ids = duplicates.df.to_backend("pandas").compute()["id"]
+        semantic_dataset_text = DocumentDataset(
+            gpu_dataset_text.df[gpu_dataset_text.df.id.isin(unique_ids)]
+        )
+        print(f"After semantic dedupe for text files: {len(semantic_dataset_text.df)}")
+
+        print("Executing the fuzzy dedupe pipeline...")
+        CACHE_DIR = os.path.join(SCRIPT_DIR_PATH, "cache", "fuzzy_dedupe", "text")
+        rm_dir(CACHE_DIR)
+        fuzzy_dataset_text = fuzzy_dedupe(
+            dataset=semantic_dataset_text, cache=CACHE_DIR
+        )
+        CACHE_DIR = os.path.join(SCRIPT_DIR_PATH, "cache", "fuzzy_dedupe", "code")
+        rm_dir(CACHE_DIR)
+        fuzzy_dataset_code = fuzzy_dedupe(dataset=gpu_dataset_code, cache=CACHE_DIR)
+
+        dataset_text.df = fuzzy_dataset_text.df.to_backend("pandas")
+        dataset_code.df = fuzzy_dataset_code.df.to_backend("pandas")
+        print(f"After fuzzy dedupe for text files: {len(dataset_text.df)}")
+        print(f"After fuzzy dedupe: {len(dataset_code.df)}")
+
+    final_dataset_text = dataset_text.persist()
+    final_dataset_code = dataset_code.persist()
+
+    print("Writing the results to disk...")

     # Overwrite existing files in the curated directory.
     out_path = os.path.join(DATA_DIR, "curated")
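A note on the cache handling above: each dedupe stage gets its own directory under cache/, and rm_dir is called on it before the stage runs so that a rerun does not reuse stale intermediate results from a previous pass. rm_dir itself is imported from the tutorial's local utilities and its body is not part of this diff; a minimal sketch of what such a helper could look like (an assumption, not the tutorial's actual code) is:

import os
import shutil


def rm_dir(path: str) -> None:
    # Remove the cache directory if it exists so the next dedupe stage
    # starts from a clean state; a missing directory is simply ignored.
    if os.path.isdir(path):
        shutil.rmtree(path)
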
@@ -186,15 +226,18 @@ def run_curation_pipeline(args: Any, text_files: str, code_files: str) -> None:
         shutil.rmtree(out_path)

     os.makedirs(out_path)
-    dataset_text.to_json(out_path, write_to_filename=True)
-    dataset_code.to_json(out_path, write_to_filename=True)
+    final_dataset_text.to_json(out_path, write_to_filename=True)
+    final_dataset_code.to_json(out_path, write_to_filename=True)
+
+    print("Writing results to disk completed")

     # Split the dataset by file category and save curated files (optional - to create blended datasets)
+    print("Split dataset by metadata")
     separated_data_text = separate_by_metadata(
-        dataset_text.df, out_path, "category"
+        final_dataset_text.df, out_path, "category"
     ).compute()
     separated_data_code = separate_by_metadata(
-        dataset_code.df, out_path, "category"
+        final_dataset_code.df, out_path, "category"
     ).compute()

     client.close()
@@ -239,6 +282,7 @@ def main():
     # Download all the sources and get the list of text and code files.
     text_files, code_files = download_sources(100, 100, 100)
     run_curation_pipeline(args, text_files, code_files)
+    print("Data Curation completed")

     # blend and shuffle datasets
     root_path = os.path.join(DATA_DIR, "curated")
@@ -250,7 +294,9 @@ def main():
     ]
     dataset_weights = [1.0, 4.0, 4.0, 1.0]
     target_size = 20
+
     blend_and_shuffle(args, dataset_paths, dataset_weights, target_size)
+    print("Data Blending completed")


 if __name__ == "__main__":
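main() finishes by blending the curated category splits with sampling weights [1.0, 4.0, 4.0, 1.0] and a target size of 20 documents, then shuffling the result. blend_and_shuffle is another helper from the tutorial's utilities and is not shown in this diff; a rough sketch of how such a helper might wrap NeMo Curator's blending and shuffling modules (the output path and seed below are illustrative assumptions) is:

import nemo_curator as nc
from nemo_curator.datasets import DocumentDataset


def blend_and_shuffle(args, dataset_paths, dataset_weights, target_size):
    # args mirrors the call in main() but is not needed in this sketch.
    # Read each curated split, draw from them according to the sampling
    # weights until roughly target_size documents are collected, shuffle,
    # and write the blended dataset back out as JSONL.
    datasets = [DocumentDataset.read_json(path) for path in dataset_paths]
    blended = nc.blend_datasets(target_size, datasets, dataset_weights)
    shuffled = nc.Shuffle(seed=42)(blended)
    shuffled.to_json("data/curated/blended", write_to_filename=False)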