@@ -55,10 +55,16 @@ datasets:
5555 formats :
5656 - type : " tar"
5757 path : " {datasets_path}/arxiv_downloads"
58- - name : " fasttext_model "
58+ - name : " fasttext_langid_model "
5959 formats :
6060 - type : " bin"
6161 path : " {model_weights_path}/fasttext/lid.176.bin"
62+ - type : " ftz"
63+ path : " {model_weights_path}/fasttext/lid.176.ftz"
64+ - name : " fasttext_quality_model"
65+ formats :
66+ - type : " bin"
67+ path : " {model_weights_path}/fasttext/model.bin"
6268 - name : " gretel_symptoms"
6369 formats :
6470 - type : " jsonl"
7379# experiment: ray-curator-common-crawl
7480 - name : slack
7581 enabled : true
76- webhook_url : ${SLACK_WEBHOOK_URL}
82+ live_updates : true
83+ channel_id : ${SLACK_CHANNEL_ID}
7784 default_metrics : ["exec_time_s"]
7885# - name: gdrive
7986# enabled: false
@@ -411,6 +418,52 @@ entries:
411418 - metric : throughput_docs_per_sec
412419 min_value : 8500
413420
421+ - name : fasttext_filter_raydata
422+ enabled : true
423+ script : fasttext_filter_benchmark.py
424+ args : >-
425+ --benchmark-results-path={session_entry_dir}
426+ --output-path={session_entry_dir}/scratch/output
427+ --executor=ray_data
428+ --input-path={dataset:tinystories,parquet}
429+ --yaml-config={curator_repo_dir}/nemo_curator/config/text/fasttext_filter_pipeline.yaml
430+ --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
431+ --fasttext-quality-model-path={dataset:fasttext_quality_model,bin}
432+ --overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
433+ timeout_s : 200
434+ sink_data :
435+ - name : slack
436+ additional_metrics :
437+ - num_kept_documents
438+ - throughput_docs_per_sec
439+ ray :
440+ num_cpus : 64
441+ num_gpus : 0
442+ enable_object_spilling : false
443+
444+ - name : fasttext_filter_xenna
445+ enabled : true
446+ script : fasttext_filter_benchmark.py
447+ args : >-
448+ --benchmark-results-path={session_entry_dir}
449+ --output-path={session_entry_dir}/scratch/output
450+ --executor=xenna
451+ --input-path={dataset:tinystories,parquet}
452+ --yaml-config={curator_repo_dir}/nemo_curator/config/text/fasttext_filter_pipeline.yaml
453+ --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
454+ --fasttext-quality-model-path={dataset:fasttext_quality_model,bin}
455+ --overrides="stages.0._target_=nemo_curator.stages.text.io.reader.ParquetReader"
456+ timeout_s : 100
457+ sink_data :
458+ - name : slack
459+ additional_metrics :
460+ - num_kept_documents
461+ - throughput_docs_per_sec
462+ ray :
463+ num_cpus : 64
464+ num_gpus : 0
465+ enable_object_spilling : false
466+
414467 - name : modifier_raydata
415468 enabled : true
416469 script : modifier_benchmark.py
@@ -493,7 +546,7 @@ entries:
493546 --benchmark-results-path={session_entry_dir}
494547 --tar-input-path={dataset:arxiv_downloads,tar}
495548 --output-path={session_entry_dir}/scratch/output
496- --fasttext-model-path={dataset:fasttext_model,bin}
549+ --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
497550 --executor=ray_data
498551 timeout_s : 3600
499552 sink_data :
@@ -522,7 +575,7 @@ entries:
522575 --benchmark-results-path={session_entry_dir}
523576 --tar-input-path={dataset:arxiv_downloads,tar}
524577 --output-path={session_entry_dir}/scratch/output
525- --fasttext-model-path={dataset:fasttext_model,bin}
578+ --fasttext-langid-model-path={dataset:fasttext_langid_model,bin}
526579 --executor=xenna
527580 timeout_s : 3600
528581 sink_data :
0 commit comments