 # Paths can be used in other values in this file by using their placeholder
 # (e.g. {datasets_path}/my/test/dataset.parquet) and will be resolved to the
 # appropriate path at runtime.
-results_path: /raid/curator-team/nightly/results
-artifacts_path: /raid/curator-team/nightly/artifacts
-datasets_path: /raid
+results_path: /path/where/results/are/stored
+artifacts_path: /path/where/artifacts/are/stored
+datasets_path: /path/to/datasets

 datasets:
-  - name: "tinystories_train"
+  - name: "tinystories"
     formats:
       - type: "parquet"
-        path: "{datasets_path}/prospector-lm/clean/tinystories_train_parquet"
+        path: "{datasets_path}/tinystories/parquet_data"
+      - type: "jsonl"
+        path: "{datasets_path}/tinystories/jsonl_data"
+  - name: "commoncrawl"
+    formats:
+      - type: "jsonl"
+        path: "{datasets_path}/commoncrawl/jsonl_data"
+  - name: "commoncrawl_id_map"
+    formats:
+      - type: "json"
+        path: "{datasets_path}/commoncrawl/id_generator.json"
+  - name: "commoncrawl_ids"
+    formats:
+      - type: "parquet"
+        path: "{datasets_path}/commoncrawl/IDs/parquet_data"
+

 default_timeout_s: 7200

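Both placeholder styles above — plain `{datasets_path}` references and `{dataset:name,format}` lookups into the `datasets` list — are resolved by the benchmark harness at runtime. The resolver itself is not part of this diff; the sketch below shows one way such substitution could work (the function name, config shape, and recursion behavior are assumptions for illustration, not the repo's actual API):

```python
import re

def resolve_placeholders(value: str, config: dict) -> str:
    """Expand {dataset:name,format} and simple {key} tokens in a config value."""

    def dataset_ref(match: re.Match) -> str:
        # {dataset:name,format} -> path of the matching format entry for that dataset
        name, fmt = match.group(1), match.group(2)
        for ds in config["datasets"]:
            if ds["name"] == name:
                for f in ds["formats"]:
                    if f["type"] == fmt:
                        # Recurse so {datasets_path} inside the path expands too
                        return resolve_placeholders(f["path"], config)
        raise KeyError(f"no dataset {name!r} with format {fmt!r}")

    value = re.sub(r"\{dataset:(\w+),(\w+)\}", dataset_ref, value)
    # Simple {key} tokens resolve to top-level config values (e.g. {datasets_path});
    # unknown keys are left untouched.
    return re.sub(r"\{(\w+)\}", lambda m: str(config.get(m.group(1), m.group(0))), value)
```

With the values above, `{dataset:tinystories,parquet}` would expand to `/path/to/datasets/tinystories/parquet_data`.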
@@ -42,7 +57,7 @@ entries:
     script: domain_classification_benchmark.py
     args: >-
       --executor=ray_data
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
       --dataset-size-gb=10
       --model-inference-batch-size=1024
     timeout_s: 20000
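A note on the `args: >-` syntax used throughout: `>-` is YAML's folded block scalar, so each multi-line flag list loads as a single space-joined string with no trailing newline, ready to hand to the script. A quick check with PyYAML (assuming a standard YAML loader; the harness's actual loader isn't shown in this diff):

```python
import yaml

snippet = """
args: >-
  --executor=ray_data
  --input-path={dataset:tinystories,parquet}
  --dataset-size-gb=10
"""
print(yaml.safe_load(snippet)["args"])
# --executor=ray_data --input-path={dataset:tinystories,parquet} --dataset-size-gb=10
```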
@@ -54,17 +69,17 @@ entries:
       num_cpus: 64
       num_gpus: 4
       enable_object_spilling: false
-    # Optional: Requirements for the benchmark to pass. These will result in the benchmark being marked as failed if not met.
+    # Additional requirements for the benchmark to pass. These will result in the benchmark being marked as failed if not met.
     requirements:
       - metric: throughput_docs_per_sec
         min_value: 0.2

   - name: domain_classification_xenna
-    enabled: false
+    enabled: true
     script: domain_classification_benchmark.py
     args: >-
       --executor=xenna
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
       --dataset-size-gb=10
       --model-inference-batch-size=1024
     timeout_s: 20000
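The `requirements` block above fails a benchmark whose metric misses its floor; per the comment this diff removes further down, metrics are expected in the `metrics.json` file each script generates. The enforcement code is not part of this diff — a minimal sketch, assuming a flat metrics file keyed by metric name:

```python
import json

def check_requirements(metrics_path: str, requirements: list[dict]) -> list[str]:
    """Return one failure message per requirement whose metric is missing or too low."""
    with open(metrics_path) as f:
        metrics = json.load(f)
    failures = []
    for req in requirements:
        value = metrics.get(req["metric"])
        if value is None or value < req["min_value"]:
            failures.append(f"{req['metric']}={value} below min_value={req['min_value']}")
    return failures
```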
@@ -74,65 +89,79 @@ entries:
     script: embedding_generation_benchmark.py
     args: >-
       --executor=ray_data
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
       --dataset-size-gb=10
       --model-identifier=sentence-transformers/all-MiniLM-L6-v2
       --model-inference-batch-size=1024
     timeout_s: 20000
     sink_data:
       - name: slack
-        # Additional metrics to include in the Slack report. These must be present in the metrics.json file generated by the script.
         additional_metrics: ["num_documents_processed", "throughput_docs_per_sec"]
     ray:
       num_cpus: 64
       num_gpus: 4
       enable_object_spilling: false

   - name: embedding_generation_xenna
-    enabled: false
+    enabled: true
     script: embedding_generation_benchmark.py
     args: >-
       --executor=xenna
-      --input-path={dataset:tinystories_train,parquet}
+      --input-path={dataset:tinystories,parquet}
       --dataset-size-gb=10
       --model-identifier=sentence-transformers/all-MiniLM-L6-v2
       --model-inference-batch-size=1024
     timeout_s: 20000

-  - name: removal_raydata
-    enabled: false
-    script: removal_benchmark.py
+  - name: fuzzy_dedup_identification
+    enabled: true
+    script: fuzzy_dedup_identification_benchmark.py
     args: >-
-      --executor=ray_data
-      --input-path={dataset:tinystories_train,parquet}
-      --ids-to-remove-path=some_path
-      --id-generator-path=some_path
+      --input-path={dataset:commoncrawl,jsonl}
+      --cache-path={session_entry_dir}/scratch/cache
+      --output-path={session_entry_dir}/output
+      --input-filetype=jsonl
+      --bands-per-iteration=20
+      --text-field=text
+      --input-blocksize=1.5GiB
+    timeout_s: 20000
+    ray:
+      num_cpus: 64
+      num_gpus: 4
+      enable_object_spilling: false
+
+  - name: dedup_removal_raydata
+    enabled: true
+    script: dedup_removal_benchmark.py
+    args: >-
+      --input-path={dataset:commoncrawl,jsonl}
+      --id-generator-path={dataset:commoncrawl_id_map,json}
+      --ids-to-remove-path={dataset:commoncrawl_ids,parquet}
       --output-path={session_entry_dir}/scratch/output
-      --input-filetype=parquet
-      --input-fields=id,text
-      --input-id-field=CURATOR_DEDUP_ID_STR
-      --input-files-per-partition=1
-      --ids-to-remove-fields=id
+      --executor=ray_data
+      --input-filetype=jsonl
       --output-filetype=parquet
+      --id-field=_curator_dedup_id
+      --duplicate-id-field=_curator_dedup_id
+      --blocksize=1.5GiB
     timeout_s: 20000
     ray:
       num_cpus: 64
       num_gpus: 4
       enable_object_spilling: false

-  - name: removal_xenna
-    enabled: false
-    script: removal_benchmark.py
+  - name: dedup_removal_xenna
+    enabled: true
+    script: dedup_removal_benchmark.py
     args: >-
-      --executor=xenna
-      --input-path={dataset:tinystories_train,parquet}
-      --ids-to-remove-path=some_path
-      --id-generator-path=some_path
+      --input-path={dataset:commoncrawl,jsonl}
+      --id-generator-path={dataset:commoncrawl_id_map,json}
+      --ids-to-remove-path={dataset:commoncrawl_ids,parquet}
       --output-path={session_entry_dir}/scratch/output
-      --input-filetype=parquet
-      --input-fields=id,text
-      --input-id-field=CURATOR_DEDUP_ID_STR
-      --input-files-per-partition=1
-      --ids-to-remove-fields=id
+      --executor=xenna
+      --input-filetype=jsonl
       --output-filetype=parquet
+      --id-field=_curator_dedup_id
+      --duplicate-id-field=_curator_dedup_id
+      --blocksize=1.5GiB
     timeout_s: 20000
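Putting the entry fields together, a rough sketch of how an enabled entry might be launched — `script` plus the resolved `args` string, bounded by `timeout_s` with a fallback to `default_timeout_s`. The real harness presumably also handles Ray setup, sinks, and metrics collection, and `resolve_placeholders` is the hypothetical helper sketched earlier, so treat this as illustration only:

```python
import shlex
import subprocess

def run_entry(entry: dict, config: dict) -> int:
    """Run one benchmark entry's script with resolved args; nonzero means failure."""
    args = resolve_placeholders(entry["args"], config)
    cmd = ["python", entry["script"], *shlex.split(args)]
    timeout = entry.get("timeout_s", config.get("default_timeout_s", 7200))
    try:
        return subprocess.run(cmd, timeout=timeout).returncode
    except subprocess.TimeoutExpired:
        return 1  # treat a timeout as a failed run
```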