Skip to content

Commit 3eee423

Browse files
committed
Added new domains — Translation and Golden — and also updated the new domains in the band section
1 parent d349b18 commit 3eee423

File tree

7 files changed

+25
-32
lines changed

7 files changed

+25
-32
lines changed

experiments/3_coreset_engineering/coreset_engine_v5/config/curriculum.yaml

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ difficulty_system:
108108
name: "Nursery"
109109
intent: "Surface language acquisition"
110110
allowed_modalities: ["general_text"]
111-
allowed_domains: ["web", "social", "qa"]
111+
allowed_domains: ["web", "social", "qa", "education", "language_literacy", "conversation", "translation","golden"]
112112
constraints:
113113
tokenizer:
114114
avg_max: 5000
@@ -123,7 +123,7 @@ difficulty_system:
123123
name: "Primary"
124124
intent: "Fluent everyday language"
125125
allowed_modalities: ["general_text", "clean_exposition"]
126-
allowed_domains: ["web", "encyclopedia", "news", "social", "qa"]
126+
allowed_domains: ["web","encyclopedia","news","social","qa","education","language_literacy","conversation","translation","golden"]
127127
constraints:
128128
tokenizer:
129129
avg_max: 10000
@@ -138,8 +138,7 @@ difficulty_system:
138138
name: "HighSchool"
139139
intent: "Structured knowledge without explicit reasoning"
140140
allowed_modalities: ["general_text", "structured_knowledge"]
141-
allowed_domains:
142-
["encyclopedia", "news", "education", "literature", "web", "qa"]
141+
allowed_domains: ["encyclopedia", "news","education", "literature","web","qa", "conversation","translation","golden"]
143142
constraints:
144143
tokenizer:
145144
avg_max: 20000
@@ -154,7 +153,7 @@ difficulty_system:
154153
name: "Undergraduate"
155154
intent: "Reasoning emergence"
156155
allowed_modalities: ["structured_knowledge", "technical_text", "code"]
157-
allowed_domains: ["science", "math", "education", "code", "literature"]
156+
allowed_domains: ["science", "math", "education", "code", "literature", "conversation","translation","golden"]
158157
constraints:
159158
tokenizer:
160159
avg_max: 40000
@@ -171,7 +170,7 @@ difficulty_system:
171170
name: "Graduate"
172171
intent: "Explicit abstraction and algorithms"
173172
allowed_modalities: ["technical_text", "math", "code"]
174-
allowed_domains: ["science", "math", "code", "instruction"]
173+
allowed_domains: ["science", "math", "code", "instruction","golden"]
175174
constraints:
176175
tokenizer:
177176
avg_max: 70000
@@ -187,9 +186,8 @@ difficulty_system:
187186
B5:
188187
name: "PhD"
189188
intent: "Planning and system-level reasoning"
190-
allowed_modalities:
191-
["hard_reasoning", "math", "advanced_code", "planning"]
192-
allowed_domains: ["instruction", "science", "math", "code"]
189+
allowed_modalities: ["hard_reasoning", "math", "advanced_code", "planning"]
190+
allowed_domains: ["instruction", "science", "math", "code","golden"]
193191
constraints:
194192
tokenizer:
195193
avg_max: .inf
@@ -391,31 +389,17 @@ domains:
391389

392390
band_domain_policy:
393391
B0:
394-
["web", "social", "qa", "education", "language_literacy", "conversation"]
392+
["web", "social", "qa", "education", "language_literacy", "conversation", "translation","golden"]
395393
B1:
396-
[
397-
"web",
398-
"encyclopedia",
399-
"news",
400-
"social",
401-
"qa",
402-
"education",
403-
"language_literacy",
404-
"conversation",
405-
]
394+
["web","encyclopedia","news","social","qa","education","language_literacy","conversation","translation","golden"]
406395
B2:
407-
[
408-
"encyclopedia",
409-
"news",
410-
"education",
411-
"literature",
412-
"web",
413-
"qa",
414-
"conversation",
415-
]
416-
B3: ["science", "math", "education", "code", "literature", "conversation"]
417-
B4: ["science", "math", "code", "instruction"]
418-
B5: ["instruction", "science", "math", "code"]
396+
["encyclopedia", "news","education", "literature","web","qa", "conversation","translation","golden"]
397+
B3:
398+
["science", "math", "education", "code", "literature", "conversation","translation","golden"]
399+
B4:
400+
["science", "math", "code", "instruction","golden"]
401+
B5:
402+
["instruction", "science", "math", "code","golden"]
419403

420404
# ============================================================
421405
# EVALUATION SIGNAL INTERFACE

experiments/3_coreset_engineering/coreset_engine_v5/tests/test_optimizations.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ def chunk_generator():
300300
language="en",
301301
band=DifficultyBand("B2"),
302302
source_doc_id="doc_1",
303+
t1_file_path="doc_2.parquet"
303304
)
304305
yield (f"chunk_{i:05d}", metadata)
305306

experiments/3_coreset_engineering/coreset_engine_v5/tests/test_pipeline.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def test_chunk_metadata_creation(self):
8787
language="en",
8888
band=DifficultyBand.B3,
8989
source_doc_id="doc_001",
90+
t1_file_path="doc_2.parquet",
9091
)
9192
assert metadata.chunk_id == "chunk_001"
9293
assert metadata.band == DifficultyBand.B3
@@ -332,6 +333,7 @@ def test_selection_using_real_sample(self, tmp_path):
332333
band=DifficultyBand(data.get("band", "B0")),
333334
source_doc_id=data.get("source_doc_id", ""),
334335
source_url=data.get("source_url", None),
336+
t1_file_path="doc_2.parquet",
335337
)
336338
all_chunks[chunk_id] = meta
337339

@@ -417,6 +419,7 @@ def test_selection_using_large_sample(self):
417419
band=DifficultyBand(data.get("band", "B0")),
418420
source_doc_id=data.get("source_doc_id", ""),
419421
source_url=data.get("source_url", None),
422+
t1_file_path="doc_2.parquet",
420423
)
421424
if "token_ids" in data:
422425
setattr(meta, "token_ids", list(data["token_ids"]))

experiments/3_coreset_engineering/coreset_engine_v5/tests/test_protected_slice_domain_eligibility.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ def test_protected_slice_enforcement_respects_band_allowed_domains(
4646
band=DifficultyBand.B4,
4747
source_doc_id="doc",
4848
source_url=None,
49+
t1_file_path="doc_2.parquet",
4950
),
5051
"c002": ChunkMetadata(
5152
chunk_id="c002",
@@ -57,6 +58,7 @@ def test_protected_slice_enforcement_respects_band_allowed_domains(
5758
band=DifficultyBand.B4,
5859
source_doc_id="doc",
5960
source_url=None,
61+
t1_file_path="doc_2.parquet",
6062
),
6163
}
6264

experiments/3_coreset_engineering/coreset_engine_v5/tools/generate_curriculum_aligned_sample.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ def generate(
166166
"quality_flags": [],
167167
"sensitive_markers": [],
168168
"start_offset": 0,
169+
"t1_file_path":"doc_2.parquet",
169170
}
170171
f.write(json.dumps(row) + "\n")
171172

experiments/3_coreset_engineering/coreset_engine_v5/tools/generate_large_sample.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def generate(out_path: Path, n: int, seed: int = 42):
3636
"sensitive_markers": [],
3737
"start_offset": 0,
3838
"token_ids": token_ids,
39+
"t1_file_path":"doc_2.parquet"
3940
}
4041
f.write(json.dumps(chunk) + "\n")
4142

experiments/3_coreset_engineering/coreset_engine_v5/tools/merge_selected_indices.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"source_doc_id",
3838
"source_url",
3939
"source",
40+
"t1_file_path",
4041
]
4142

4243
LEGACY_TOKEN_COLUMN = "token_count_estimate"

0 commit comments

Comments
 (0)