Skip to content

Commit 3eee423

Browse files
committed
Added new domains — Translation and Golden — and also updated the new domains in the band section
1 parent d349b18 commit 3eee423

File tree

7 files changed

+25
-32
lines changed

7 files changed

+25
-32
lines changed

experiments/3_coreset_engineering/coreset_engine_v5/config/curriculum.yaml

Lines changed: 16 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ difficulty_system:
108108
name: "Nursery"
109109
intent: "Surface language acquisition"
110110
allowed_modalities: ["general_text"]
111-
allowed_domains: ["web", "social", "qa"]
111+
allowed_domains: ["web", "social", "qa", "education", "language_literacy", "conversation", "translation","golden"]
112112
constraints:
113113
tokenizer:
114114
avg_max: 5000
@@ -123,7 +123,7 @@ difficulty_system:
123123
name: "Primary"
124124
intent: "Fluent everyday language"
125125
allowed_modalities: ["general_text", "clean_exposition"]
126-
allowed_domains: ["web", "encyclopedia", "news", "social", "qa"]
126+
allowed_domains: ["web","encyclopedia","news","social","qa","education","language_literacy","conversation","translation","golden"]
127127
constraints:
128128
tokenizer:
129129
avg_max: 10000
@@ -138,8 +138,7 @@ difficulty_system:
138138
name: "HighSchool"
139139
intent: "Structured knowledge without explicit reasoning"
140140
allowed_modalities: ["general_text", "structured_knowledge"]
141-
allowed_domains:
142-
["encyclopedia", "news", "education", "literature", "web", "qa"]
141+
allowed_domains: ["encyclopedia", "news","education", "literature","web","qa", "conversation","translation","golden"]
143142
constraints:
144143
tokenizer:
145144
avg_max: 20000
@@ -154,7 +153,7 @@ difficulty_system:
154153
name: "Undergraduate"
155154
intent: "Reasoning emergence"
156155
allowed_modalities: ["structured_knowledge", "technical_text", "code"]
157-
allowed_domains: ["science", "math", "education", "code", "literature"]
156+
allowed_domains: ["science", "math", "education", "code", "literature", "conversation","translation","golden"]
158157
constraints:
159158
tokenizer:
160159
avg_max: 40000
@@ -171,7 +170,7 @@ difficulty_system:
171170
name: "Graduate"
172171
intent: "Explicit abstraction and algorithms"
173172
allowed_modalities: ["technical_text", "math", "code"]
174-
allowed_domains: ["science", "math", "code", "instruction"]
173+
allowed_domains: ["science", "math", "code", "instruction","golden"]
175174
constraints:
176175
tokenizer:
177176
avg_max: 70000
@@ -187,9 +186,8 @@ difficulty_system:
187186
B5:
188187
name: "PhD"
189188
intent: "Planning and system-level reasoning"
190-
allowed_modalities:
191-
["hard_reasoning", "math", "advanced_code", "planning"]
192-
allowed_domains: ["instruction", "science", "math", "code"]
189+
allowed_modalities: ["hard_reasoning", "math", "advanced_code", "planning"]
190+
allowed_domains: ["instruction", "science", "math", "code","golden"]
193191
constraints:
194192
tokenizer:
195193
avg_max: .inf
@@ -391,31 +389,17 @@ domains:
391389

392390
band_domain_policy:
393391
B0:
394-
["web", "social", "qa", "education", "language_literacy", "conversation"]
392+
["web", "social", "qa", "education", "language_literacy", "conversation", "translation","golden"]
395393
B1:
396-
[
397-
"web",
398-
"encyclopedia",
399-
"news",
400-
"social",
401-
"qa",
402-
"education",
403-
"language_literacy",
404-
"conversation",
405-
]
394+
["web","encyclopedia","news","social","qa","education","language_literacy","conversation","translation","golden"]
406395
B2:
407-
[
408-
"encyclopedia",
409-
"news",
410-
"education",
411-
"literature",
412-
"web",
413-
"qa",
414-
"conversation",
415-
]
416-
B3: ["science", "math", "education", "code", "literature", "conversation"]
417-
B4: ["science", "math", "code", "instruction"]
418-
B5: ["instruction", "science", "math", "code"]
396+
["encyclopedia", "news","education", "literature","web","qa", "conversation","translation","golden"]
397+
B3:
398+
["science", "math", "education", "code", "literature", "conversation","translation","golden"]
399+
B4:
400+
["science", "math", "code", "instruction","golden"]
401+
B5:
402+
["instruction", "science", "math", "code","golden"]
419403

420404
# ============================================================
421405
# EVALUATION SIGNAL INTERFACE

experiments/3_coreset_engineering/coreset_engine_v5/tests/test_optimizations.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,7 @@ def chunk_generator():
300300
language="en",
301301
band=DifficultyBand("B2"),
302302
source_doc_id="doc_1",
303+
t1_file_path="doc_2.parquet"
303304
)
304305
yield (f"chunk_{i:05d}", metadata)
305306

experiments/3_coreset_engineering/coreset_engine_v5/tests/test_pipeline.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ def test_chunk_metadata_creation(self):
8787
language="en",
8888
band=DifficultyBand.B3,
8989
source_doc_id="doc_001",
90+
t1_file_path="doc_2.parquet",
9091
)
9192
assert metadata.chunk_id == "chunk_001"
9293
assert metadata.band == DifficultyBand.B3
@@ -332,6 +333,7 @@ def test_selection_using_real_sample(self, tmp_path):
332333
band=DifficultyBand(data.get("band", "B0")),
333334
source_doc_id=data.get("source_doc_id", ""),
334335
source_url=data.get("source_url", None),
336+
t1_file_path="doc_2.parquet",
335337
)
336338
all_chunks[chunk_id] = meta
337339

@@ -417,6 +419,7 @@ def test_selection_using_large_sample(self):
417419
band=DifficultyBand(data.get("band", "B0")),
418420
source_doc_id=data.get("source_doc_id", ""),
419421
source_url=data.get("source_url", None),
422+
t1_file_path="doc_2.parquet",
420423
)
421424
if "token_ids" in data:
422425
setattr(meta, "token_ids", list(data["token_ids"]))

experiments/3_coreset_engineering/coreset_engine_v5/tests/test_protected_slice_domain_eligibility.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ def test_protected_slice_enforcement_respects_band_allowed_domains(
4646
band=DifficultyBand.B4,
4747
source_doc_id="doc",
4848
source_url=None,
49+
t1_file_path="doc_2.parquet",
4950
),
5051
"c002": ChunkMetadata(
5152
chunk_id="c002",
@@ -57,6 +58,7 @@ def test_protected_slice_enforcement_respects_band_allowed_domains(
5758
band=DifficultyBand.B4,
5859
source_doc_id="doc",
5960
source_url=None,
61+
t1_file_path="doc_2.parquet",
6062
),
6163
}
6264

experiments/3_coreset_engineering/coreset_engine_v5/tools/generate_curriculum_aligned_sample.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ def generate(
166166
"quality_flags": [],
167167
"sensitive_markers": [],
168168
"start_offset": 0,
169+
"t1_file_path":"doc_2.parquet",
169170
}
170171
f.write(json.dumps(row) + "\n")
171172

experiments/3_coreset_engineering/coreset_engine_v5/tools/generate_large_sample.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ def generate(out_path: Path, n: int, seed: int = 42):
3636
"sensitive_markers": [],
3737
"start_offset": 0,
3838
"token_ids": token_ids,
39+
"t1_file_path":"doc_2.parquet"
3940
}
4041
f.write(json.dumps(chunk) + "\n")
4142

experiments/3_coreset_engineering/coreset_engine_v5/tools/merge_selected_indices.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"source_doc_id",
3838
"source_url",
3939
"source",
40+
"t1_file_path",
4041
]
4142

4243
LEGACY_TOKEN_COLUMN = "token_count_estimate"

0 commit comments

Comments
 (0)