132 commits
55577ce
Add readme for coreset selection engine
aviban15 Feb 4, 2026
390c5c4
Add files via upload
sidrocks Feb 4, 2026
f2fabb8
initial commit
sidrocks Feb 4, 2026
be875cc
initial commit
sidrocks Feb 4, 2026
08f086a
Add files via upload
sidrocks Feb 10, 2026
b0f5042
feat: adapting to curriculum_v6.yaml
pankaj1311 Feb 10, 2026
924d99f
feat: added sharding logic and global dedup hash within shard and aft…
pankaj1311 Feb 10, 2026
fc5d5eb
Add coreset engine v5 (stage-wise selection)
sidrocks Feb 11, 2026
9951988
Initial Data processing/Exact dedup on curriculum data
BalajiAJ Feb 11, 2026
7611254
Initial Data processing/Exact dedup on curriculum data
BalajiAJ Feb 11, 2026
86fb9a9
Exact dedup on curriculum data using pyspark
BalajiAJ Feb 11, 2026
a936491
Exact dedup on curriculum data using pyspark
BalajiAJ Feb 11, 2026
2e44956
Disable total_token as it will be read from curriculum.yaml stage-wise
sidrocks Feb 11, 2026
05669cd
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of github.com:…
sidrocks Feb 11, 2026
41bde5e
Added optional Band-Infer feature to infer a band on band or difficul…
sidrocks Feb 12, 2026
7492ba1
remove total-tokens
sidrocks Feb 12, 2026
ec91a2a
Added glue_job_single.py
abhi1021 Feb 12, 2026
907e316
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of https://git…
abhi1021 Feb 12, 2026
891825b
Update S3 bucket name in Glue job configuration
abhi1021 Feb 12, 2026
d2f3e58
feat: add the emr scripts for dedup and stats
pankaj1311 Feb 12, 2026
4f90cd4
added new allowed_domains
abhi1021 Feb 13, 2026
ca3099a
feat: fixed shard.sh
pankaj1311 Feb 13, 2026
5970629
feat: fixed batch_processor
pankaj1311 Feb 13, 2026
e0b2d67
feat: fixed batch_processor
pankaj1311 Feb 13, 2026
0201c26
feat: fixed batch_processor
pankaj1311 Feb 13, 2026
6c0c0fc
feat: includes changes to folder segregation (Author - Balaji)
pankaj1311 Feb 13, 2026
f8568f1
feat: included commands for running on EC2 and notebook for distribut…
pankaj1311 Feb 13, 2026
7cc4141
feat: included commands for running on EC2 and fixes to resume operat…
pankaj1311 Feb 13, 2026
7ea7d33
added missing fields in the coreset selection indices
sidrocks Feb 13, 2026
21a4f92
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of
sidrocks Feb 13, 2026
af2307d
chore: updated the commands
pankaj1311 Feb 13, 2026
6cb3d20
commit as modifying local fields Merge branch 'p3/feat/stage-wise-cor…
sidrocks Feb 13, 2026
338e654
chore: updated the python to python3 in commands
pankaj1311 Feb 13, 2026
9eacd09
delete
sidrocks Feb 13, 2026
798a53d
remove pem
sidrocks Feb 13, 2026
d537731
Update kv-t3-459.pem
sidrocks Feb 13, 2026
bae0857
del pem
sidrocks Feb 13, 2026
714e0cb
Dedup script update
BalajiAJ Feb 14, 2026
7198d6c
Dedup script update
BalajiAJ Feb 14, 2026
8da4e32
band-inference applied score-source and validation of coreset generat…
sidrocks Feb 14, 2026
5440ff4
Remove folder output2 coreset output
sidrocks Feb 14, 2026
a5026f5
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of github.com:…
sidrocks Feb 15, 2026
fa68da0
columns fix issue in dedup script
BalajiAJ Feb 16, 2026
229fda7
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of https://git…
BalajiAJ Feb 16, 2026
c040fa8
columns fix issue in dedup script
BalajiAJ Feb 16, 2026
c7cd591
include code for B6 - will be optional and get activated only curricu…
sidrocks Feb 16, 2026
e92d64b
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of github.com:…
sidrocks Feb 16, 2026
662ce78
columns fix issue in dedup script
BalajiAJ Feb 17, 2026
6a7d036
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of https://git…
BalajiAJ Feb 17, 2026
1029609
feat: major changes to dependency management, to be merged with stagi…
pankaj1311 Feb 17, 2026
3bb7000
feat: evaluate the curriculums created
pankaj1311 Feb 17, 2026
a61e7f7
feat: fixes to run tests from relative paths, can be altered later pe…
pankaj1311 Feb 17, 2026
732cef5
chore: add coresets build commands to md
pankaj1311 Feb 17, 2026
feffea4
chore: update contribution.md
pankaj1311 Feb 17, 2026
441b4fa
chore: add automation scripts and documentation for running it manual…
pankaj1311 Feb 17, 2026
9a0fcd8
feat: automation script for coresets generation
pankaj1311 Feb 17, 2026
c8df7b5
feat: CI pipeline for coresets generation
pankaj1311 Feb 17, 2026
1fccdbf
chore: standardizing the resume option
pankaj1311 Feb 17, 2026
14aeb77
feat: this commit includes the automation script to run coresets gene…
pankaj1311 Feb 17, 2026
323cc26
chore: updated .gitignore
pankaj1311 Feb 17, 2026
d54c9ab
feat: includes the CI pipeline for running the coresets generation (n…
pankaj1311 Feb 17, 2026
280c6df
chore: remove the duplicate automation script
pankaj1311 Feb 17, 2026
2ede4f9
feat: s3 spatial analysis + plots (#513)
pankaj1311 Feb 20, 2026
4f42ac6
P3/feat/coresets dist analysis (#520)
pankaj1311 Feb 21, 2026
45bbd1a
Merge branch 'staging' into p3/feat/stage-wise-coreset-selection_v2
pankaj1311 Feb 21, 2026
8686515
P3/feat/coresets dist analysis (#533)
pankaj1311 Feb 21, 2026
84ccd04
P3/feat/coresets dist analysis (#534)
pankaj1311 Feb 21, 2026
7a9f3ee
P3/feat/coresets dist analysis (#535)
pankaj1311 Feb 21, 2026
dea8de4
P3/feat/coresets dist analysis (#536)
pankaj1311 Feb 21, 2026
4794224
P3/feat/coresets dist analysis (#537)
pankaj1311 Feb 21, 2026
41592f7
P3/feat/coresets dist analysis (#538)
pankaj1311 Feb 21, 2026
1ffd2c9
P3/feat/coresets dist analysis (#539)
pankaj1311 Feb 21, 2026
22864c1
P3/feat/coresets dist analysis (#540)
pankaj1311 Feb 21, 2026
cd31c8d
feat: included the user interruption logic for shards
pankaj1311 Feb 22, 2026
3e8e872
feat: included the interruption logic
pankaj1311 Feb 22, 2026
c86c256
feat: fixed default params and pre-commit fixes
pankaj1311 Feb 22, 2026
56b5e7b
Fix: Protected slices adding tokens to B4/B5 skipping disallowed doma…
sidrocks Feb 22, 2026
1e96e1e
Updated deliverables.md with latest update to code and additional req…
sidrocks Feb 23, 2026
39100ff
chore: added the report to be published
pankaj1311 Feb 23, 2026
63331f5
chore: added the report to be published
pankaj1311 Feb 23, 2026
bdb54b1
Updated documentation on coreset selection operational process
sidrocks Feb 23, 2026
8908553
Add files via upload
vj1117 Feb 24, 2026
550441c
docs(coreset): add T3 critical review and production readiness audit
AnkitaMungalpara Feb 24, 2026
20c0f5e
Update T3_GO_NOGO_REVIEW_REPORT_240226_v4.md
vj1117 Feb 24, 2026
489f045
Add files via upload
vj1117 Feb 25, 2026
9ec2a15
chore: updated the report with comments from T3 team
pankaj1311 Feb 25, 2026
a172247
chore: updated the report with comments from T3 team
pankaj1311 Feb 25, 2026
87a9b02
Revise token accounting and data contract details
sidrocks Feb 25, 2026
28d27d4
Upstream data contract specs addded
sidrocks Feb 25, 2026
4361030
Merge branch 'p3/feat/stage-wise-coreset-selection_v2' of github.com:…
sidrocks Feb 25, 2026
914fbc1
Update token accounting and add upstream data contract
sidrocks Feb 25, 2026
a7c82e8
Revise chunk file schema section in documentation
sidrocks Feb 25, 2026
37ee632
Update deliverables section in T3 report
BalajiAJ Feb 25, 2026
dd57dd6
Update T3 report with deduplication and performance details
BalajiAJ Feb 25, 2026
814dc03
Add files via upload
vj1117 Feb 26, 2026
395e593
Add files via upload
vj1117 Feb 26, 2026
82526b4
update ablation report generation include additional metric for singl…
sidrocks Feb 26, 2026
38c0359
feat: added changes to the production playbook
pankaj1311 Feb 26, 2026
deaf3e5
feat: update commands.sh and include new band policies to curriculum.…
pankaj1311 Feb 26, 2026
05ea404
chore: pre-commit fixes
pankaj1311 Feb 26, 2026
fe4216a
Added instruction of using total_tokens during pre-run, merge report …
sidrocks Feb 26, 2026
cf037e5
Merge branch 'staging' into p3/feat/stage-wise-coreset-selection_v2
pankaj1311 Feb 26, 2026
c76b502
chore: cleanup
pankaj1311 Feb 26, 2026
6caf629
chore: updated gitignore
pankaj1311 Feb 26, 2026
d33ceae
Merge branch 'staging' into p3/feat/stage-wise-coreset-selection_v2
pankaj1311 Feb 27, 2026
c45bbd1
chore: minor fixes to commands.sh
pankaj1311 Feb 27, 2026
7b0c8f4
chore: minor fixes to commands.sh
pankaj1311 Feb 27, 2026
1e08818
Added final review summary report
sidrocks Feb 27, 2026
4374fec
chore: fixed path in validate_infra.sh
pankaj1311 Feb 27, 2026
3bed63b
chore: fixed path in validate_infra.sh
pankaj1311 Feb 27, 2026
d356126
chore: final fixes made to infra validations
pankaj1311 Feb 27, 2026
847c5c3
Merge branch 'staging' into p3/feat/stage-wise-coreset-selection_v2
pankaj1311 Feb 28, 2026
2139644
fix tiebreaker when band_scores of multiple records are same
sidrocks Mar 2, 2026
b2b6ac1
Merge branch 'staging' into p3/feat/stage-wise-coreset-selection_v2
pankaj1311 Mar 3, 2026
8125aaa
chore: minor pre-commit fixes and updates to the documentation
pankaj1311 Mar 3, 2026
12d8729
feat: add t1_file_path to the final T3 output
pankaj1311 Mar 3, 2026
ab4e00f
Inclusion of new field t1_file_path as part of output coreset selecte…
sidrocks Mar 3, 2026
044ceb3
feat: added nvme setup script
pankaj1311 Mar 4, 2026
e91c42d
Application of language policy twice - base selection and protected s…
sidrocks Mar 4, 2026
9596d4f
chore: pre-commit fix
pankaj1311 Mar 4, 2026
018dbe2
chore: added s3 paths and inventory script
pankaj1311 Mar 4, 2026
f9da0a7
feat: removing accidentally added submodules
pankaj1311 Mar 4, 2026
20531c3
chore: pre-commit fixes
pankaj1311 Mar 4, 2026
d349b18
feat: added the source field
pankaj1311 Mar 4, 2026
3eee423
Added new domains - Translation, Golden and also update the new domai…
sidrocks Mar 4, 2026
7dfa3d0
feat: divided foreground and background scripts, removed dead paramet…
pankaj1311 Mar 4, 2026
cff0707
chore: pre-commit fixes
pankaj1311 Mar 4, 2026
4c8ea35
feat: create output paths based on enable_nvme flag
pankaj1311 Mar 5, 2026
d1c0d14
feat: removed the domain from curriculum policy
pankaj1311 Mar 5, 2026
bad055a
feat: asw sync changes
pankaj1311 Mar 5, 2026
a2222b3
chore: included languages to curriculum.yaml
pankaj1311 Mar 5, 2026
5bcb3e2
feat: added post processing steps
pankaj1311 Mar 6, 2026
@@ -40,7 +40,30 @@ language_and_context:
- lang: "en"
max_share: 0.92
secondary_languages:
- lang: ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
- lang:
[
"as",
"bn",
"gu",
"hi",
"kn",
"ml",
"mr",
"or",
"pa",
"ta",
"te",
"bn_roman",
"gu_roman",
"hi_roman",
"ml_roman",
"mr_roman",
"or_roman",
"pa_roman",
"ta_roman",
"te_roman",
"kn_roman",
]
max_share: 0.08
earliest_stage: "1B"
excluded_languages: ["zh", "ja", "ko", "fr", "de", "es"]
@@ -108,7 +131,16 @@ difficulty_system:
name: "Nursery"
intent: "Surface language acquisition"
allowed_modalities: ["general_text"]
allowed_domains: ["web", "social", "qa"]
allowed_domains:
[
"web",
"social",
"qa",
"education",
"language_literacy",
"conversation",
"translation",
]
constraints:
tokenizer:
avg_max: 5000
@@ -123,7 +155,18 @@ difficulty_system:
name: "Primary"
intent: "Fluent everyday language"
allowed_modalities: ["general_text", "clean_exposition"]
allowed_domains: ["web", "encyclopedia", "news", "social", "qa"]
allowed_domains:
[
"web",
"encyclopedia",
"news",
"social",
"qa",
"education",
"language_literacy",
"conversation",
"translation",
]
constraints:
tokenizer:
avg_max: 10000
@@ -139,7 +182,16 @@ difficulty_system:
intent: "Structured knowledge without explicit reasoning"
allowed_modalities: ["general_text", "structured_knowledge"]
allowed_domains:
["encyclopedia", "news", "education", "literature", "web", "qa"]
[
"encyclopedia",
"news",
"education",
"literature",
"web",
"qa",
"conversation",
"translation",
]
constraints:
tokenizer:
avg_max: 20000
@@ -154,7 +206,16 @@ difficulty_system:
name: "Undergraduate"
intent: "Reasoning emergence"
allowed_modalities: ["structured_knowledge", "technical_text", "code"]
allowed_domains: ["science", "math", "education", "code", "literature"]
allowed_domains:
[
"science",
"math",
"education",
"code",
"literature",
"conversation",
"translation",
]
constraints:
tokenizer:
avg_max: 40000
@@ -391,7 +452,15 @@ domains:

band_domain_policy:
B0:
["web", "social", "qa", "education", "language_literacy", "conversation"]
[
"web",
"social",
"qa",
"education",
"language_literacy",
"conversation",
"translation",
]
B1:
[
"web",
@@ -402,6 +471,7 @@ domains:
"education",
"language_literacy",
"conversation",
"translation",
]
B2:
[
@@ -412,8 +482,18 @@
"web",
"qa",
"conversation",
"translation",
]
B3:
[
"science",
"math",
"education",
"code",
"literature",
"conversation",
"translation",
]
B3: ["science", "math", "education", "code", "literature", "conversation"]
B4: ["science", "math", "code", "instruction"]
B5: ["instruction", "science", "math", "code"]

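The `band_domain_policy` mapping above gates which domains may contribute tokens to each band. A minimal sketch of how a consumer might enforce it — the dict literal mirrors a few bands from the YAML fragment, and the helper name `is_domain_allowed` is illustrative, not part of the pipeline:

```python
# Sketch: enforcing band_domain_policy when admitting chunks to a band.
# The policy entries mirror the curriculum.yaml fragment above (B0 now
# includes "translation"); only a subset of bands is shown.

BAND_DOMAIN_POLICY = {
    "B0": ["web", "social", "qa", "education", "language_literacy",
           "conversation", "translation"],
    "B4": ["science", "math", "code", "instruction"],
    "B5": ["instruction", "science", "math", "code"],
}


def is_domain_allowed(band: str, domain: str) -> bool:
    """Return True when `domain` may contribute tokens to `band`."""
    allowed = BAND_DOMAIN_POLICY.get(band)
    if allowed is None:
        # Unknown band: fail closed so misconfigured chunks are skipped.
        return False
    return domain in allowed
```

In practice the policy would be loaded from `curriculum.yaml` rather than inlined; the fail-closed default keeps chunks with an unrecognized band out of every stage.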
@@ -269,6 +269,7 @@ def _build_stage_coreset(self, stage_name: str, stage_config) -> dict:
"byte_length": getattr(all_chunks[cid], "byte_length", 0),
"source_doc_id": getattr(all_chunks[cid], "source_doc_id", ""),
"source_url": getattr(all_chunks[cid], "source_url", None),
"t1_file_path": getattr(all_chunks[cid], "t1_file_path", None),
# Many datasets use `source` as the dataset identifier; keep both.
"source": getattr(all_chunks[cid], "source", None)
or all_chunks[cid].dataset_id,
@@ -870,13 +871,15 @@ def _base_iter_batches() -> (
columns = [
"chunk_id",
"dataset_id",
"source",
"token_count_estimate",
"byte_length",
"domain",
"language",
"band",
"source_doc_id",
"source_url",
"t1_file_path",
"token_ids",
# Optional continuous score columns used by --band-score-source.
"band_score",
@@ -1307,6 +1310,8 @@ def _build_stage_coreset(self, stage_name: str, stage_config) -> dict:
or meta_dict.get("source_doc_id", ""),
source_url=row.get("source_url", None)
or meta_dict.get("source_url", None),
t1_file_path=row.get("t1_file_path", None)
or meta_dict.get("t1_file_path", None),
)

# Preserve raw input source when available (some datasets distinguish dataset_id vs source).
@@ -1420,6 +1425,7 @@ def _build_stage_coreset(self, stage_name: str, stage_config) -> dict:
),
"source_doc_id": getattr(meta, "source_doc_id", ""),
"source_url": getattr(meta, "source_url", None),
"t1_file_path": getattr(meta, "t1_file_path", None),
# Preserve original `source` when present; fallback to dataset_id.
"source": getattr(meta, "source", None)
or meta.dataset_id,
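The diff above propagates `t1_file_path` with the same flat-column-first, nested-metadata-second fallback used for `source_doc_id` and `source_url`. A standalone sketch of that precedence (the helper name `resolve_field` is illustrative):

```python
# Sketch of the fallback pattern used above: prefer the flat column,
# then the nested metadata dict, then a default. Note that `or` treats
# any falsy value (None, "", 0) as missing, matching the code in the diff.

def resolve_field(row: dict, meta: dict, name: str, default=None):
    """Resolve `name` from a flat row, falling back to nested metadata."""
    return row.get(name) or meta.get(name) or default


row = {"chunk_id": "ch_001", "t1_file_path": None}
meta = {"t1_file_path": "s3://t1-raw/example/file.parquet"}

t1_path = resolve_field(row, meta, "t1_file_path")
```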
@@ -95,7 +95,7 @@ print(df.head())
**Sample Columns (typical):**

- `chunk_id`, `dataset_id`, `token_count`, `domain`, `language`, `band`
- `byte_length`, `source_doc_id`, `source_url`
- `byte_length`, `source_doc_id`, `source_url`, `t1_file_path`
- `source` (when available)


@@ -120,7 +120,7 @@ with open("output/coresets/1B/selected_indices.jsonl") as f:
**Sample Output (schema-aligned):**

```json
{"chunk_id":"ch_001","dataset_id":"books","source":"books","token_count":2048,"byte_length":6463,"domain":"literature","language":"en","band":"B0","source_doc_id":"part-00000-...parquet","source_url":"s3://..."}
{"chunk_id":"ch_001","dataset_id":"books","source":"books","token_count":2048,"byte_length":6463,"domain":"literature","language":"en","band":"B0","source_doc_id":"part-00000-...parquet","source_url":"s3://...","t1_file_path":"s3://t1-raw/.../file.parquet"}
```

### CSV Format
@@ -140,8 +140,8 @@ df = pd.read_csv("output/coresets/1B/selected_indices.csv")
**Sample Output (schema-aligned):**

```csv
chunk_id,dataset_id,source,token_count,byte_length,domain,language,band,source_doc_id,source_url
ch_001,books,books,2048,6463,literature,en,B0,part-00000-...parquet,s3://...
chunk_id,dataset_id,source,token_count,byte_length,domain,language,band,source_doc_id,source_url,t1_file_path
ch_001,books,books,2048,6463,literature,en,B0,part-00000-...parquet,s3://...,s3://t1-raw/.../file.parquet
```

## Configuration Examples
@@ -204,8 +204,9 @@ Each row/object contains:
- **source**: Original dataset source label when provided (often same as dataset_id)
- **source_doc_id**: Document source file name
- **source_url**: URL if available
- **t1_file_path**: Path/URI to the original raw source file for this chunk, as recorded by the T1 dataset team

* source_url+source_doc_id --> Leads to the source dataset file and then use chunk_id to pull the exact record data (Raw dataset)
* `t1_file_path` and/or `source_url`+`source_doc_id` can be used for traceability to the source dataset; use `chunk_id` to locate the exact record within that source.

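The traceability rule above can be sketched as a small helper: prefer `t1_file_path` when the T1 team supplied it, otherwise fall back to `source_url` plus `source_doc_id`. The helper name and paths are illustrative, not part of the pipeline:

```python
# Sketch: choosing a traceability target for a selected-indices record.
# Prefers `t1_file_path`; falls back to source_url + source_doc_id.
# Use `chunk_id` to locate the exact record inside the returned source.

def trace_target(record: dict):
    """Return the best URI/path for locating the raw source of a chunk."""
    if record.get("t1_file_path"):
        return record["t1_file_path"]
    if record.get("source_url") and record.get("source_doc_id"):
        return f'{record["source_url"]}/{record["source_doc_id"]}'
    return None


record = {
    "chunk_id": "ch_001",
    "source_url": "s3://bucket/dataset",
    "source_doc_id": "part-00000.parquet",
    "t1_file_path": "s3://t1-raw/example/file.parquet",
}
```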
## Performance Comparison

@@ -13,7 +13,7 @@ Defines the **expected upstream chunk schema** consumed by the coreset pipeline
> 1) **Metadata-only chunk pool** (like `data/outputv2/b0_shard_0.jsonl`): IDs + band/domain/language + counts + band probabilities/scores.
> 2) **Text/tokens present**: includes `chunk_text` and/or `token_ids` to enable real dedup and richer diversity scoring.
>
> Band inference is controlled by the streaming entrypoint flags `--band-inference` and `--band-score-source` (sometimes described informally as “band inference” / “band source score”).

### Required fields

@@ -37,6 +37,7 @@ Defines the **expected upstream chunk schema** consumed by the coreset pipeline
| `dataset_id` (or `source`) | string | Traceability/output | JSONL defaults to `"ds"` (aliases: `dataset_id` or `source` or `metadata.source`) |
| `byte_length` | int | Traceability/output | Defaults to `0` |
| `source_doc_id` | string | Traceability/output | Should be provided; otherwise empty/missing propagates |
| `t1_file_path` | string | Traceability/output | Optional. When provided by the T1 dataset team, points to the original raw source file containing the record (distinct from `source_doc_id`, which is typically the processed Parquet part filename). |
| `source_url` | string | Traceability/output | Optional |
| `quality_flags` | list[str] | Output metadata | Defaults to `[]` |
| `sensitive_markers` | list[str] | Output metadata | Defaults to `[]` |
@@ -48,7 +49,7 @@ This file is a **metadata-only** chunk pool: it does **not** include `chunk_text`

### Columns present (verbatim)

`agentic_score`, `band`, `band_p_B0`, `band_p_B1`, `band_p_B2`, `band_p_B3`, `band_p_B4`, `band_p_B5`, `band_score`, `byte_length`, `chunk_id`, `code_score`, `compression_ratio`, `cot_score`, `difficulty_score`, `domain`, `fertility_estimate`, `has_agentic`, `has_code`, `has_cot`, `has_reasoning`, `language`, `math_score`, `reasoning_score`, `source`, `source_doc_id`, `source_url`, `token_count_estimate`, `unique_token_ratio`, `word_count`.
`agentic_score`, `band`, `band_p_B0`, `band_p_B1`, `band_p_B2`, `band_p_B3`, `band_p_B4`, `band_p_B5`, `band_score`, `byte_length`, `chunk_id`, `code_score`, `compression_ratio`, `cot_score`, `difficulty_score`, `domain`, `fertility_estimate`, `has_agentic`, `has_code`, `has_cot`, `has_reasoning`, `language`, `math_score`, `reasoning_score`, `source`, `source_doc_id`, `source_url`, `t1_file_path`, `token_count_estimate`, `unique_token_ratio`, `word_count`.

### What the pipeline consumes from these columns

@@ -63,7 +64,8 @@ This file is a **metadata-only** chunk pool: it does **not** include `chunk_text`
- `band` → `ChunkMetadata.band`
- `source_doc_id` → `ChunkMetadata.source_doc_id`
- `source_url` → `ChunkMetadata.source_url`
- `band_score` → attached dynamically as `metadata.band_score` (used for ranking when present)
- `t1_file_path` → attached dynamically as `ChunkMetadata.t1_file_path` (propagates to selected-indices output when present)
- `band_score` → attached dynamically as `ChunkMetadata.band_score` (used for ranking when present)

When running the streaming entrypoint `coreset_builder.py` with `--band-inference` enabled (anything other than `none`), the builder may also read `difficulty_score` and/or `band_p_B0..band_p_B6` (per `--band-score-source`) to:

@@ -76,12 +78,12 @@ Other fields in this file (e.g., `has_code`, `*_score`, `word_count`, `unique_to

Flat record (recommended):
```json
{"chunk_id":"ch_001","dataset_id":"books","token_count_estimate":2048,"byte_length":9876,"domain":"clean_web","language":"en","band":"B2","source_doc_id":"part-00000","source_url":"s3://...","token_ids":[1,2,3]}
{"chunk_id":"ch_001","dataset_id":"books","token_count_estimate":2048,"byte_length":9876,"domain":"clean_web","language":"en","band":"B2","source_doc_id":"part-00000","source_url":"s3://...","t1_file_path":"s3://t1-raw/.../file.parquet","token_ids":[1,2,3]}
```

Nested metadata (accepted):
```json
{"uid":"ch_001","token_count":2048,"metadata":{"source":"books","domain":"clean_web","language":"en","band":"B2","source_doc_id":"part-00000"}}
{"uid":"ch_001","token_count":2048,"metadata":{"source":"books","domain":"clean_web","language":"en","band":"B2","source_doc_id":"part-00000","t1_file_path":"s3://t1-raw/.../file.parquet"}}
```
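Since both shapes are accepted, a loader has to normalize the nested form into the recommended flat record. A sketch under the field aliases shown above (`uid` → `chunk_id`, `token_count` vs `token_count_estimate`, `metadata.source` → `dataset_id`, JSONL default `"ds"`); the helper itself is illustrative:

```python
# Sketch: normalizing the accepted nested-metadata shape into the
# recommended flat record, following the alias rules in the tables above.

def normalize_chunk(record: dict) -> dict:
    meta = record.get("metadata", {})
    return {
        "chunk_id": record.get("chunk_id") or record.get("uid"),
        "dataset_id": record.get("dataset_id")
        or record.get("source")
        or meta.get("source", "ds"),
        "token_count_estimate": record.get("token_count_estimate")
        or record.get("token_count"),
        "domain": record.get("domain") or meta.get("domain"),
        "language": record.get("language") or meta.get("language"),
        "band": record.get("band") or meta.get("band"),
        "source_doc_id": record.get("source_doc_id")
        or meta.get("source_doc_id", ""),
        "t1_file_path": record.get("t1_file_path") or meta.get("t1_file_path"),
    }


nested = {
    "uid": "ch_001",
    "token_count": 2048,
    "metadata": {"source": "books", "domain": "clean_web", "language": "en",
                 "band": "B2", "source_doc_id": "part-00000"},
}
flat = normalize_chunk(nested)
```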

## Parquet: minimum viable columns
@@ -90,4 +92,4 @@
- `chunk_id`, `dataset_id`, `domain`, `language`, `band`, `byte_length`, `source_doc_id`, and one of `token_count`/`token_count_estimate`

Optional columns:
- `source_url`, `quality_flags`, `sensitive_markers`, `start_offset`, `token_ids`
- `source_url`, `t1_file_path`, `quality_flags`, `sensitive_markers`, `start_offset`, `token_ids`
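The required/optional split above lends itself to a pre-flight check before handing a Parquet pool to the pipeline. A sketch on a plain column list — in practice the names would come from something like `pyarrow.parquet.ParquetFile(path).schema.names` (an assumption; the check itself is just set arithmetic):

```python
# Sketch: validating that a Parquet chunk pool exposes the minimum
# viable columns listed above. Either token_count or token_count_estimate
# satisfies the token-count requirement.

REQUIRED = {"chunk_id", "dataset_id", "domain", "language", "band",
            "byte_length", "source_doc_id"}
TOKEN_COUNT_ALIASES = {"token_count", "token_count_estimate"}


def missing_columns(columns) -> set:
    """Return the set of required columns absent from `columns`."""
    cols = set(columns)
    missing = REQUIRED - cols
    if not cols & TOKEN_COUNT_ALIASES:
        missing.add("token_count|token_count_estimate")
    return missing


ok_cols = ["chunk_id", "dataset_id", "domain", "language", "band",
           "byte_length", "source_doc_id", "token_count_estimate",
           "source_url", "t1_file_path"]
```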