|
114 | 114 | "%env CUDA_VISIBLE_DEVICES 1" |
115 | 115 | ] |
116 | 116 | }, |
| 117 | + { |
| 118 | + "cell_type": "code", |
| 119 | + "execution_count": null, |
| 120 | + "id": "ae89d637", |
| 121 | + "metadata": {}, |
| 122 | + "outputs": [], |
| 123 | + "source": [ |
| 124 | + "!pip install jsonlines" |
| 125 | + ] |
| 126 | + }, |
117 | 127 | { |
118 | 128 | "cell_type": "code", |
119 | 129 | "execution_count": 2, |
|
1686 | 1696 | "outputs": [], |
1687 | 1697 | "source": [ |
1688 | 1698 | "\"\"\"\n", |
1689 | | - "import multiprocessing as mp #noqa: ERA001\n", |
1690 | | - "import os #noqa: ERA001\n", |
1691 | | - "from collections import defaultdict #noqa: ERA001\n", |
1692 | | - "from typing import Any #noqa: ERA001\n", |
| 1699 | + "import multiprocessing as mp\n", |
| 1700 | + "import os\n", |
| 1701 | + "from collections import defaultdict\n", |
| 1702 | + "from typing import Any\n", |
1693 | 1703 | "\n", |
1694 | | - "import numpy as np #noqa: ERA001\n", |
1695 | | - "import pandas as pd #noqa: ERA001\n", |
| 1704 | + "import numpy as np\n", |
| 1705 | + "import pandas as pd\n", |
1696 | 1706 | "\n", |
1697 | 1707 | "\n", |
1698 | 1708 | "def process_row(args: tuple[int, dict[str, Any]], remove_ex: dict[int, list[tuple[int, int]]]) -> dict[str, Any]:\n", |
1699 | 1709 | " # Process a single row of data\n", |
1700 | | - " idx, row = args #noqa: ERA001\n", |
| 1710 | + " idx, row = args\n", |
1701 | 1711 | " new_row = {\n", |
1702 | | - " \"id\": row[\"id\"], #noqa: ERA001\n", |
1703 | | - " \"text\": row[\"text\"], #noqa: ERA001\n", |
| 1712 | + " \"id\": row[\"id\"],\n", |
| 1713 | + " \"text\": row[\"text\"],\n", |
1704 | 1714 | " # add other columns as needed\n", |
1705 | 1715 | " }\n", |
1706 | 1716 | "\n", |
1707 | 1717 | " if idx in remove_ex:\n", |
1708 | 1718 | " for start, end in remove_ex[idx][::-1]:\n", |
1709 | | - " new_row[\"text\"] = new_row[\"text\"][:start] + new_row[\"text\"][end:] #noqa: ERA001\n", |
| 1719 | + " new_row[\"text\"] = new_row[\"text\"][:start] + new_row[\"text\"][end:]\n", |
1710 | 1720 | "\n", |
1711 | | - " return new_row #noqa: ERA001\n", |
| 1721 | + " return new_row\n", |
1712 | 1722 | "\n", |
1713 | 1723 | "\n", |
1714 | 1724 | "def process_dataset(\n", |
1715 | 1725 | " input_path: str, output_path: str, remove_file: str, suffixarray_dir: str, dataset_name: str, split_name: str\n", |
1716 | 1726 | ") -> None:\n", |
1717 | 1727 | " # Process the dataset using pandas\n", |
1718 | 1728 | " # Read the input dataset\n", |
1719 | | - " df = pd.read_parquet(input_path) #noqa: ERA001\n", |
| 1729 | + " df = pd.read_parquet(input_path)\n", |
1720 | 1730 | "\n", |
1721 | 1731 | " # Read removal information\n", |
1722 | | - " remove = [] #noqa: ERA01\n", |
| 1732 | + " remove = []\n", |
1723 | 1733 | " with open(remove_file) as fin:\n", |
1724 | 1734 | " for line in fin:\n", |
1725 | 1735 | " if \"out\" in line:\n", |
1726 | | - " break #noqa: ERA001\n", |
| 1736 | + " break\n", |
1727 | 1737 | " for line in fin:\n", |
1728 | | - " remove.append(list(map(int, line.split()))) #noqa: ERA001\n", |
| 1738 | + " remove.append(list(map(int, line.split())))\n", |
1729 | 1739 | "\n", |
1730 | 1740 | " # Read size information\n", |
1731 | | - " size_file = os.path.join(suffixarray_dir, f\"{dataset_name}.{split_name}.size\") #noqa: ERA001\n", |
1732 | | - " sizes = np.frombuffer(open(size_file, \"rb\").read(), dtype=np.uint64) # noqa: ERA001\n", |
| 1741 | + " size_file = os.path.join(suffixarray_dir, f\"{dataset_name}.{split_name}.size\")\n", |
| 1742 | + "    with open(size_file, \"rb\") as size_fin:\n",
| | + "        sizes = np.frombuffer(size_fin.read(), dtype=np.uint64)\n",
1733 | 1743 | "\n", |
1734 | 1744 | " # Process removal information\n", |
1735 | | - " remove_ex = defaultdict(list) #noqa: ERA001\n", |
1736 | | - " ptr = 0 #noqa: ERA001\n", |
| 1745 | + " remove_ex = defaultdict(list)\n", |
| 1746 | + " ptr = 0\n", |
1737 | 1747 | " for i, byte_start in enumerate(sizes[:-1]):\n", |
1738 | | - " byte_end = sizes[i + 1] #noqa: ERA001\n", |
| 1748 | + " byte_end = sizes[i + 1]\n", |
1739 | 1749 | " while ptr < len(remove) and byte_start <= remove[ptr][0] < byte_end:\n", |
1740 | | - " assert remove[ptr][1] < byte_end + 6 # noqa: ERA001\n", |
| 1750 | + " assert remove[ptr][1] < byte_end + 6\n", |
1741 | 1751 | " remove_ex[i].append(\n", |
1742 | 1752 | " (\n", |
1743 | | - " max(int(remove[ptr][0] - byte_start - 6), 0), #noqa: ERA001\n", |
1744 | | - " min(int(remove[ptr][1] - byte_start), byte_end - byte_start), #noqa: ERA001\n", |
| 1753 | + " max(int(remove[ptr][0] - byte_start - 6), 0),\n", |
| 1754 | + " min(int(remove[ptr][1] - byte_start), byte_end - byte_start),\n", |
1745 | 1755 | " )\n", |
1746 | 1756 | " )\n", |
1747 | | - " ptr += 1 #noqa: ERA001\n", |
| 1757 | + " ptr += 1\n", |
1748 | 1758 | "\n", |
1749 | 1759 | " # Process the dataset in parallel\n", |
1750 | 1760 | " with mp.Pool(mp.cpu_count()) as pool:\n", |
1751 | | - " processed_rows = pool.map(process_row, enumerate(df[\"text\"], remove_ex)) #noqa: ERA001\n", |
| 1761 | + "        processed_rows = pool.starmap(process_row, [((i, row), remove_ex) for i, row in enumerate(df.to_dict(\"records\"))])\n",
1752 | 1762 | "\n", |
1753 | 1763 | " # Create new dataframe with processed rows\n", |
1754 | | - " processed_df = pd.DataFrame(processed_rows) #noqa: ERA001\n", |
| 1764 | + " processed_df = pd.DataFrame(processed_rows)\n", |
1755 | 1765 | "\n", |
1756 | 1766 | " # Save processed dataset\n", |
1757 | | - " processed_df.to_parquet(output_path) #noqa: ERA001\n", |
| 1767 | + " processed_df.to_parquet(output_path)\n", |
1758 | 1768 | "\n", |
1759 | 1769 | "\n", |
1760 | 1770 | "# Example usage\n", |
|
2136 | 2146 | "id": "d03de3ab", |
2137 | 2147 | "metadata": {}, |
2138 | 2148 | "source": [ |
2139 | | - "Now, let's perform perplexity filtering using a KenLM model trained on wikipedia data. NeMo Curator does not support KenLM filtering out of the box, instead we will use pre-trained KenLM models hosted on [HuggingFace](https://huggingface.co/edugp/kenlm/tree/main/wikipedia) to generate perplexity scores for every document and filter based on a threshold." |
| 2149 | + "Now, let's perform perplexity filtering using a KenLM model trained on Wikipedia data. NeMo Curator does not support KenLM filtering out of the box; instead, we will use pre-trained KenLM models hosted on [Hugging Face](https://huggingface.co/edugp/kenlm/tree/main/wikipedia) to generate a perplexity score for every document and filter based on a threshold."
2140 | 2150 | ] |
2141 | 2151 | }, |
2142 | 2152 | { |
|
2152 | 2162 | }, |
2153 | 2163 | { |
2154 | 2164 | "cell_type": "code", |
2155 | | - "execution_count": 21, |
| 2165 | + "execution_count": null, |
2156 | 2166 | "id": "bf2829cc", |
2157 | 2167 | "metadata": {}, |
2158 | 2168 | "outputs": [], |
2159 | 2169 | "source": [ |
2160 | 2170 | "models_dir = os.path.join(cur_dir, \"models\", \"wikipedia\")\n", |
2161 | 2171 | "os.makedirs(models_dir, exist_ok=True)\n", |
| 2172 | + "\n", |
2162 | 2173 | "# Download KenLM pre-trained models\n", |
2163 | | - "# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.arpa.bin -P {models_dir}\n", |
2164 | | - "# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.sp.model -P {models_dir}\n", |
2165 | | - "# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.sp.vocab -P {models_dir}" |
| 2174 | + "!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.arpa.bin -P {models_dir}\n", |
| 2175 | + "!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.sp.model -P {models_dir}\n", |
| 2176 | + "!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.sp.vocab -P {models_dir}" |
2166 | 2177 | ] |
2167 | 2178 | }, |
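| | + {
| | +  "cell_type": "markdown",
| | +  "id": "kenlm-perplexity-sketch-md",
| | +  "metadata": {},
| | +  "source": [
| | +   "To make the scoring step concrete, the next cell is a minimal sketch of how a perplexity score can be computed directly from the downloaded files. It is not the NeMo Curator code path: it assumes the `kenlm` and `sentencepiece` Python packages are installed in the environment, reuses `models_dir` from the cell above, and scores a placeholder `sample_text` string."
| | +  ]
| | + },
| | + {
| | +  "cell_type": "code",
| | +  "execution_count": null,
| | +  "id": "kenlm-perplexity-sketch",
| | +  "metadata": {},
| | +  "outputs": [],
| | +  "source": [
| | +   "# Minimal sketch (assumes the kenlm and sentencepiece packages are installed)\n",
| | +   "import kenlm\n",
| | +   "import sentencepiece as spm\n",
| | +   "\n",
| | +   "# Load the SentencePiece tokenizer and the KenLM model downloaded above\n",
| | +   "sp = spm.SentencePieceProcessor(model_file=os.path.join(models_dir, \"en.sp.model\"))\n",
| | +   "lm = kenlm.Model(os.path.join(models_dir, \"en.arpa.bin\"))\n",
| | +   "\n",
| | +   "sample_text = \"The quick brown fox jumps over the lazy dog.\"  # placeholder document\n",
| | +   "\n",
| | +   "# KenLM scores whitespace-separated tokens, so join the SentencePiece pieces with spaces\n",
| | +   "tokens = \" \".join(sp.encode(sample_text, out_type=str))\n",
| | +   "\n",
| | +   "# Lower perplexity means the text looks more like the Wikipedia training data\n",
| | +   "print(lm.perplexity(tokens))"
| | +  ]
| | + },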
2168 | 2179 | { |
|
2202 | 2213 | }, |
2203 | 2214 | { |
2204 | 2215 | "cell_type": "code", |
2205 | | - "execution_count": 23, |
| 2216 | + "execution_count": null, |
2206 | 2217 | "id": "b21f0720", |
2207 | 2218 | "metadata": {}, |
2208 | 2219 | "outputs": [], |
2209 | 2220 | "source": [ |
2210 | 2221 | "# Input\n", |
2211 | | - "PF_input_data_dir = os.path.join(\"/workspace/nemotron-cc/data/heuristic_filtering/data/hq.parquet\", \"result.parquet\")\n", |
| 2222 | + "PF_input_data_dir = kept_document_dir\n", |
2212 | 2223 | "input_file_type = \"parquet\"\n", |
2213 | 2224 | "batch_size = 1\n", |
2214 | 2225 | "\n", |
|
2232 | 2243 | }, |
2233 | 2244 | { |
2234 | 2245 | "cell_type": "code", |
2235 | | - "execution_count": 61, |
| 2246 | + "execution_count": null, |
2236 | 2247 | "id": "4a844995", |
2237 | 2248 | "metadata": {}, |
2238 | 2249 | "outputs": [ |
|
2274 | 2285 | ")\n", |
2275 | 2286 | "\n", |
2276 | 2287 | "perplexity_filter = ScoreFilter(\n", |
2277 | | - " PerplexityFilter(threshold=100000.00),\n", |
| 2288 | + " PerplexityFilter(threshold=threshold),\n", |
2278 | 2289 | " text_field=\"text\",\n", |
2279 | 2290 | " score_field=\"perplexity_score\",\n", |
2280 | 2291 | ")\n", |
|
2567 | 2578 | "metadata": {}, |
2568 | 2579 | "outputs": [], |
2569 | 2580 | "source": [ |
2570 | | - "input_dataset = DocumentDataset.read_parquet(\n", |
2571 | | - " \"/workspace/nemotron-cc/data/perplexity_filtering/data/hq.parquet/result.parquet\", backend=\"cudf\"\n", |
2572 | | - ")" |
| 2581 | + "input_dataset = DocumentDataset.read_parquet(kept_document_dir, backend=\"cudf\")" |
2573 | 2582 | ] |
2574 | 2583 | }, |
2575 | 2584 | { |
|