Skip to content

Commit 7b94fe2

Browse files
authored
Final fixes for Nemotron-CC tutorial (#829)
* Final fixes for Nemotron-CC tutorial Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> * add vineeth's fixes Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> * add missing imports Signed-off-by: Sarah Yurick <sarahyurick@gmail.com> --------- Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
1 parent 1061c7a commit 7b94fe2

File tree

1 file changed

+34
-24
lines changed

1 file changed

+34
-24
lines changed

tutorials/nemotron-cc/nemotron_cc.ipynb

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -495,8 +495,8 @@
495495
},
496496
"outputs": [],
497497
"source": [
498-
"# client = get_client(cluster_type=\"cpu\", n_workers=10, processes=True, memory_limit=\"16GiB\") #noqa: ERA001\n",
499-
"# client"
498+
"client = get_client(cluster_type=\"cpu\", n_workers=10, processes=True, memory_limit=\"16GiB\")\n",
499+
"client"
500500
]
501501
},
502502
{
@@ -742,8 +742,8 @@
742742
},
743743
"outputs": [],
744744
"source": [
745-
"# client.cluster.close() #noqa: ERA001\n",
746-
"# client.shutdown() #noqa: ERA001"
745+
"client.cluster.close()\n",
746+
"client.shutdown()"
747747
]
748748
},
749749
{
@@ -784,13 +784,15 @@
784784
},
785785
{
786786
"cell_type": "code",
787-
"execution_count": 11,
787+
"execution_count": null,
788788
"id": "5f788b91",
789789
"metadata": {
790790
"tags": []
791791
},
792792
"outputs": [],
793793
"source": [
794+
"from dask.distributed import Client, LocalCluster\n",
795+
"\n",
794796
"from nemo_curator import AddId"
795797
]
796798
},
@@ -811,8 +813,8 @@
811813
},
812814
"outputs": [],
813815
"source": [
814-
"# cluster = LocalCluster(n_workers=10, processes=True, memory_limit=\"16GB\") #noqa: ERA001\n",
815-
"# client = Client(cluster) #noqa: ERA001"
816+
"cluster = LocalCluster(n_workers=10, processes=True, memory_limit=\"16GB\")\n",
817+
"client = Client(cluster)"
816818
]
817819
},
818820
{
@@ -893,8 +895,8 @@
893895
},
894896
"outputs": [],
895897
"source": [
896-
"# client.cluster.close() #noqa: ERA001\n",
897-
"# client.shutdown() #noqa: ERA001"
898+
"client.cluster.close()\n",
899+
"client.shutdown()"
898900
]
899901
},
900902
{
@@ -1189,15 +1191,15 @@
11891191
},
11901192
{
11911193
"cell_type": "code",
1192-
"execution_count": 27,
1194+
"execution_count": null,
11931195
"id": "5ef2f05e",
11941196
"metadata": {
11951197
"tags": []
11961198
},
11971199
"outputs": [],
11981200
"source": [
1199-
"# client.cluster.close() #noqa: ERA001\n",
1200-
"# client.shutdown() #noqa: ERA001"
1201+
"client.cluster.close()\n",
1202+
"client.shutdown()"
12011203
]
12021204
},
12031205
{
@@ -2583,7 +2585,7 @@
25832585
},
25842586
{
25852587
"cell_type": "code",
2586-
"execution_count": 11,
2588+
"execution_count": null,
25872589
"id": "6418114a",
25882590
"metadata": {},
25892591
"outputs": [
@@ -2615,7 +2617,7 @@
26152617
}
26162618
],
26172619
"source": [
2618-
"output_dataset = input_dataset\n",
2620+
"output_dataset = DocumentDataset(input_dataset.df.reset_index())\n",
26192621
"for classifier in classifiers:\n",
26202622
" output_dataset = classifier(dataset=output_dataset)\n",
26212623
"\n",
@@ -3185,25 +3187,33 @@
31853187
"!mkdir -p {high_quality_output_data_dir}"
31863188
]
31873189
},
3190+
{
3191+
"cell_type": "markdown",
3192+
"id": "095c99e5",
3193+
"metadata": {},
3194+
"source": [
3195+
"An NVIDIA API token can be obtained from: https://build.nvidia.com/"
3196+
]
3197+
},
31883198
{
31893199
"cell_type": "code",
3190-
"execution_count": 5,
3200+
"execution_count": null,
31913201
"id": "aa31956f",
31923202
"metadata": {},
31933203
"outputs": [],
31943204
"source": [
31953205
"from openai import OpenAI\n",
31963206
"from transformers import AutoTokenizer\n",
31973207
"\n",
3198-
"openai_api_token = \"<your-openai-api-token>\" # noqa: S105\n",
3208+
"nvidia_api_token = \"<your-nv-api-token>\" # noqa: S105\n",
31993209
"hf_token = \"<your-huggingface-token>\" # noqa: S105\n",
32003210
"hf_model_name = \"google/gemma-3-1b-it\"\n",
32013211
"api_model_name = \"google/gemma-3-1b-it\"\n",
32023212
"base_url = \"https://integrate.api.nvidia.com/v1\"\n",
32033213
"\n",
32043214
"\n",
32053215
"tokenizer = AutoTokenizer.from_pretrained(hf_model_name, token=hf_token)\n",
3206-
"openai_client = OpenAI(base_url=base_url, api_key=openai_api_token)"
3216+
"openai_client = OpenAI(base_url=base_url, api_key=nvidia_api_token)"
32073217
]
32083218
},
32093219
{
@@ -3726,7 +3736,7 @@
37263736
"id": "01b076ee",
37273737
"metadata": {},
37283738
"source": [
3729-
"## 5. Visualization"
3739+
"## 5. [Optional] Visualization"
37303740
]
37313741
},
37323742
{
@@ -3765,7 +3775,7 @@
37653775
},
37663776
{
37673777
"cell_type": "code",
3768-
"execution_count": 10,
3778+
"execution_count": null,
37693779
"id": "b23b66e6",
37703780
"metadata": {},
37713781
"outputs": [
@@ -4021,11 +4031,11 @@
40214031
"source": [
40224032
"from viz.text_comparison_widget import compare_row_by_id\n",
40234033
"\n",
4024-
"low_quality_exmaple = \"EN_CC-0000000181\"\n",
4034+
"low_quality_example = \"<pick-one-low-quality-datapoint>\" # ex: EN_CC-0000000181 (based on our runs)\n",
40254035
"\n",
40264036
"compare_row_by_id(\n",
40274037
" final,\n",
4028-
" row_id=low_quality_exmaple, # Change the row id to see different examples\n",
4038+
" row_id=low_quality_example, # Change the row id to see different examples\n",
40294039
" id_column=\"id\",\n",
40304040
" col1=\"text\",\n",
40314041
" col2=\"rephrased\",\n",
@@ -4038,7 +4048,7 @@
40384048
},
40394049
{
40404050
"cell_type": "code",
4041-
"execution_count": 16,
4051+
"execution_count": null,
40424052
"id": "200b87a8",
40434053
"metadata": {},
40444054
"outputs": [
@@ -4360,12 +4370,12 @@
43604370
"source": [
43614371
"from viz.text_comparison_widget import compare_row_by_id\n",
43624372
"\n",
4363-
"high_quality_exmaple = \"EN_CC-0000031171\"\n",
4373+
"high_quality_example = \"<pick-one-high-quality-datapoint>\" # ex: EN_CC-0000031171 (based on our runs)\n",
43644374
"\n",
43654375
"method = \"distill\" # Choose between diverse_qa, distill, extract_knowledge, knowledge_list\n",
43664376
"compare_row_by_id(\n",
43674377
" final,\n",
4368-
" row_id=high_quality_exmaple, # Change the row id to see different examples\n",
4378+
" row_id=high_quality_example, # Change the row id to see different examples\n",
43694379
" id_column=\"id\",\n",
43704380
" col1=\"text\",\n",
43714381
" col2=method,\n",

0 commit comments

Comments
 (0)