|
495 | 495 | }, |
496 | 496 | "outputs": [], |
497 | 497 | "source": [ |
498 | | - "# client = get_client(cluster_type=\"cpu\", n_workers=10, processes=True, memory_limit=\"16GiB\") #noqa: ERA001\n", |
499 | | - "# client" |
| 498 | + "client = get_client(cluster_type=\"cpu\", n_workers=10, processes=True, memory_limit=\"16GiB\")\n", |
| 499 | + "client" |
500 | 500 | ] |
501 | 501 | }, |
502 | 502 | { |
|
742 | 742 | }, |
743 | 743 | "outputs": [], |
744 | 744 | "source": [ |
745 | | - "# client.cluster.close() #noqa: ERA001\n", |
746 | | - "# client.shutdown() #noqa: ERA001" |
| 745 | + "client.cluster.close()\n", |
| 746 | + "client.shutdown()" |
747 | 747 | ] |
748 | 748 | }, |
749 | 749 | { |
|
784 | 784 | }, |
785 | 785 | { |
786 | 786 | "cell_type": "code", |
787 | | - "execution_count": 11, |
| 787 | + "execution_count": null, |
788 | 788 | "id": "5f788b91", |
789 | 789 | "metadata": { |
790 | 790 | "tags": [] |
791 | 791 | }, |
792 | 792 | "outputs": [], |
793 | 793 | "source": [ |
| 794 | + "from dask.distributed import Client, LocalCluster\n", |
| 795 | + "\n", |
794 | 796 | "from nemo_curator import AddId" |
795 | 797 | ] |
796 | 798 | }, |
|
811 | 813 | }, |
812 | 814 | "outputs": [], |
813 | 815 | "source": [ |
814 | | - "# cluster = LocalCluster(n_workers=10, processes=True, memory_limit=\"16GB\") #noqa: ERA001\n", |
815 | | - "# client = Client(cluster) #noqa: ERA001" |
| 816 | + "cluster = LocalCluster(n_workers=10, processes=True, memory_limit=\"16GB\")\n", |
| 817 | + "client = Client(cluster)" |
816 | 818 | ] |
817 | 819 | }, |
818 | 820 | { |
|
893 | 895 | }, |
894 | 896 | "outputs": [], |
895 | 897 | "source": [ |
896 | | - "# client.cluster.close() #noqa: ERA001\n", |
897 | | - "# client.shutdown() #noqa: ERA001" |
| 898 | + "client.cluster.close()\n", |
| 899 | + "client.shutdown()" |
898 | 900 | ] |
899 | 901 | }, |
900 | 902 | { |
|
1189 | 1191 | }, |
1190 | 1192 | { |
1191 | 1193 | "cell_type": "code", |
1192 | | - "execution_count": 27, |
| 1194 | + "execution_count": null, |
1193 | 1195 | "id": "5ef2f05e", |
1194 | 1196 | "metadata": { |
1195 | 1197 | "tags": [] |
1196 | 1198 | }, |
1197 | 1199 | "outputs": [], |
1198 | 1200 | "source": [ |
1199 | | - "# client.cluster.close() #noqa: ERA001\n", |
1200 | | - "# client.shutdown() #noqa: ERA001" |
| 1201 | + "client.cluster.close()\n", |
| 1202 | + "client.shutdown()" |
1201 | 1203 | ] |
1202 | 1204 | }, |
1203 | 1205 | { |
|
2583 | 2585 | }, |
2584 | 2586 | { |
2585 | 2587 | "cell_type": "code", |
2586 | | - "execution_count": 11, |
| 2588 | + "execution_count": null, |
2587 | 2589 | "id": "6418114a", |
2588 | 2590 | "metadata": {}, |
2589 | 2591 | "outputs": [ |
|
2615 | 2617 | } |
2616 | 2618 | ], |
2617 | 2619 | "source": [ |
2618 | | - "output_dataset = input_dataset\n", |
| 2620 | + "output_dataset = DocumentDataset(input_dataset.df.reset_index())\n", |
2619 | 2621 | "for classifier in classifiers:\n", |
2620 | 2622 | " output_dataset = classifier(dataset=output_dataset)\n", |
2621 | 2623 | "\n", |
|
3185 | 3187 | "!mkdir -p {high_quality_output_data_dir}" |
3186 | 3188 | ] |
3187 | 3189 | }, |
| 3190 | + { |
| 3191 | + "cell_type": "markdown", |
| 3192 | + "id": "095c99e5", |
| 3193 | + "metadata": {}, |
| 3194 | + "source": [ |
| 3195 | + "An NVIDIA API token can be obtained from: https://build.nvidia.com/" |
| 3196 | + ] |
| 3197 | + }, |
3188 | 3198 | { |
3189 | 3199 | "cell_type": "code", |
3190 | | - "execution_count": 5, |
| 3200 | + "execution_count": null, |
3191 | 3201 | "id": "aa31956f", |
3192 | 3202 | "metadata": {}, |
3193 | 3203 | "outputs": [], |
3194 | 3204 | "source": [ |
3195 | 3205 | "from openai import OpenAI\n", |
3196 | 3206 | "from transformers import AutoTokenizer\n", |
3197 | 3207 | "\n", |
3198 | | - "openai_api_token = \"<your-openai-api-token>\" # noqa: S105\n", |
| 3208 | + "nvidia_api_token = \"<your-nv-api-token>\" # noqa: S105\n", |
3199 | 3209 | "hf_token = \"<your-huggingface-token>\" # noqa: S105\n", |
3200 | 3210 | "hf_model_name = \"google/gemma-3-1b-it\"\n", |
3201 | 3211 | "api_model_name = \"google/gemma-3-1b-it\"\n", |
3202 | 3212 | "base_url = \"https://integrate.api.nvidia.com/v1\"\n", |
3203 | 3213 | "\n", |
3204 | 3214 | "\n", |
3205 | 3215 | "tokenizer = AutoTokenizer.from_pretrained(hf_model_name, token=hf_token)\n", |
3206 | | - "openai_client = OpenAI(base_url=base_url, api_key=openai_api_token)" |
| 3216 | + "openai_client = OpenAI(base_url=base_url, api_key=nvidia_api_token)" |
3207 | 3217 | ] |
3208 | 3218 | }, |
3209 | 3219 | { |
|
3726 | 3736 | "id": "01b076ee", |
3727 | 3737 | "metadata": {}, |
3728 | 3738 | "source": [ |
3729 | | - "## 5. Visualization" |
| 3739 | + "## 5. [Optional] Visualization" |
3730 | 3740 | ] |
3731 | 3741 | }, |
3732 | 3742 | { |
|
3765 | 3775 | }, |
3766 | 3776 | { |
3767 | 3777 | "cell_type": "code", |
3768 | | - "execution_count": 10, |
| 3778 | + "execution_count": null, |
3769 | 3779 | "id": "b23b66e6", |
3770 | 3780 | "metadata": {}, |
3771 | 3781 | "outputs": [ |
|
4021 | 4031 | "source": [ |
4022 | 4032 | "from viz.text_comparison_widget import compare_row_by_id\n", |
4023 | 4033 | "\n", |
4024 | | - "low_quality_exmaple = \"EN_CC-0000000181\"\n", |
| 4034 | + "low_quality_example = \"<pick-one-low-quality-datapoint>\" # ex: EN_CC-0000000181 (based on our runs)\n", |
4025 | 4035 | "\n", |
4026 | 4036 | "compare_row_by_id(\n", |
4027 | 4037 | " final,\n", |
4028 | | - " row_id=low_quality_exmaple, # Change the row id to see different examples\n", |
| 4038 | + " row_id=low_quality_example, # Change the row id to see different examples\n", |
4029 | 4039 | " id_column=\"id\",\n", |
4030 | 4040 | " col1=\"text\",\n", |
4031 | 4041 | " col2=\"rephrased\",\n", |
|
4038 | 4048 | }, |
4039 | 4049 | { |
4040 | 4050 | "cell_type": "code", |
4041 | | - "execution_count": 16, |
| 4051 | + "execution_count": null, |
4042 | 4052 | "id": "200b87a8", |
4043 | 4053 | "metadata": {}, |
4044 | 4054 | "outputs": [ |
|
4360 | 4370 | "source": [ |
4361 | 4371 | "from viz.text_comparison_widget import compare_row_by_id\n", |
4362 | 4372 | "\n", |
4363 | | - "high_quality_exmaple = \"EN_CC-0000031171\"\n", |
| 4373 | + "high_quality_example = \"<pick-one-high-quality-datapoint>\" # ex: EN_CC-0000031171 (based on our runs)\n", |
4364 | 4374 | "\n", |
4365 | 4375 | "method = \"distill\" # Choose between diverse_qa, distill, extract_knowledge, knowledge_list\n", |
4366 | 4376 | "compare_row_by_id(\n", |
4367 | 4377 | " final,\n", |
4368 | | - " row_id=high_quality_exmaple, # Change the row id to see different examples\n", |
| 4378 | + " row_id=high_quality_example, # Change the row id to see different examples\n", |
4369 | 4379 | " id_column=\"id\",\n", |
4370 | 4380 | " col1=\"text\",\n", |
4371 | 4381 | " col2=method,\n", |
|
0 commit comments