Commit 1061c7a

Remaining Nemotron-CC tutorial fixes (#821)
* Remaining Nemotron-CC tutorial fixes
* uncomment wiki model downloads
* fix another file path
* fix wget links for kenlm
* add pip jsonlines
* remove unnecessary noqa
* fix fasttext import

---------

Signed-off-by: Sarah Yurick <[email protected]>
1 parent 16792b8 commit 1061c7a

File tree

tutorials/nemotron-cc/fast_text_classifier.py
tutorials/nemotron-cc/nemotron_cc.ipynb

2 files changed: +49, -40 lines changed

tutorials/nemotron-cc/fast_text_classifier.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 import fasttext
 import numpy as np
 import pandas as pd
-from fasttext import _FastText
+from fasttext.FastText import _FastText
 from huggingface_hub import hf_hub_download

 from nemo_curator.datasets import DocumentDataset
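
For context, a brief sketch (assumed usage, not code from this commit) of why the corrected import path matters: the private _FastText class is defined in the fasttext.FastText module rather than the package root, and it is the type returned when a downloaded classifier is loaded.

# Hedged illustration; repo_id and filename are placeholders, not the tutorial's actual model.
import fasttext
from fasttext.FastText import _FastText  # defined in fasttext.FastText, not in the package root
from huggingface_hub import hf_hub_download


def load_fasttext_classifier(repo_id: str, filename: str) -> _FastText:
    # Download the .bin model from the Hugging Face Hub, then load it with fastText.
    model_path = hf_hub_download(repo_id=repo_id, filename=filename)
    return fasttext.load_model(model_path)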

tutorials/nemotron-cc/nemotron_cc.ipynb

Lines changed: 48 additions & 39 deletions
@@ -114,6 +114,16 @@
 "%env CUDA_VISIBLE_DEVICES 1"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "ae89d637",
+"metadata": {},
+"outputs": [],
+"source": [
+"!pip install jsonlines"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 2,
@@ -1686,75 +1696,75 @@
 "outputs": [],
 "source": [
 "\"\"\"\n",
-"import multiprocessing as mp #noqa: ERA001\n",
-"import os #noqa: ERA001\n",
-"from collections import defaultdict #noqa: ERA001\n",
-"from typing import Any #noqa: ERA001\n",
+"import multiprocessing as mp\n",
+"import os\n",
+"from collections import defaultdict\n",
+"from typing import Any\n",
 "\n",
-"import numpy as np #noqa: ERA001\n",
-"import pandas as pd #noqa: ERA001\n",
+"import numpy as np\n",
+"import pandas as pd\n",
 "\n",
 "\n",
 "def process_row(args: tuple[int, dict[str, Any]], remove_ex: dict[int, list[tuple[int, int]]]) -> dict[str, Any]:\n",
 " # Process a single row of data\n",
-" idx, row = args #noqa: ERA001\n",
+" idx, row = args\n",
 " new_row = {\n",
-" \"id\": row[\"id\"], #noqa: ERA001\n",
-" \"text\": row[\"text\"], #noqa: ERA001\n",
+" \"id\": row[\"id\"],\n",
+" \"text\": row[\"text\"],\n",
 " # add other columns as needed\n",
 " }\n",
 "\n",
 " if idx in remove_ex:\n",
 " for start, end in remove_ex[idx][::-1]:\n",
-" new_row[\"text\"] = new_row[\"text\"][:start] + new_row[\"text\"][end:] #noqa: ERA001\n",
+" new_row[\"text\"] = new_row[\"text\"][:start] + new_row[\"text\"][end:]\n",
 "\n",
-" return new_row #noqa: ERA001\n",
+" return new_row\n",
 "\n",
 "\n",
 "def process_dataset(\n",
 " input_path: str, output_path: str, remove_file: str, suffixarray_dir: str, dataset_name: str, split_name: str\n",
 ") -> None:\n",
 " # Process the dataset using pandas\n",
 " # Read the input dataset\n",
-" df = pd.read_parquet(input_path) #noqa: ERA001\n",
+" df = pd.read_parquet(input_path)\n",
 "\n",
 " # Read removal information\n",
-" remove = [] #noqa: ERA01\n",
+" remove = []\n",
 " with open(remove_file) as fin:\n",
 " for line in fin:\n",
 " if \"out\" in line:\n",
-" break #noqa: ERA001\n",
+" break\n",
 " for line in fin:\n",
-" remove.append(list(map(int, line.split()))) #noqa: ERA001\n",
+" remove.append(list(map(int, line.split())))\n",
 "\n",
 " # Read size information\n",
-" size_file = os.path.join(suffixarray_dir, f\"{dataset_name}.{split_name}.size\") #noqa: ERA001\n",
-" sizes = np.frombuffer(open(size_file, \"rb\").read(), dtype=np.uint64) # noqa: ERA001\n",
+" size_file = os.path.join(suffixarray_dir, f\"{dataset_name}.{split_name}.size\")\n",
+" sizes = np.frombuffer(open(size_file, \"rb\").read(), dtype=np.uint64)\n",
 "\n",
 " # Process removal information\n",
-" remove_ex = defaultdict(list) #noqa: ERA001\n",
-" ptr = 0 #noqa: ERA001\n",
+" remove_ex = defaultdict(list)\n",
+" ptr = 0\n",
 " for i, byte_start in enumerate(sizes[:-1]):\n",
-" byte_end = sizes[i + 1] #noqa: ERA001\n",
+" byte_end = sizes[i + 1]\n",
 " while ptr < len(remove) and byte_start <= remove[ptr][0] < byte_end:\n",
-" assert remove[ptr][1] < byte_end + 6 # noqa: ERA001\n",
+" assert remove[ptr][1] < byte_end + 6\n",
 " remove_ex[i].append(\n",
 " (\n",
-" max(int(remove[ptr][0] - byte_start - 6), 0), #noqa: ERA001\n",
-" min(int(remove[ptr][1] - byte_start), byte_end - byte_start), #noqa: ERA001\n",
+" max(int(remove[ptr][0] - byte_start - 6), 0),\n",
+" min(int(remove[ptr][1] - byte_start), byte_end - byte_start),\n",
 " )\n",
 " )\n",
-" ptr += 1 #noqa: ERA001\n",
+" ptr += 1\n",
 "\n",
 " # Process the dataset in parallel\n",
 " with mp.Pool(mp.cpu_count()) as pool:\n",
-" processed_rows = pool.map(process_row, enumerate(df[\"text\"], remove_ex)) #noqa: ERA001\n",
+" processed_rows = pool.map(process_row, enumerate(df[\"text\"], remove_ex))\n",
 "\n",
 " # Create new dataframe with processed rows\n",
-" processed_df = pd.DataFrame(processed_rows) #noqa: ERA001\n",
+" processed_df = pd.DataFrame(processed_rows)\n",
 "\n",
 " # Save processed dataset\n",
-" processed_df.to_parquet(output_path) #noqa: ERA001\n",
+" processed_df.to_parquet(output_path)\n",
 "\n",
 "\n",
 "# Example usage\n",
@@ -2136,7 +2146,7 @@
 "id": "d03de3ab",
 "metadata": {},
 "source": [
-"Now, let's perform perplexity filtering using a KenLM model trained on wikipedia data. NeMo Curator does not support KenLM filtering out of the box, instead we will use pre-trained KenLM models hosted on [HuggingFace](https://huggingface.co/edugp/kenlm/tree/main/wikipedia) to generate perplexity scores for every document and filter based on a threshold."
+"Now, let's perform perplexity filtering using a KenLM model trained on Wikipedia data. NeMo Curator does not support KenLM filtering out of the box, instead we will use pre-trained KenLM models hosted on [Hugging Face](https://huggingface.co/edugp/kenlm/tree/main/wikipedia) to generate perplexity scores for every document and filter based on a threshold."
 ]
 },
 {
@@ -2152,17 +2162,18 @@
 },
 {
 "cell_type": "code",
-"execution_count": 21,
+"execution_count": null,
 "id": "bf2829cc",
 "metadata": {},
 "outputs": [],
 "source": [
 "models_dir = os.path.join(cur_dir, \"models\", \"wikipedia\")\n",
 "os.makedirs(models_dir, exist_ok=True)\n",
+"\n",
 "# Download KenLM pre-trained models\n",
-"# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.arpa.bin -P {models_dir}\n",
-"# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.sp.model -P {models_dir}\n",
-"# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.sp.vocab -P {models_dir}"
+"!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.arpa.bin -P {models_dir}\n",
+"!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.sp.model -P {models_dir}\n",
+"!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.sp.vocab -P {models_dir}"
 ]
 },
 {
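
As a rough illustration of how the three downloaded files fit together (a sketch, not a cell from the notebook): en.arpa.bin is the KenLM language model and en.sp.model is the SentencePiece tokenizer it was trained with, so a document is tokenized first and then scored.

import kenlm
import sentencepiece as spm

# Paths assume the models_dir layout created by the wget cell above.
lm = kenlm.Model("models/wikipedia/en.arpa.bin")
sp = spm.SentencePieceProcessor(model_file="models/wikipedia/en.sp.model")


def doc_perplexity(text: str) -> float:
    # KenLM scores whitespace-separated tokens, so apply SentencePiece first.
    tokens = " ".join(sp.encode(text, out_type=str))
    return lm.perplexity(tokens)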
@@ -2202,13 +2213,13 @@
 },
 {
 "cell_type": "code",
-"execution_count": 23,
+"execution_count": null,
 "id": "b21f0720",
 "metadata": {},
 "outputs": [],
 "source": [
 "# Input\n",
-"PF_input_data_dir = os.path.join(\"/workspace/nemotron-cc/data/heuristic_filtering/data/hq.parquet\", \"result.parquet\")\n",
+"PF_input_data_dir = kept_document_dir\n",
 "input_file_type = \"parquet\"\n",
 "batch_size = 1\n",
 "\n",
@@ -2232,7 +2243,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 61,
+"execution_count": null,
 "id": "4a844995",
 "metadata": {},
 "outputs": [
@@ -2274,7 +2285,7 @@
 ")\n",
 "\n",
 "perplexity_filter = ScoreFilter(\n",
-" PerplexityFilter(threshold=100000.00),\n",
+" PerplexityFilter(threshold=threshold),\n",
 " text_field=\"text\",\n",
 " score_field=\"perplexity_score\",\n",
 ")\n",
@@ -2567,9 +2578,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"input_dataset = DocumentDataset.read_parquet(\n",
-" \"/workspace/nemotron-cc/data/perplexity_filtering/data/hq.parquet/result.parquet\", backend=\"cudf\"\n",
-")"
+"input_dataset = DocumentDataset.read_parquet(kept_document_dir, backend=\"cudf\")"
 ]
 },
 {
