|
114 | 114 | "%env CUDA_VISIBLE_DEVICES 1" |
115 | 115 | ] |
116 | 116 | }, |
| 117 | + { |
| 118 | + "cell_type": "code", |
| 119 | + "execution_count": null, |
| 120 | + "id": "ae89d637", |
| 121 | + "metadata": {}, |
| 122 | + "outputs": [], |
| 123 | + "source": [ |
| 124 | + "!pip install jsonlines" |
| 125 | + ] |
| 126 | + }, |
117 | 127 | { |
118 | 128 | "cell_type": "code", |
119 | 129 | "execution_count": 2, |
|
1686 | 1696 | "outputs": [], |
1687 | 1697 | "source": [ |
1688 | 1698 | "\"\"\"\n", |
1689 | | - "import multiprocessing as mp #noqa: ERA001\n", |
1690 | | - "import os #noqa: ERA001\n", |
1691 | | - "from collections import defaultdict #noqa: ERA001\n", |
1692 | | - "from typing import Any #noqa: ERA001\n", |
| 1699 | + "import multiprocessing as mp\n", |
| 1700 | + "import os\n", |
| 1701 | + "from collections import defaultdict\n", |
| 1702 | + "from typing import Any\n", |
1693 | 1703 | "\n", |
1694 | | - "import numpy as np #noqa: ERA001\n", |
1695 | | - "import pandas as pd #noqa: ERA001\n", |
| 1704 | + "import numpy as np\n", |
| 1705 | + "import pandas as pd\n", |
1696 | 1706 | "\n", |
1697 | 1707 | "\n", |
1698 | 1708 | "def process_row(args: tuple[int, dict[str, Any]], remove_ex: dict[int, list[tuple[int, int]]]) -> dict[str, Any]:\n", |
1699 | 1709 | " # Process a single row of data\n", |
1700 | | - " idx, row = args #noqa: ERA001\n", |
| 1710 | + " idx, row = args\n", |
1701 | 1711 | " new_row = {\n", |
1702 | | - " \"id\": row[\"id\"], #noqa: ERA001\n", |
1703 | | - " \"text\": row[\"text\"], #noqa: ERA001\n", |
| 1712 | + " \"id\": row[\"id\"],\n", |
| 1713 | + " \"text\": row[\"text\"],\n", |
1704 | 1714 | " # add other columns as needed\n", |
1705 | 1715 | " }\n", |
1706 | 1716 | "\n", |
1707 | 1717 | " if idx in remove_ex:\n", |
1708 | 1718 | " for start, end in remove_ex[idx][::-1]:\n", |
1709 | | - " new_row[\"text\"] = new_row[\"text\"][:start] + new_row[\"text\"][end:] #noqa: ERA001\n", |
| 1719 | + " new_row[\"text\"] = new_row[\"text\"][:start] + new_row[\"text\"][end:]\n", |
1710 | 1720 | "\n", |
1711 | | - " return new_row #noqa: ERA001\n", |
| 1721 | + " return new_row\n", |
1712 | 1722 | "\n", |
1713 | 1723 | "\n", |
1714 | 1724 | "def process_dataset(\n", |
1715 | 1725 | " input_path: str, output_path: str, remove_file: str, suffixarray_dir: str, dataset_name: str, split_name: str\n", |
1716 | 1726 | ") -> None:\n", |
1717 | 1727 | " # Process the dataset using pandas\n", |
1718 | 1728 | " # Read the input dataset\n", |
1719 | | - " df = pd.read_parquet(input_path) #noqa: ERA001\n", |
| 1729 | + " df = pd.read_parquet(input_path)\n", |
1720 | 1730 | "\n", |
1721 | 1731 | " # Read removal information\n", |
1722 | | - " remove = [] #noqa: ERA01\n", |
| 1732 | + " remove = []\n", |
1723 | 1733 | " with open(remove_file) as fin:\n", |
1724 | 1734 | " for line in fin:\n", |
1725 | 1735 | " if \"out\" in line:\n", |
1726 | | - " break #noqa: ERA001\n", |
| 1736 | + " break\n", |
1727 | 1737 | " for line in fin:\n", |
1728 | | - " remove.append(list(map(int, line.split()))) #noqa: ERA001\n", |
| 1738 | + " remove.append(list(map(int, line.split())))\n", |
1729 | 1739 | "\n", |
1730 | 1740 | " # Read size information\n", |
1731 | | - " size_file = os.path.join(suffixarray_dir, f\"{dataset_name}.{split_name}.size\") #noqa: ERA001\n", |
1732 | | - " sizes = np.frombuffer(open(size_file, \"rb\").read(), dtype=np.uint64) # noqa: ERA001\n", |
| 1741 | + " size_file = os.path.join(suffixarray_dir, f\"{dataset_name}.{split_name}.size\")\n", |
| 1742 | + "    with open(size_file, \"rb\") as size_fin:\n",
| | + "        sizes = np.frombuffer(size_fin.read(), dtype=np.uint64)\n",
1733 | 1743 | "\n", |
1734 | 1744 | " # Process removal information\n", |
1735 | | - " remove_ex = defaultdict(list) #noqa: ERA001\n", |
1736 | | - " ptr = 0 #noqa: ERA001\n", |
| 1745 | + " remove_ex = defaultdict(list)\n", |
| 1746 | + " ptr = 0\n", |
1737 | 1747 | " for i, byte_start in enumerate(sizes[:-1]):\n", |
1738 | | - " byte_end = sizes[i + 1] #noqa: ERA001\n", |
| 1748 | + " byte_end = sizes[i + 1]\n", |
1739 | 1749 | " while ptr < len(remove) and byte_start <= remove[ptr][0] < byte_end:\n", |
1740 | | - " assert remove[ptr][1] < byte_end + 6 # noqa: ERA001\n", |
| 1750 | + " assert remove[ptr][1] < byte_end + 6\n", |
1741 | 1751 | " remove_ex[i].append(\n", |
1742 | 1752 | " (\n", |
1743 | | - " max(int(remove[ptr][0] - byte_start - 6), 0), #noqa: ERA001\n", |
1744 | | - " min(int(remove[ptr][1] - byte_start), byte_end - byte_start), #noqa: ERA001\n", |
| 1753 | + " max(int(remove[ptr][0] - byte_start - 6), 0),\n", |
| 1754 | + " min(int(remove[ptr][1] - byte_start), byte_end - byte_start),\n", |
1745 | 1755 | " )\n", |
1746 | 1756 | " )\n", |
1747 | | - " ptr += 1 #noqa: ERA001\n", |
| 1757 | + " ptr += 1\n", |
1748 | 1758 | "\n", |
1749 | 1759 | " # Process the dataset in parallel\n", |
1750 | 1760 | " with mp.Pool(mp.cpu_count()) as pool:\n", |
1751 | | - " processed_rows = pool.map(process_row, enumerate(df[\"text\"], remove_ex)) #noqa: ERA001\n", |
| 1761 | + "        processed_rows = pool.starmap(process_row, [((i, row), remove_ex) for i, row in enumerate(df.to_dict(\"records\"))])\n",
1752 | 1762 | "\n", |
1753 | 1763 | " # Create new dataframe with processed rows\n", |
1754 | | - " processed_df = pd.DataFrame(processed_rows) #noqa: ERA001\n", |
| 1764 | + " processed_df = pd.DataFrame(processed_rows)\n", |
1755 | 1765 | "\n", |
1756 | 1766 | " # Save processed dataset\n", |
1757 | | - " processed_df.to_parquet(output_path) #noqa: ERA001\n", |
| 1767 | + " processed_df.to_parquet(output_path)\n", |
1758 | 1768 | "\n", |
1759 | 1769 | "\n", |
1760 | 1770 | "# Example usage\n", |
|
2136 | 2146 | "id": "d03de3ab", |
2137 | 2147 | "metadata": {}, |
2138 | 2148 | "source": [ |
2139 | | - "Now, let's perform perplexity filtering using a KenLM model trained on wikipedia data. NeMo Curator does not support KenLM filtering out of the box, instead we will use pre-trained KenLM models hosted on [HuggingFace](https://huggingface.co/edugp/kenlm/tree/main/wikipedia) to generate perplexity scores for every document and filter based on a threshold." |
| 2149 | + "Now, let's perform perplexity filtering using a KenLM model trained on Wikipedia data. NeMo Curator does not support KenLM filtering out of the box; instead, we will use pre-trained KenLM models hosted on [Hugging Face](https://huggingface.co/edugp/kenlm/tree/main/wikipedia) to generate a perplexity score for every document and filter based on a threshold."
2140 | 2150 | ] |
2141 | 2151 | }, |
2142 | 2152 | { |
|
2152 | 2162 | }, |
2153 | 2163 | { |
2154 | 2164 | "cell_type": "code", |
2155 | | - "execution_count": 21, |
| 2165 | + "execution_count": null, |
2156 | 2166 | "id": "bf2829cc", |
2157 | 2167 | "metadata": {}, |
2158 | 2168 | "outputs": [], |
2159 | 2169 | "source": [ |
2160 | 2170 | "models_dir = os.path.join(cur_dir, \"models\", \"wikipedia\")\n", |
2161 | 2171 | "os.makedirs(models_dir, exist_ok=True)\n", |
| 2172 | + "\n", |
2162 | 2173 | "# Download KenLM pre-trained models\n", |
2163 | | - "# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.arpa.bin -P {models_dir}\n", |
2164 | | - "# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.sp.model -P {models_dir}\n", |
2165 | | - "# !wget https://huggingface.co/edugp/kenlm/blob/main/wikipedia/en.sp.vocab -P {models_dir}" |
| 2174 | + "!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.arpa.bin -P {models_dir}\n", |
| 2175 | + "!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.sp.model -P {models_dir}\n", |
| 2176 | + "!wget https://huggingface.co/edugp/kenlm/resolve/main/wikipedia/en.sp.vocab -P {models_dir}" |
2166 | 2177 | ] |
2167 | 2178 | }, |
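| | + {
| | +  "cell_type": "markdown",
| | +  "id": "kenlm-perplexity-sketch-md",
| | +  "metadata": {},
| | +  "source": [
| | +   "To make the scoring step concrete, the next cell is a minimal sketch of how a perplexity score can be computed directly from the downloaded files. It is not the NeMo Curator code path: it assumes the `kenlm` and `sentencepiece` Python packages are installed in the environment, reuses `models_dir` from the cell above, and scores a placeholder `sample_text` string."
| | +  ]
| | + },
| | + {
| | +  "cell_type": "code",
| | +  "execution_count": null,
| | +  "id": "kenlm-perplexity-sketch",
| | +  "metadata": {},
| | +  "outputs": [],
| | +  "source": [
| | +   "# Minimal sketch (assumes the kenlm and sentencepiece packages are installed)\n",
| | +   "import kenlm\n",
| | +   "import sentencepiece as spm\n",
| | +   "\n",
| | +   "# Load the SentencePiece tokenizer and the KenLM model downloaded above\n",
| | +   "sp = spm.SentencePieceProcessor(model_file=os.path.join(models_dir, \"en.sp.model\"))\n",
| | +   "lm = kenlm.Model(os.path.join(models_dir, \"en.arpa.bin\"))\n",
| | +   "\n",
| | +   "sample_text = \"The quick brown fox jumps over the lazy dog.\"  # placeholder document\n",
| | +   "\n",
| | +   "# KenLM scores whitespace-separated tokens, so join the SentencePiece pieces with spaces\n",
| | +   "tokens = \" \".join(sp.encode(sample_text, out_type=str))\n",
| | +   "\n",
| | +   "# Lower perplexity means the text looks more like the Wikipedia training data\n",
| | +   "print(lm.perplexity(tokens))"
| | +  ]
| | + },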
2168 | 2179 | { |
|
2202 | 2213 | }, |
2203 | 2214 | { |
2204 | 2215 | "cell_type": "code", |
2205 | | - "execution_count": 23, |
| 2216 | + "execution_count": null, |
2206 | 2217 | "id": "b21f0720", |
2207 | 2218 | "metadata": {}, |
2208 | 2219 | "outputs": [], |
2209 | 2220 | "source": [ |
2210 | 2221 | "# Input\n", |
2211 | | - "PF_input_data_dir = os.path.join(\"/workspace/nemotron-cc/data/heuristic_filtering/data/hq.parquet\", \"result.parquet\")\n", |
| 2222 | + "PF_input_data_dir = kept_document_dir\n", |
2212 | 2223 | "input_file_type = \"parquet\"\n", |
2213 | 2224 | "batch_size = 1\n", |
2214 | 2225 | "\n", |
|
2232 | 2243 | }, |
2233 | 2244 | { |
2234 | 2245 | "cell_type": "code", |
2235 | | - "execution_count": 61, |
| 2246 | + "execution_count": null, |
2236 | 2247 | "id": "4a844995", |
2237 | 2248 | "metadata": {}, |
2238 | 2249 | "outputs": [ |
|
2274 | 2285 | ")\n", |
2275 | 2286 | "\n", |
2276 | 2287 | "perplexity_filter = ScoreFilter(\n", |
2277 | | - " PerplexityFilter(threshold=100000.00),\n", |
| 2288 | + " PerplexityFilter(threshold=threshold),\n", |
2278 | 2289 | " text_field=\"text\",\n", |
2279 | 2290 | " score_field=\"perplexity_score\",\n", |
2280 | 2291 | ")\n", |
|
2567 | 2578 | "metadata": {}, |
2568 | 2579 | "outputs": [], |
2569 | 2580 | "source": [ |
2570 | | - "input_dataset = DocumentDataset.read_parquet(\n", |
2571 | | - " \"/workspace/nemotron-cc/data/perplexity_filtering/data/hq.parquet/result.parquet\", backend=\"cudf\"\n", |
2572 | | - ")" |
| 2581 | + "input_dataset = DocumentDataset.read_parquet(kept_document_dir, backend=\"cudf\")" |
2573 | 2582 | ] |
2574 | 2583 | }, |
2575 | 2584 | { |
|