2727import dask_cudf
2828import numpy as np
2929
30- from nemo_curator ._compat import MINHASH_DEPRECATED_API , MINHASH_PERMUTED_AVAILABLE
3130from nemo_curator .datasets import DocumentDataset
3231from nemo_curator .log import create_logger
3332from nemo_curator .utils .distributed_utils import performance_report_if_with_ts_suffix
@@ -71,14 +70,11 @@ def __init__( # noqa: PLR0913
7170 self .num_hashes = num_hashes
7271 self .char_ngram = char_ngrams
7372
74- if MINHASH_DEPRECATED_API :
75- self .seeds = self .generate_seeds (n_seeds = self .num_hashes , seed = seed )
76- else :
77- self .seeds = self .generate_hash_permutation_seeds (
78- bit_width = 64 if use_64bit_hash else 32 ,
79- n_permutations = self .num_hashes ,
80- seed = seed ,
81- )
73+ self .seeds = self .generate_hash_permutation_seeds (
74+ bit_width = 64 if use_64bit_hash else 32 ,
75+ n_permutations = self .num_hashes ,
76+ seed = seed ,
77+ )
8278
8379 self .minhash_method = self .minhash64 if use_64bit_hash else self .minhash32
8480 self .id_field = id_field
@@ -98,13 +94,6 @@ def __init__( # noqa: PLR0913
9894 else :
9995 self ._logger = logger
10096
101- def generate_seeds (self , n_seeds : int = 260 , seed : int = 0 ) -> np .ndarray :
102- """
103- Generate seeds for all minhash permutations based on the given seed.
104- """
105- gen = np .random .RandomState (seed )
106- return gen .randint (0 , 1e6 , size = n_seeds )
107-
10897 def generate_hash_permutation_seeds (self , bit_width : int , n_permutations : int = 260 , seed : int = 0 ) -> np .ndarray :
10998 """
11099 Generate seeds for all minhash permutations based on the given seed.
@@ -141,24 +130,10 @@ def minhash32(self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int) -> cud
141130 msg = "Expected data of type cudf.Series"
142131 raise TypeError (msg )
143132
144- if MINHASH_DEPRECATED_API :
145- warnings .warn (
146- "Using an outdated minhash implementation, please update to cuDF version 24.12 "
147- "or later for improved performance. "
148- "Install the latest version of cuDF using `pip install curator[cuda12x_nightly]`" ,
149- category = FutureWarning ,
150- stacklevel = 2 ,
151- )
152- seeds = cudf .Series (seeds , dtype = "uint32" )
153- return ser .str .minhash (seeds = seeds , width = char_ngram )
154- else :
155- seeds_a = cudf .Series (seeds [:, 0 ], dtype = "uint32" )
156- seeds_b = cudf .Series (seeds [:, 1 ], dtype = "uint32" )
133+ seeds_a = cudf .Series (seeds [:, 0 ], dtype = "uint32" )
134+ seeds_b = cudf .Series (seeds [:, 1 ], dtype = "uint32" )
157135
158- if MINHASH_PERMUTED_AVAILABLE :
159- return ser .str .minhash_permuted (a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram )
160- else :
161- return ser .str .minhash (a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram )
136+ return ser .str .minhash (a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram )
162137
163138 def minhash64 (self , ser : cudf .Series , seeds : np .ndarray , char_ngram : int ) -> cudf .Series :
164139 """
@@ -167,24 +142,9 @@ def minhash64(self, ser: cudf.Series, seeds: np.ndarray, char_ngram: int) -> cud
167142 if not isinstance (ser , cudf .Series ):
168143 msg = "Expected data of type cudf.Series"
169144 raise TypeError (msg )
170- if MINHASH_DEPRECATED_API :
171- warnings .warn (
172- "Using an outdated minhash implementation, please update to cuDF version 24.12 "
173- "or later for improved performance. "
174- "Install the latest version of cuDF using `pip install curator[cuda12x_nightly]`" ,
175- category = FutureWarning ,
176- stacklevel = 2 ,
177- )
178- seeds = cudf .Series (seeds , dtype = "uint64" )
179- return ser .str .minhash64 (seeds = seeds , width = char_ngram )
180- else :
181- seeds_a = cudf .Series (seeds [:, 0 ], dtype = "uint64" )
182- seeds_b = cudf .Series (seeds [:, 1 ], dtype = "uint64" )
183-
184- if MINHASH_PERMUTED_AVAILABLE :
185- return ser .str .minhash64_permuted (a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram )
186- else :
187- return ser .str .minhash64 (a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram )
145+ seeds_a = cudf .Series (seeds [:, 0 ], dtype = "uint64" )
146+ seeds_b = cudf .Series (seeds [:, 1 ], dtype = "uint64" )
147+ return ser .str .minhash64 (a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram )
188148
189149 def __call__ (self , dataset : DocumentDataset ) -> str | DocumentDataset :
190150 """
0 commit comments