@@ -24,7 +24,12 @@ def test_map_bit_fingerprint(smallest_smiles_list, smallest_mols_list):
2424
2525
2626def test_map_count_fingerprint (smallest_smiles_list , smallest_mols_list ):
27- map_fp = MAPFingerprint (verbose = 0 , n_jobs = - 1 )
27+ map_fp = MAPFingerprint (
28+ variant = "count" ,
29+ include_duplicated_shingles = True ,
30+ verbose = 0 ,
31+ n_jobs = - 1 ,
32+ )
2833 X_skfp = map_fp .transform (smallest_smiles_list )
2934
3035 X_map = np .stack (
@@ -33,12 +38,12 @@ def test_map_count_fingerprint(smallest_smiles_list, smallest_mols_list):
3338
3439 assert_equal (X_skfp , X_map )
3540 assert_equal (X_skfp .shape , (len (smallest_smiles_list ), map_fp .fp_size ))
36- assert X_skfp .dtype == np .uint8
41+ assert X_skfp .dtype == np .uint32
3742 assert np .all (X_skfp >= 0 )
3843
3944
4045def test_map_raw_hashes_fingerprint (smallest_smiles_list , smallest_mols_list ):
41- map_fp = MAPFingerprint (n_jobs = - 1 )
46+ map_fp = MAPFingerprint (variant = "minhash" , n_jobs = - 1 , random_state = 0 )
4247 X_skfp = map_fp .transform (smallest_smiles_list )
4348
4449 X_map = np .stack (
@@ -52,7 +57,7 @@ def test_map_raw_hashes_fingerprint(smallest_smiles_list, smallest_mols_list):
5257
5358
5459def test_map_sparse_bit_fingerprint (smallest_smiles_list , smallest_mols_list ):
55- map_fp = MAPFingerprint (sparse = True , n_jobs = - 1 )
60+ map_fp = MAPFingerprint (variant = "binary" , sparse = True , n_jobs = - 1 )
5661 X_skfp = map_fp .transform (smallest_smiles_list )
5762
5863 X_map = csr_array (
@@ -69,7 +74,12 @@ def test_map_sparse_bit_fingerprint(smallest_smiles_list, smallest_mols_list):
6974
7075
7176def test_map_sparse_count_fingerprint (smallest_smiles_list , smallest_mols_list ):
72- map_fp = MAPFingerprint (include_duplicated_shingles = True , sparse = True , n_jobs = - 1 )
77+ map_fp = MAPFingerprint (
78+ variant = "count" ,
79+ include_duplicated_shingles = True ,
80+ sparse = True ,
81+ n_jobs = - 1 ,
82+ )
7383 X_skfp = map_fp .transform (smallest_smiles_list )
7484
7585 X_map = csr_array (
@@ -78,38 +88,114 @@ def test_map_sparse_count_fingerprint(smallest_smiles_list, smallest_mols_list):
7888
7989 assert_equal (X_skfp .data , X_map .data )
8090 assert_equal (X_skfp .shape , (len (smallest_smiles_list ), map_fp .fp_size ))
81- assert X_skfp .dtype == np .uint8
91+ assert X_skfp .dtype == np .uint32
8292 assert np .all (X_skfp .data > 0 )
8393
84- map_fp = MAPFingerprint (
85- include_duplicated_shingles = True , sparse = True , count = True , n_jobs = - 1
86- )
94+
def test_map_sparse_raw_hashes_fingerprint(smallest_smiles_list, smallest_mols_list):
    """Check sparse output of a default-variant MAPFingerprint.

    The batch result of ``transform`` is compared against a sparse matrix
    assembled from per-molecule calls to ``_calculate_single_mol_fingerprint``.
    """
    map_fp = MAPFingerprint(sparse=True, n_jobs=-1)
    X_skfp = map_fp.transform(smallest_smiles_list)

    # Reference: one fingerprint per molecule, stacked into a CSR matrix.
    per_mol_fps = [
        map_fp._calculate_single_mol_fingerprint(mol) for mol in smallest_mols_list
    ]
    X_map = csr_array(per_mol_fps, dtype=int)

    expected_shape = (len(smallest_smiles_list), map_fp.fp_size)

    assert_equal(X_skfp.data, X_map.data)
    assert_equal(X_skfp.shape, expected_shape)
    assert np.issubdtype(X_skfp.dtype, np.integer)
97107
98108
99- def test_map_sparse_raw_hashes_fingerprint (smallest_smiles_list , smallest_mols_list ):
100- map_fp = MAPFingerprint (sparse = True , n_jobs = - 1 )
def test_map_sparse_minhash_fingerprint(smallest_smiles_list, smallest_mols_list):
    """Check sparse output of the ``"minhash"`` variant with a fixed seed.

    The batch result of ``transform`` is compared against a ``uint32`` sparse
    matrix assembled from per-molecule calls to
    ``_calculate_single_mol_fingerprint`` on the same fingerprint object.
    """
    map_fp = MAPFingerprint(
        variant="minhash",
        sparse=True,
        n_jobs=-1,
        random_state=0,
    )
    X_skfp = map_fp.transform(smallest_smiles_list)

    X_map = csr_array(
        [map_fp._calculate_single_mol_fingerprint(mol) for mol in smallest_mols_list],
        dtype=np.uint32,
    )

    assert_equal(X_skfp.data, X_map.data)
    assert_equal(X_skfp.shape, (len(smallest_smiles_list), map_fp.fp_size))
    # The exact-dtype check already implies an integer dtype, so no separate
    # np.issubdtype(..., np.integer) assertion is needed.
    assert X_skfp.dtype == np.uint32
111127
112128
def test_map_minhash_same_random_state_is_reproducible(smallest_smiles_list):
    """Two minhash fingerprints built with the same seed must agree."""
    seed = 123
    first_fp = MAPFingerprint(variant="minhash", random_state=seed, n_jobs=-1)
    second_fp = MAPFingerprint(variant="minhash", random_state=seed, n_jobs=-1)

    # Independent objects, identical configuration, identical input.
    assert_equal(
        first_fp.transform(smallest_smiles_list),
        second_fp.transform(smallest_smiles_list),
    )
137+
138+
def test_map_minhash_different_random_state_changes_output(smallest_smiles_list):
    """Minhash fingerprints built with different seeds must not be identical."""
    X_seed_123, X_seed_456 = [
        MAPFingerprint(variant="minhash", random_state=seed, n_jobs=-1).transform(
            smallest_smiles_list
        )
        for seed in (123, 456)
    ]

    outputs_match = np.array_equal(X_seed_123, X_seed_456)
    assert not outputs_match
147+
148+
def test_map_minhash_is_independent_of_input_order_and_batch_size():
    """Each molecule's minhash row must not depend on the rest of the input.

    The same fingerprint object transforms a fixed list of SMILES in its
    original order, in a permuted order, one molecule at a time, and as a
    two-element prefix; the row for a given molecule must be equal each time.
    """
    smiles = [
        "CC(=O)Oc1ccccc1C(=O)O",
        "CCO",
        "c1ccccc1",
        "CCN(CC)CC",
    ]
    map_fp = MAPFingerprint(variant="minhash", random_state=123, n_jobs=-1)
    X_full = map_fp.transform(smiles)

    # Permuted input: row k of the permuted output holds molecule
    # permutation[k] of the original list.
    permutation = [2, 0, 3, 1]
    X_permuted = map_fp.transform([smiles[source_row] for source_row in permutation])
    for permuted_row, source_row in enumerate(permutation):
        assert_equal(X_full[source_row], X_permuted[permuted_row])

    # Singleton calls: each molecule transformed on its own.
    for row, smi in enumerate(smiles):
        X_single = map_fp.transform([smi])
        assert_equal(X_full[row], X_single[0])

    # A shorter prefix of the same input.
    prefix_len = 2
    X_prefix = map_fp.transform(smiles[:prefix_len])
    assert_equal(X_full[:prefix_len], X_prefix)
177+
178+
def test_map_binary_ignores_random_state(smallest_smiles_list):
    """The ``"binary"`` variant must give equal output for different seeds."""
    outputs = [
        MAPFingerprint(variant="binary", random_state=seed, n_jobs=-1).transform(
            smallest_smiles_list
        )
        for seed in (123, 456)
    ]

    assert_equal(outputs[0], outputs[1])
187+
188+
def test_map_count_ignores_random_state(smallest_smiles_list):
    """The ``"count"`` variant must give equal output for different seeds."""
    fp_seed_123 = MAPFingerprint(variant="count", random_state=123, n_jobs=-1)
    fp_seed_456 = MAPFingerprint(variant="count", random_state=456, n_jobs=-1)

    X_seed_123 = fp_seed_123.transform(smallest_smiles_list)
    X_seed_456 = fp_seed_456.transform(smallest_smiles_list)

    assert_equal(X_seed_123, X_seed_456)
197+
198+
113199def test_map_chirality (smallest_mols_list ):
114200 # smoke test, this should not throw an error
115201 map_fp = MAPFingerprint (include_chirality = True , n_jobs = - 1 )
0 commit comments