|
30 | 30 | CountVectorizerModel,
|
31 | 31 | OneHotEncoder,
|
32 | 32 | OneHotEncoderModel,
|
| 33 | + FeatureHasher, |
33 | 34 | HashingTF,
|
34 | 35 | IDF,
|
| 36 | + IDFModel, |
35 | 37 | NGram,
|
36 | 38 | RFormula,
|
37 | 39 | Tokenizer,
|
|
66 | 68 | from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
|
67 | 69 | from pyspark.sql import Row
|
68 | 70 | from pyspark.testing.utils import QuietTest
|
69 |
| -from pyspark.testing.mlutils import check_params, SparkSessionTestCase |
| 71 | +from pyspark.testing.mlutils import SparkSessionTestCase |
70 | 72 |
|
71 | 73 |
|
72 | 74 | class FeatureTestsMixin:
|
@@ -842,22 +844,41 @@ def test_bucketizer(self):
|
842 | 844 | self.assertEqual(str(bucketizer), str(bucketizer2))
|
843 | 845 |
|
def test_idf(self):
    """Fit IDF on a small dense "tf" column; check the model, its output and save/load."""
    tf_rows = [
        (DenseVector(values),)
        for values in ([1.0, 2.0], [0.0, 1.0], [3.0, 0.2])
    ]
    frame = self.spark.createDataFrame(tf_rows, ["tf"])

    estimator = IDF(inputCol="tf")
    expected_params = [estimator.inputCol, estimator.minDocFreq, estimator.outputCol]
    self.assertListEqual(estimator.params, expected_params)

    # The output column is supplied as a fit-time param override, not set on the estimator.
    fitted = estimator.fit(frame, {estimator.outputCol: "idf"})
    # NOTE(review): an assertion that the model inherits the estimator's UID
    # ("Model should inherit the UID from its parent estimator.") is currently
    # disabled in this test -- confirm whether it should be restored.
    idf_matches = np.allclose(
        fitted.idf.toArray(), [0.28768207245178085, 0.0], atol=1e-4
    )
    self.assertTrue(idf_matches, fitted.idf)

    self.assertEqual(fitted.docFreq, [2, 3])
    self.assertEqual(fitted.numDocs, 3)

    transformed = fitted.transform(frame)
    self.assertEqual(transformed.columns, ["tf", "idf"])
    self.assertIsNotNone(transformed.head().idf)

    # Persistence round-trip: the estimator and the model are written to the same
    # temporary directory in turn, each write overwriting the previous contents.
    with tempfile.TemporaryDirectory(prefix="idf") as tmp_dir:
        estimator.write().overwrite().save(tmp_dir)
        reloaded_estimator = IDF.load(tmp_dir)
        self.assertEqual(str(estimator), str(reloaded_estimator))

        fitted.write().overwrite().save(tmp_dir)
        reloaded_model = IDFModel.load(tmp_dir)
        self.assertEqual(str(fitted), str(reloaded_model))
861 | 882 |
|
862 | 883 | def test_ngram(self):
|
863 | 884 | dataset = self.spark.createDataFrame([Row(input=["a", "b", "c", "d", "e"])])
|
@@ -1149,26 +1170,63 @@ def test_vector_size_hint(self):
|
1149 | 1170 | expected = DenseVector([0.0, 10.0, 0.5])
|
1150 | 1171 | self.assertEqual(output, expected)
|
1151 | 1172 |
|
1152 |
| - def test_apply_binary_term_freqs(self): |
def test_feature_hasher(self):
    """Hash four mixed-type columns into two features; check params, output and save/load."""
    input_cols = ["real", "bool", "stringNum", "string"]
    rows = [(2.0, True, "1", "foo"), (3.0, False, "2", "bar")]
    frame = self.spark.createDataFrame(rows, input_cols)

    hasher = FeatureHasher(numFeatures=2)
    hasher.setInputCols(input_cols)
    hasher.setOutputCol("features")

    # Each configured param must be readable back through its getter.
    self.assertEqual(hasher.getNumFeatures(), 2)
    self.assertEqual(hasher.getInputCols(), input_cols)
    self.assertEqual(hasher.getOutputCol(), "features")

    hashed = hasher.transform(frame)
    # The transformer appends its output column after the untouched input columns.
    self.assertEqual(hashed.columns, [*input_cols, "features"])
    self.assertEqual(hashed.count(), 2)

    first_row_features = hashed.head().features.toArray()
    self.assertTrue(
        np.allclose(first_row_features, [2.0, 3.0], atol=1e-4),
        first_row_features,
    )

    # Persistence round-trip: the reloaded transformer must print identically.
    with tempfile.TemporaryDirectory(prefix="feature_hasher") as tmp_dir:
        hasher.write().overwrite().save(tmp_dir)
        reloaded = FeatureHasher.load(tmp_dir)
        self.assertEqual(str(hasher), str(reloaded))
| 1201 | + |
def test_hashing_tf(self):
    """Run a binary HashingTF over one document; check params, output vector and save/load."""
    frame = self.spark.createDataFrame(
        [(0, ["a", "a", "b", "c", "c", "c"])],
        ["id", "words"],
    )

    hashing_tf = HashingTF()
    (
        hashing_tf.setInputCol("words")
        .setOutputCol("features")
        .setNumFeatures(10)
        .setBinary(True)
    )
    self.assertEqual(hashing_tf.getInputCol(), "words")
    self.assertEqual(hashing_tf.getOutputCol(), "features")
    self.assertEqual(hashing_tf.getNumFeatures(), 10)
    self.assertTrue(hashing_tf.getBinary())

    hashed = hashing_tf.transform(frame)
    self.assertEqual(hashed.columns, ["id", "words", "features"])
    self.assertEqual(hashed.count(), 1)

    # With binary=True the expected vector holds 1.0 at indices 5, 7 and 8 only,
    # even though the input words repeat.
    expected = [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]
    features = hashed.select("features").first().features.toArray()
    self.assertTrue(np.allclose(features, expected, atol=1e-4), features)

    # Persistence round-trip: the reloaded transformer must print identically.
    with tempfile.TemporaryDirectory(prefix="hashing_tf") as tmp_dir:
        hashing_tf.write().overwrite().save(tmp_dir)
        reloaded = HashingTF.load(tmp_dir)
        self.assertEqual(str(hashing_tf), str(reloaded))
1172 | 1230 |
|
1173 | 1231 |
|
1174 | 1232 | class FeatureTests(FeatureTestsMixin, SparkSessionTestCase):
|
|
0 commit comments