[SPARK-46065][PS] Refactor (DataFrame|Series).factorize() to use create_map

itholic · HyukjinKwon · commit 4c36ca3f6577 · 2023-11-23T20:36:54.000+09:00
### What changes were proposed in this pull request?

This PR refactors `(DataFrame|Series).factorize()` to use `create_map`.

### Why are the changes needed?

To improve performance by using the official `create_map` API instead of a hand-built chain of `when` expressions, and to improve readability.

### Does this PR introduce _any_ user-facing change?

No, it is an internal refactoring.

### How was this patch tested?

The existing CI should pass.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #43970 from itholic/refactor_factorize.

Authored-by: Haejoon Lee <haejoon.lee@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
diff --git a/python/pyspark/pandas/base.py b/python/pyspark/pandas/base.py
@@ -1672,16 +1672,9 @@ def factorize(
         if len(kvs) == 0:  # uniques are all missing values
             new_scol = F.lit(na_sentinel_code)
         else:
+            map_scol = F.create_map(*kvs)
             null_scol = F.when(self.isnull().spark.column, F.lit(na_sentinel_code))
-            mapped_scol = None
-            for i in range(0, len(kvs), 2):
-                key = kvs[i]
-                value = kvs[i + 1]
-                if mapped_scol is None:
-                    mapped_scol = F.when(self.spark.column == key, value)
-                else:
-                    mapped_scol = mapped_scol.when(self.spark.column == key, value)
-            new_scol = null_scol.otherwise(mapped_scol)
+            new_scol = null_scol.otherwise(map_scol[self.spark.column])
 
         codes = self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0]))