File tree Expand file tree Collapse file tree 2 files changed +16
-4
lines changed
Expand file tree Collapse file tree 2 files changed +16
-4
lines changed Original file line number Diff line number Diff line change @@ -1207,10 +1207,9 @@ def factorize(
12071207 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
12081208 data = data .cast (pa .int64 ())
12091209
1210- if pa .types .is_dictionary (data .type ):
1211- encoded = data
1212- else :
1213- encoded = data .dictionary_encode (null_encoding = null_encoding )
1210+ if pa .types .is_dictionary (data .type ) and null_encoding == "encode" :
1211+ data = data .cast (data .type .value_type )
1212+ encoded = data .dictionary_encode (null_encoding = null_encoding )
12141213 if encoded .length () == 0 :
12151214 indices = np .array ([], dtype = np .intp )
12161215 uniques = type (self )(pa .chunked_array ([], type = encoded .type .value_type ))
Original file line number Diff line number Diff line change @@ -3329,6 +3329,19 @@ def test_factorize_chunked_dictionary():
33293329 tm .assert_index_equal (res_uniques , exp_uniques )
33303330
33313331
3332+ def test_factorize_dictionary_with_na ():
3333+ # factorize() on a dictionary-typed Arrow array holding one value and one NA: checks both the returned codes and the returned uniques
3334+ arr = pd .array (
3335+ ["a1" , pd .NA ], dtype = ArrowDtype (pa .dictionary (pa .int32 (), pa .utf8 ()))
3336+ )
3337+ # With use_na_sentinel=False the NA is expected to get its own code (1) and to show up in uniques as a null, with uniques typed as ArrowDtype(pa.string())
3338+ indices , uniques = arr .factorize (use_na_sentinel = False )
3339+ expected_indices = np .array ([0 , 1 ], dtype = np .intp )
3340+ expected_uniques = pd .array (["a1" , None ], dtype = ArrowDtype (pa .string ()))
3341+ tm .assert_numpy_array_equal (indices , expected_indices )
3342+ tm .assert_extension_array_equal (uniques , expected_uniques )
3343+
3344+
33323345def test_dictionary_astype_categorical ():
33333346 # GH#56672
33343347 arrs = [
You can’t perform that action at this time.
0 commit comments