@@ -162,21 +162,28 @@ def test_merge_index_singlekey_inner(self):
162
162
{
163
163
"key" : ["a" , "b" , "c" , "d" , "e" , "e" , "a" ],
164
164
"v1" : np .random .default_rng (2 ).standard_normal (7 ),
165
+ "left_index" : range (7 ),
165
166
}
166
167
)
168
+
167
169
right = DataFrame (
168
170
{"v2" : np .random .default_rng (2 ).standard_normal (4 )},
169
171
index = ["d" , "b" , "c" , "a" ],
170
172
)
171
173
172
174
# inner join
173
175
result = merge (left , right , left_on = "key" , right_index = True , how = "inner" )
174
- expected = left .join (right , on = "key" ).loc [result .index ]
175
- tm .assert_frame_equal (result , expected )
176
+ expected = left .join (right , on = "key" ).loc [result ["left_index" ]]
177
+ tm .assert_frame_equal (
178
+ result .reset_index (drop = True ), expected .reset_index (drop = True )
179
+ )
176
180
177
181
result = merge (right , left , right_on = "key" , left_index = True , how = "inner" )
178
- expected = left .join (right , on = "key" ).loc [result .index ]
179
- tm .assert_frame_equal (result , expected .loc [:, result .columns ])
182
+ expected = left .join (right , on = "key" ).loc [result ["left_index" ]]
183
+ tm .assert_frame_equal (
184
+ result .reset_index (drop = True ),
185
+ expected .loc [:, result .columns ].reset_index (drop = True ),
186
+ )
180
187
181
188
def test_merge_misspecified (self , df , df2 , left ):
182
189
right = DataFrame (
@@ -349,8 +356,9 @@ def test_handle_join_key_pass_array(self):
349
356
right = DataFrame ({"rvalue" : np .arange (6 )})
350
357
351
358
key = np .array ([0 , 1 , 1 , 2 , 2 , 3 ], dtype = np .int64 )
359
+ index = np .array ([0 , 1 , 1 , 2 , 2 , np .nan ], dtype = np .float64 )
352
360
merged = merge (left , right , left_index = True , right_on = key , how = "outer" )
353
- tm .assert_series_equal (merged ["key_0" ], Series (key , name = "key_0" ))
361
+ tm .assert_series_equal (merged ["key_0" ], Series (key , index = index , name = "key_0" ))
354
362
355
363
def test_no_overlap_more_informative_error (self ):
356
364
dt = datetime .now ()
@@ -453,6 +461,9 @@ def test_merge_left_empty_right_notempty(self):
453
461
)
454
462
exp_in = exp_out [0 :0 ] # make empty DataFrame keeping dtype
455
463
464
+ exp_nan = exp_out .copy ()
465
+ exp_nan .index = [np .nan ] * 3
466
+
456
467
def check1 (exp , kwarg ):
457
468
result = merge (left , right , how = "inner" , ** kwarg )
458
469
tm .assert_frame_equal (result , exp )
@@ -465,12 +476,13 @@ def check2(exp, kwarg):
465
476
result = merge (left , right , how = "outer" , ** kwarg )
466
477
tm .assert_frame_equal (result , exp )
467
478
468
- for kwarg in [
469
- {"left_index" : True , "right_index" : True },
470
- {"left_index" : True , "right_on" : "x" },
471
- ]:
472
- check1 (exp_in , kwarg )
473
- check2 (exp_out , kwarg )
479
+ kwarg = {"left_index" : True , "right_on" : "x" }
480
+ check1 (exp_in , kwarg )
481
+ check2 (exp_nan , kwarg )
482
+
483
+ kwarg = {"left_index" : True , "right_index" : True }
484
+ check1 (exp_in , kwarg )
485
+ check2 (exp_out , kwarg )
474
486
475
487
kwarg = {"left_on" : "a" , "right_index" : True }
476
488
check1 (exp_in , kwarg )
@@ -762,6 +774,7 @@ def test_other_datetime_unit(self, unit):
762
774
"days" : days ,
763
775
},
764
776
columns = ["entity_id" , "days" ],
777
+ index = [101 , 102 ],
765
778
)
766
779
assert exp ["days" ].dtype == exp_dtype
767
780
tm .assert_frame_equal (result , exp )
@@ -789,6 +802,7 @@ def test_other_timedelta_unit(self, unit):
789
802
exp = DataFrame (
790
803
{"entity_id" : [101 , 102 ], "days" : np .array (["nat" , "nat" ], dtype = dtype )},
791
804
columns = ["entity_id" , "days" ],
805
+ index = [101 , 102 ],
792
806
)
793
807
tm .assert_frame_equal (result , exp )
794
808
@@ -1190,7 +1204,7 @@ def test_validation(self):
1190
1204
"c" : ["meow" , "bark" , "um... weasel noise?" , "nay" ],
1191
1205
},
1192
1206
columns = ["b" , "a" , "c" ],
1193
- index = range ( 4 ),
1207
+ index = Index ([ "a" , "b" , "c" , "d" ], name = "a" ),
1194
1208
)
1195
1209
1196
1210
left_index_reset = left .set_index ("a" )
@@ -1331,48 +1345,17 @@ def test_merge_two_empty_df_no_division_error(self):
1331
1345
1332
1346
@pytest .mark .parametrize ("how" , ["right" , "outer" ])
1333
1347
@pytest .mark .parametrize (
1334
- "index,expected_index " ,
1348
+ "index" ,
1335
1349
[
1336
- (
1337
- CategoricalIndex ([1 , 2 , 4 ]),
1338
- CategoricalIndex ([1 , 2 , 4 , None , None , None ]),
1339
- ),
1340
- (
1341
- DatetimeIndex (
1342
- ["2001-01-01" , "2002-02-02" , "2003-03-03" ], dtype = "M8[ns]"
1343
- ),
1344
- DatetimeIndex (
1345
- ["2001-01-01" , "2002-02-02" , "2003-03-03" , pd .NaT , pd .NaT , pd .NaT ],
1346
- dtype = "M8[ns]" ,
1347
- ),
1348
- ),
1349
- * [
1350
- (
1351
- Index ([1 , 2 , 3 ], dtype = dtyp ),
1352
- Index ([1 , 2 , 3 , None , None , None ], dtype = np .float64 ),
1353
- )
1354
- for dtyp in tm .ALL_REAL_NUMPY_DTYPES
1355
- ],
1356
- (
1357
- IntervalIndex .from_tuples ([(1 , 2 ), (2 , 3 ), (3 , 4 )]),
1358
- IntervalIndex .from_tuples (
1359
- [(1 , 2 ), (2 , 3 ), (3 , 4 ), np .nan , np .nan , np .nan ]
1360
- ),
1361
- ),
1362
- (
1363
- PeriodIndex (["2001-01-01" , "2001-01-02" , "2001-01-03" ], freq = "D" ),
1364
- PeriodIndex (
1365
- ["2001-01-01" , "2001-01-02" , "2001-01-03" , pd .NaT , pd .NaT , pd .NaT ],
1366
- freq = "D" ,
1367
- ),
1368
- ),
1369
- (
1370
- TimedeltaIndex (["1D" , "2D" , "3D" ]),
1371
- TimedeltaIndex (["1D" , "2D" , "3D" , pd .NaT , pd .NaT , pd .NaT ]),
1372
- ),
1350
+ CategoricalIndex ([1 , 2 , 4 ]),
1351
+ DatetimeIndex (["2001-01-01" , "2002-02-02" , "2003-03-03" ], dtype = "M8[ns]" ),
1352
+ * [Index ([1 , 2 , 3 ], dtype = dtyp ) for dtyp in tm .ALL_REAL_NUMPY_DTYPES ],
1353
+ IntervalIndex .from_tuples ([(1 , 2 ), (2 , 3 ), (3 , 4 )]),
1354
+ PeriodIndex (["2001-01-01" , "2001-01-02" , "2001-01-03" ], freq = "D" ),
1355
+ TimedeltaIndex (["1D" , "2D" , "3D" ]),
1373
1356
],
1374
1357
)
1375
- def test_merge_on_index_with_more_values (self , how , index , expected_index ):
1358
+ def test_merge_on_index_with_more_values (self , how , index ):
1376
1359
# GH 24212
1377
1360
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
1378
1361
# -1 is interpreted as a missing value instead of the last element
@@ -1390,20 +1373,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
1390
1373
],
1391
1374
columns = ["a" , "key" , "b" ],
1392
1375
)
1393
- expected .set_index (expected_index , inplace = True )
1394
1376
tm .assert_frame_equal (result , expected )
1395
1377
1396
1378
def test_merge_right_index_right(self):
    # Regression test for GH-24897.
    # Right-join of left["key"] against right's index.  The expected frame
    # below carries the index [0, 1, 1, 2], i.e. the same values as its
    # expected "key" column, and no NaN label.
    left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]})
    right = DataFrame({"b": [1, 2, 3]})

    expected = DataFrame(
        {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]},
        columns=["a", "key", "b"],
        index=[0, 1, 1, 2],
    )
    result = left.merge(right, left_on="key", right_index=True, how="right")
    tm.assert_frame_equal(result, expected)
@@ -1436,7 +1416,7 @@ def test_merge_take_missing_values_from_index_of_other_dtype(self):
1436
1416
"key" : Categorical (["a" , "a" , "b" , "c" ]),
1437
1417
"b" : [1 , 1 , 2 , 3 ],
1438
1418
},
1439
- index = [ 0 , 1 , 2 , np . nan ] ,
1419
+ index = Categorical ([ "a" , "a" , "b" , "c" ], categories = list ( "abc" )) ,
1440
1420
)
1441
1421
expected = expected .reindex (columns = ["a" , "key" , "b" ])
1442
1422
tm .assert_frame_equal (result , expected )
@@ -2661,7 +2641,8 @@ def test_merge_right_left_index():
2661
2641
"z_x" : ["foo" , "foo" ],
2662
2642
"x_y" : [1 , 1 ],
2663
2643
"z_y" : ["foo" , "foo" ],
2664
- }
2644
+ },
2645
+ index = [1 , 1 ],
2665
2646
)
2666
2647
tm .assert_frame_equal (result , expected )
2667
2648
@@ -2670,7 +2651,7 @@ def test_merge_result_empty_index_and_on():
2670
2651
# GH#33814
2671
2652
df1 = DataFrame ({"a" : [1 ], "b" : [2 ]}).set_index (["a" , "b" ])
2672
2653
df2 = DataFrame ({"b" : [1 ]}).set_index (["b" ])
2673
- expected = DataFrame ({"a" : [], " b" : []}, dtype = np .int64 ).set_index (["a" , "b" ])
2654
+ expected = DataFrame ({"b" : []}, dtype = np .int64 ).set_index (["b" ])
2674
2655
result = merge (df1 , df2 , left_on = ["b" ], right_index = True )
2675
2656
tm .assert_frame_equal (result , expected )
2676
2657
@@ -2850,7 +2831,9 @@ def test_merge_multiindex_single_level():
2850
2831
data = {"b" : [100 ]},
2851
2832
index = MultiIndex .from_tuples ([("A" ,), ("C" ,)], names = ["col" ]),
2852
2833
)
2853
- expected = DataFrame ({"col" : ["A" , "B" ], "b" : [100 , np .nan ]})
2834
+ expected = DataFrame (
2835
+ {"col" : ["A" , "B" ], "b" : [100 , np .nan ]}, index = Index ([("A" ,), np .nan ])
2836
+ )
2854
2837
2855
2838
result = df .merge (df2 , left_on = ["col" ], right_index = True , how = "left" )
2856
2839
tm .assert_frame_equal (result , expected )
@@ -2957,14 +2940,20 @@ def test_merge_ea_int_and_float_numpy():
2957
2940
tm .assert_frame_equal (result , expected .astype ("float64" ))
2958
2941
2959
2942
2943
+ from pandas .core .dtypes .missing import na_value_for_dtype
2944
+
2945
+
2960
2946
def test_merge_arrow_string_index(any_string_dtype):
    # GH#54894
    # Left-join of left["a"] against a string-typed right index; only "a"
    # has a match, so "b" is [1.0, NaN] and the expected index is "a"
    # followed by the NA value of the string dtype.
    from pandas.api.types import pandas_dtype

    pytest.importorskip("pyarrow")
    left = DataFrame({"a": ["a", "b"]}, dtype=any_string_dtype)
    right = DataFrame({"b": 1}, index=Index(["a", "c"], dtype=any_string_dtype))
    result = left.merge(right, left_on="a", right_index=True, how="left")

    # NOTE(review): the fixture presumably yields dtype *strings* (e.g.
    # "object") - confirm.  na_value_for_dtype expects a dtype object, so
    # normalize first; pandas_dtype returns dtype objects unchanged.
    missing = na_value_for_dtype(pandas_dtype(any_string_dtype))

    expected = DataFrame(
        {"a": Series(["a", "b"], dtype=any_string_dtype), "b": [1.0, np.nan]},
    )
    expected.index = Index(["a"], dtype=any_string_dtype).append(Index([missing]))
    tm.assert_frame_equal(result, expected)
2970
2959
@@ -3022,3 +3011,12 @@ def test_merge_on_all_nan_column():
3022
3011
{"x" : [1 , 2 , 3 ], "y" : [np .nan , np .nan , np .nan ], "z" : [4 , 5 , 6 ], "zz" : [4 , 5 , 6 ]}
3023
3012
)
3024
3013
tm .assert_frame_equal (result , expected )
3014
+
3015
+
3016
def test_merge_index():
    # GH 57291
    # Left-join dfa (by its index) against dfb["key"]; only keys 5..9 find a
    # partner.  The result's index must be exactly dfa's index.
    dfa = DataFrame(range(10), columns=["a"])
    dfb = DataFrame({"b": range(5), "key": [5 + x for x in range(5)]})

    result = dfa.merge(dfb, left_index=True, right_on="key", how="left")
    tm.assert_index_equal(result.index, dfa.index)
0 commit comments