- 
                Notifications
    
You must be signed in to change notification settings  - Fork 669
 
FIX-#4522: Correct multiindex metadata with groupby #4523
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 7 commits
13395f0
              8040915
              6e8466e
              95f5510
              d2e8c5f
              a5bb81e
              5a11af0
              75c6089
              177bfe2
              a547d32
              3ce19ea
              0e4a521
              639a201
              d2c7295
              55ec3a1
              2e9a056
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -415,7 +415,6 @@ def groupby( | |
| ) | ||
| else: | ||
| squeeze = False | ||
| 
     | 
||
| axis = self._get_axis_number(axis) | ||
| idx_name = None | ||
| # Drop here indicates whether or not to drop the data column before doing the | ||
| 
        
          
        
         | 
    @@ -424,7 +423,16 @@ def groupby( | |
| # strings is passed in, the data used for the groupby is dropped before the | ||
| # groupby takes place. | ||
| drop = False | ||
| 
     | 
||
| # Check that there is no ambiguity in the parameter we were given. | ||
| _by_check = by if is_list_like(by) else [by] | ||
| for k in _by_check: | ||
| if k in self.index.names and k in self.axes[axis]: | ||
| level_name, index_name = "an index", "a column" | ||
| if axis == 1: | ||
| level_name, index_name = index_name, level_name | ||
| raise ValueError( | ||
                
      
                  RehanSD marked this conversation as resolved.
               
              
                Outdated
          
            Show resolved
            Hide resolved
                
       | 
||
| f"{k} is both {level_name} level and {index_name} label, which is ambiguous." | ||
| ) | ||
| if ( | ||
| not isinstance(by, (pandas.Series, Series)) | ||
| and is_list_like(by) | ||
| 
          
            
          
           | 
    ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| 
          
            
          
           | 
    @@ -1570,7 +1570,7 @@ def test_agg_exceptions(operation): | |
| }, | ||
| ], | ||
| ) | ||
| def test_to_pandas_convertion(kwargs): | ||
| def test_to_pandas_conversion(kwargs): | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Quick fix for a typo I noticed when updating the testing suite!  | 
||
| data = {"a": [1, 2], "b": [3, 4], "c": [5, 6]} | ||
| by = ["a", "b"] | ||
| 
     | 
||
| 
          
            
          
           | 
    @@ -2032,3 +2032,70 @@ def test_sum_with_level(): | |
| } | ||
| modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) | ||
| eval_general(modin_df, pandas_df, lambda df: df.set_index("C").groupby("C").sum()) | ||
| 
     | 
||
| 
     | 
||
| def test_reset_index_groupby(): | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. without context, this test is unclear. Could you add a brief description of what error condition this is testing or link to the gh issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, can do!  | 
||
| frame_data = np.random.randint(97, 198, size=(2**6, 2**4)) | ||
| pandas_df = pandas.DataFrame( | ||
| frame_data, | ||
| index=pandas.MultiIndex.from_tuples( | ||
| [(i // 4, i // 2, i) for i in range(2**6)] | ||
| ), | ||
| ).add_prefix("col") | ||
| pandas_df.index.names = [f"index_{i}" for i in range(len(pandas_df.index.names))] | ||
| # Convert every other column to string | ||
                
       | 
||
| for col in pandas_df.iloc[ | ||
| :, [i for i in range(len(pandas_df.columns)) if i % 2 == 0] | ||
| ]: | ||
| pandas_df[col] = [str(chr(i)) for i in pandas_df[col]] | ||
| 
         There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would be helpful if you could describe the schema for  There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense!  | 
||
| modin_df = from_pandas(pandas_df) | ||
| eval_general( | ||
| modin_df, | ||
| pandas_df, | ||
| lambda df: df.reset_index().groupby(["index_0", "index_1"]).count(), | ||
| ) | ||
| 
     | 
||
| def test_by_in_index_and_columns(): | ||
                
      
                  RehanSD marked this conversation as resolved.
               
          
            Show resolved
            Hide resolved
         | 
||
| pandas_df = pandas.DataFrame( | ||
| [[1, 2, 3]], index=pd.Index([0], name="a"), columns=['a', 'b', 'c'] | ||
| ) | ||
| modin_df = from_pandas(pandas_df) | ||
| eval_general( | ||
| modin_df, | ||
| pandas_df, | ||
| lambda df: df.groupby(by='a').count(), | ||
| raising_exceptions=True, | ||
| check_exception_type=True, | ||
| ) | ||
| eval_general( | ||
| modin_df, | ||
| pandas_df, | ||
| lambda df: df.groupby(by=['a', 'b']).count(), | ||
| raising_exceptions=True, | ||
| check_exception_type=True, | ||
| ) | ||
| pandas_df = pandas.DataFrame( | ||
| [[1, 2, 3]], index=pd.Index([(0, 1)], names=["a", 'b']), columns=['a', 'b', 'c'] | ||
| ) | ||
| modin_df = from_pandas(pandas_df) | ||
| eval_general( | ||
| modin_df, | ||
| pandas_df, | ||
| lambda df: df.groupby(by='a').count(), | ||
| raising_exceptions=True, | ||
| check_exception_type=True, | ||
| ) | ||
| eval_general( | ||
| modin_df, | ||
| pandas_df, | ||
| lambda df: df.groupby(by=['a', 'c']).count(), | ||
| raising_exceptions=True, | ||
| check_exception_type=True, | ||
| ) | ||
| eval_general( | ||
| modin_df, | ||
| pandas_df, | ||
| lambda df: df.groupby(by=['a', 'b']).count(), | ||
| raising_exceptions=True, | ||
| check_exception_type=True, | ||
| ) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: why are we removing the empty line?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will remove!