manage ugly name collisions

nicolaskruchten · nicolaskruchten · commit f73383101c49 · 2020-04-30T16:33:24.000-04:00
diff --git a/packages/python/plotly/plotly/express/_core.py b/packages/python/plotly/plotly/express/_core.py
@@ -970,27 +970,24 @@ def _isinstance_listlike(x):
         return True
 
 
-def process_args_into_dataframe(args, wide_mode, var_name):
+def _escape_col_name(df_input, col_name):  # returns col_name, "_"-prefixed until it is not a column of df_input
+    while df_input is not None and col_name in df_input.columns:  # df_input None -> name returned unchanged
+        col_name = "_" + col_name
+    return col_name
+
+
+def process_args_into_dataframe(args, wide_mode, var_name, value_name):
     """
     After this function runs, the `all_attrables` keys of `args` all contain only
     references to columns of `df_output`. This function handles the extraction of data
     from `args["attrable"]` and column-name-generation as appropriate, and adds the
     data to `df_output` and then replaces `args["attrable"]` with the appropriate
     reference.
     """
-    for field in args:
-        if field in array_attrables and args[field] is not None:
-            args[field] = (
-                OrderedDict(args[field])
-                if isinstance(args[field], dict)
-                else list(args[field])
-            )
-    # Cast data_frame argument to DataFrame (it could be a numpy array, dict etc.)
-    df_provided = args["data_frame"] is not None
-    if df_provided and not isinstance(args["data_frame"], pd.DataFrame):
-        args["data_frame"] = pd.DataFrame(args["data_frame"])
+
     df_input = args["data_frame"]
     df_provided = df_input is not None
+
     df_output = pd.DataFrame()
     constants = dict()
     ranges = list()
@@ -1083,7 +1080,7 @@ def process_args_into_dataframe(args, wide_mode, var_name):
                     )
                 # Check validity of column name
                 if argument not in df_input.columns:
-                    if wide_mode and argument in ("value", var_name):
+                    if wide_mode and argument in (value_name, var_name):
                         continue
                     else:
                         err_msg = (
@@ -1205,10 +1202,11 @@ def build_dataframe(args, constructor):
     wide_y = False if no_y else _is_col_list(df_input, args["y"])
 
     wide_mode = False
-    var_name = None
+    var_name = None  # will likely be "variable" in wide_mode
+    wide_cross_name = None  # will likely be "index" in wide_mode
+    value_name = "value"
     hist2d_types = [go.Histogram2d, go.Histogram2dContour]
     if constructor in cartesians:
-        wide_cross_name = None
         if wide_x and wide_y:
             raise ValueError(
                 "Cannot accept list of column references or list of columns for both `x` and `y`."
@@ -1266,26 +1264,33 @@ def build_dataframe(args, constructor):
                 args["wide_cross"] = df_input.index
                 wide_cross_name = df_input.index.name or "index"
             else:
-                args["wide_cross"] = Range(label="index")
-                wide_cross_name = "index"
+                wide_cross_name = _escape_col_name(df_input, "index")
+                args["wide_cross"] = Range(label=wide_cross_name)
+
+    if wide_mode:
+        var_name = _escape_col_name(df_input, var_name)
+        value_name = _escape_col_name(df_input, value_name)
 
     # now that things have been prepped, we do the systematic rewriting of `args`
 
-    df_output, wide_id_vars = process_args_into_dataframe(args, wide_mode, var_name)
+    df_output, wide_id_vars = process_args_into_dataframe(
+        args, wide_mode, var_name, value_name
+    )
 
     # now that `df_output` exists and `args` contains only references, we complete
     # the special-case and wide-mode handling by further rewriting args and/or mutating
     # df_output
 
+    count_name = _escape_col_name(df_output, "count")
     if not wide_mode and missing_bar_dim and constructor == go.Bar:
         # now that we've populated df_output, we check to see if the non-missing
         # dimension is categorical: if so, then setting the missing dimension to a
         # constant 1 is a less-insane thing to do than setting it to the index by
         # default and we let the normal auto-orientation-code do its thing later
         other_dim = "x" if missing_bar_dim == "y" else "y"
         if not _is_continuous(df_output, args[other_dim]):
-            args[missing_bar_dim] = "count"
-            df_output["count"] = 1
+            args[missing_bar_dim] = count_name
+            df_output[count_name] = 1
         else:
             # on the other hand, if the non-missing dimension is continuous, then we
             # can use this information to override the normal auto-orientation code
@@ -1306,7 +1311,7 @@ def build_dataframe(args, constructor):
             id_vars=wide_id_vars,
             value_vars=wide_value_vars,
             var_name=var_name,
-            value_name="value",
+            value_name=value_name,
         )
         df_output[var_name] = df_output[var_name].astype(str)
         orient_v = wide_orientation == "v"
@@ -1317,24 +1322,24 @@ def build_dataframe(args, constructor):
 
         if constructor in [go.Scatter, go.Funnel] + hist2d_types:
             args["x" if orient_v else "y"] = wide_cross_name
-            args["y" if orient_v else "x"] = "value"
+            args["y" if orient_v else "x"] = value_name
             if constructor != go.Histogram2d:
                 args["color"] = args["color"] or var_name
         if constructor == go.Bar:
-            if _is_continuous(df_output, "value"):
+            if _is_continuous(df_output, value_name):
                 args["x" if orient_v else "y"] = wide_cross_name
-                args["y" if orient_v else "x"] = "value"
+                args["y" if orient_v else "x"] = value_name
                 args["color"] = args["color"] or var_name
             else:
-                args["x" if orient_v else "y"] = "value"
-                args["y" if orient_v else "x"] = "count"
-                df_output["count"] = 1
+                args["x" if orient_v else "y"] = value_name
+                args["y" if orient_v else "x"] = count_name
+                df_output[count_name] = 1
                 args["color"] = args["color"] or var_name
         if constructor in [go.Violin, go.Box]:
             args["x" if orient_v else "y"] = wide_cross_name or var_name
-            args["y" if orient_v else "x"] = "value"
+            args["y" if orient_v else "x"] = value_name
         if constructor == go.Histogram:
-            args["x" if orient_v else "y"] = "value"
+            args["x" if orient_v else "y"] = value_name
             args["y" if orient_v else "x"] = wide_cross_name
             args["color"] = args["color"] or var_name
 
diff --git a/packages/python/plotly/plotly/tests/test_core/test_px/test_px_input.py b/packages/python/plotly/plotly/tests/test_core/test_px/test_px_input.py
@@ -38,9 +38,7 @@ def test_with_index():
     # We do not allow "x=index"
     with pytest.raises(ValueError) as err_msg:
         fig = px.scatter(tips, x="index", y="total_bill")
-        assert "To use the index, pass it in directly as `df.index`." in str(
-            err_msg.value
-        )
+    assert "To use the index, pass it in directly as `df.index`." in str(err_msg.value)
     tips = px.data.tips()
     tips.index.name = "item"
     fig = px.scatter(tips, x=tips.index, y="total_bill")
@@ -75,10 +73,10 @@ def test_several_dataframes():
     # Name conflict
     with pytest.raises(NameError) as err_msg:
         fig = px.scatter(df, x="z", y=df2.money, size="y")
-        assert "A name conflict was encountered for argument y" in str(err_msg.value)
+    assert "A name conflict was encountered for argument y" in str(err_msg.value)
     with pytest.raises(NameError) as err_msg:
         fig = px.scatter(df, x="z", y=df2.money, size=df.y)
-        assert "A name conflict was encountered for argument y" in str(err_msg.value)
+    assert "A name conflict was encountered for argument y" in str(err_msg.value)
 
     # No conflict when the dataframe is not given, fields are used
     df = pd.DataFrame(dict(x=[0, 1], y=[3, 4]))
@@ -157,41 +155,41 @@ def test_arrayattrable_numpy():
 def test_wrong_column_name():
     with pytest.raises(ValueError) as err_msg:
         px.scatter(px.data.tips(), x="bla", y="wrong")
-        assert "Value of 'x' is not the name of a column in 'data_frame'" in str(
-            err_msg.value
-        )
+    assert "Value of 'x' is not the name of a column in 'data_frame'" in str(
+        err_msg.value
+    )
 
 
 def test_missing_data_frame():
     with pytest.raises(ValueError) as err_msg:
         px.scatter(x="arg1", y="arg2")
-        assert "String or int arguments are only possible" in str(err_msg.value)
+    assert "String or int arguments are only possible" in str(err_msg.value)
 
 
 def test_wrong_dimensions_of_array():
     with pytest.raises(ValueError) as err_msg:
         px.scatter(x=[1, 2, 3], y=[2, 3, 4, 5])
-        assert "All arguments should have the same length." in str(err_msg.value)
+    assert "All arguments should have the same length." in str(err_msg.value)
 
 
 def test_wrong_dimensions_mixed_case():
     with pytest.raises(ValueError) as err_msg:
         df = pd.DataFrame(dict(time=[1, 2, 3], temperature=[20, 30, 25]))
         px.scatter(df, x="time", y="temperature", color=[1, 3, 9, 5])
-        assert "All arguments should have the same length." in str(err_msg.value)
+    assert "All arguments should have the same length." in str(err_msg.value)
 
 
 def test_wrong_dimensions():
     with pytest.raises(ValueError) as err_msg:
         px.scatter(px.data.tips(), x="tip", y=[1, 2, 3])
-        assert "All arguments should have the same length." in str(err_msg.value)
+    assert "All arguments should have the same length." in str(err_msg.value)
     # the order matters
     with pytest.raises(ValueError) as err_msg:
         px.scatter(px.data.tips(), x=[1, 2, 3], y="tip")
-        assert "All arguments should have the same length." in str(err_msg.value)
+    assert "All arguments should have the same length." in str(err_msg.value)
-    with pytest.raises(ValueError):
+    with pytest.raises(ValueError) as err_msg:  # rebind, otherwise the assert below re-checks the previous error
         px.scatter(px.data.tips(), x=px.data.iris().index, y="tip")
-        # assert "All arguments should have the same length." in str(err_msg.value)
+    assert "All arguments should have the same length." in str(err_msg.value)
 
 
 def test_multiindex_raise_error():
@@ -203,9 +201,7 @@ def test_multiindex_raise_error():
     px.scatter(df, x="A", y="B")
     with pytest.raises(TypeError) as err_msg:
         px.scatter(df, x=df.index, y="B")
-        assert "pandas MultiIndex is not supported by plotly express" in str(
-            err_msg.value
-        )
+    assert "pandas MultiIndex is not supported by plotly express" in str(err_msg.value)
 
 
 def test_build_df_from_lists():
diff --git a/packages/python/plotly/plotly/tests/test_core/test_px/test_px_wide.py b/packages/python/plotly/plotly/tests/test_core/test_px/test_px_wide.py
@@ -635,14 +635,20 @@ def test_multi_index():
     df.index = [["a", "a", "b", "b"], ["c", "d", "c", "d"]]
     with pytest.raises(TypeError) as err_msg:
         px.scatter(df)
-        assert "pandas MultiIndex is not supported by plotly express" in str(
-            err_msg.value
-        )
+    assert "pandas MultiIndex is not supported by plotly express" in str(err_msg.value)
 
     df = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 2, 3, 4], [3, 4, 5, 6]])
     df.columns = [["e", "e", "f", "f"], ["g", "h", "g", "h"]]
     with pytest.raises(TypeError) as err_msg:
         px.scatter(df)
-        assert "pandas MultiIndex is not supported by plotly express" in str(
-            err_msg.value
-        )
+    assert "pandas MultiIndex is not supported by plotly express" in str(err_msg.value)
+
+
+def test_special_name_collisions():
+    df = pd.DataFrame(
+        dict(a=range(10), b=range(10), value=range(10), variable=range(10))  # columns literally named "value" / "variable"
+    )
+    args_in = dict(data_frame=df, color="value", symbol="variable")
+    args_out = build_dataframe(args_in, go.Scatter)
+    df_out = args_out["data_frame"]
+    assert len(set(df_out.columns)) == len(df_out.columns)  # no duplicated column names in the output frame