TST: Remove tm.rands/rands_array (#54368)

mroeschke · web-flow · commit 818618e97d5f · 2023-08-03T13:24:57.000-07:00
* remove tm.rands

* remove rands array

* Address failures

* Use unique values
diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
@@ -2,8 +2,6 @@
 
 import pandas as pd
 
-from .pandas_vb_common import tm
-
 
 class BooleanArray:
     def setup(self):
@@ -56,7 +54,7 @@ def time_from_tuples(self):
 class StringArray:
     def setup(self):
         N = 100_000
-        values = tm.rands_array(3, N)
+        values = np.array([str(i) for i in range(N)], dtype=object)
         self.values_obj = np.array(values, dtype="object")
         self.values_str = np.array(values, dtype="U")
         self.values_list = values.tolist()
@@ -80,7 +78,7 @@ def setup(self, multiple_chunks):
             import pyarrow as pa
         except ImportError:
             raise NotImplementedError
-        strings = tm.rands_array(3, 10_000)
+        strings = np.array([str(i) for i in range(10_000)], dtype=object)
         if multiple_chunks:
             chunks = [strings[i : i + 100] for i in range(0, len(strings), 100)]
             self.array = pd.arrays.ArrowStringArray(pa.chunked_array(chunks))
@@ -127,7 +125,7 @@ def setup(self, dtype, hasna):
         elif dtype == "int64[pyarrow]":
             data = np.arange(N)
         elif dtype == "string[pyarrow]":
-            data = tm.rands_array(10, N)
+            data = np.array([str(i) for i in range(N)], dtype=object)
         elif dtype == "timestamp[ns][pyarrow]":
             data = pd.date_range("2000-01-01", freq="s", periods=N)
         else:
diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
@@ -104,7 +104,7 @@ def setup(self, dtype):
             data = np.arange(N)
             na_value = NA
         elif dtype in ("string", "string[pyarrow]"):
-            data = tm.rands_array(5, N)
+            data = np.array([str(i) * 5 for i in range(N)], dtype=object)
             na_value = NA
         else:
             raise NotImplementedError
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -34,8 +34,8 @@ class Construction:
     dtype_mapping = {"str": "str", "string[python]": object, "string[pyarrow]": object}
 
     def setup(self, pd_type, dtype):
-        series_arr = tm.rands_array(
-            nchars=10, size=10**5, dtype=self.dtype_mapping[dtype]
+        series_arr = np.array(
+            [str(i) * 10 for i in range(100_000)], dtype=self.dtype_mapping[dtype]
         )
         if pd_type == "series":
             self.arr = series_arr
@@ -276,7 +276,7 @@ def time_iter(self, dtype):
 
 class StringArrayConstruction:
     def setup(self):
-        self.series_arr = tm.rands_array(nchars=10, size=10**5)
+        self.series_arr = np.array([str(i) * 10 for i in range(10**5)], dtype=object)
         self.series_arr_nan = np.concatenate([self.series_arr, np.array([NA] * 1000)])
 
     def time_string_array_construction(self):
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -54,10 +54,6 @@
     round_trip_pickle,
     write_to_compressed,
 )
-from pandas._testing._random import (
-    rands,
-    rands_array,
-)
 from pandas._testing._warnings import (
     assert_produces_warning,
     maybe_produces_warning,
@@ -349,6 +345,22 @@ def to_array(obj):
 # Others
 
 
+def rands_array(
+    nchars, size: int, dtype: NpDtype = "O", replace: bool = True
+) -> np.ndarray:
+    """
+    Generate an array of nchars-long strings drawn from ASCII letters and digits.
+    """
+    chars = np.array(list(string.ascii_letters + string.digits), dtype=(np.str_, 1))
+    retval = (
+        np.random.default_rng(2)
+        .choice(chars, size=nchars * np.prod(size), replace=replace)
+        .view((np.str_, nchars))
+        .reshape(size)
+    )
+    return retval.astype(dtype)
+
+
 def getCols(k) -> str:
     return string.ascii_uppercase[:k]
 
@@ -1127,7 +1139,6 @@ def shares_memory(left, right) -> bool:
     "NULL_OBJECTS",
     "OBJECT_DTYPES",
     "raise_assert_detail",
-    "rands",
     "reset_display_options",
     "raises_chained_assignment_error",
     "round_trip_localpath",
diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
@@ -9,6 +9,7 @@
     Any,
     Callable,
 )
+import uuid
 import zipfile
 
 from pandas.compat import (
@@ -18,7 +19,6 @@
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
-from pandas._testing._random import rands
 from pandas._testing.contexts import ensure_clean
 
 if TYPE_CHECKING:
@@ -56,7 +56,7 @@ def round_trip_pickle(
     """
     _path = path
     if _path is None:
-        _path = f"__{rands(10)}__.pickle"
+        _path = f"__{uuid.uuid4()}__.pickle"
     with ensure_clean(_path) as temp_path:
         pd.to_pickle(obj, temp_path)
         return pd.read_pickle(temp_path)
diff --git a/pandas/_testing/_random.py b/pandas/_testing/_random.py
diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
@@ -881,7 +881,7 @@ def test_add_frames(self, first, second, expected):
     # TODO: This came from series.test.test_operators, needs cleanup
     def test_series_frame_radd_bug(self, fixed_now_ts):
         # GH#353
-        vals = Series(tm.rands_array(5, 10))
+        vals = Series(tm.makeStringIndex())
         result = "foo_" + vals
         expected = vals.map(lambda x: "foo_" + x)
         tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
@@ -272,7 +272,7 @@ def test_getitem_series_integer_with_missing_raises(self, data, idx):
         msg = "Cannot index with an integer indexer containing NA values"
         # TODO: this raises KeyError about labels not found (it tries label-based)
 
-        ser = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
+        ser = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
         with pytest.raises(ValueError, match=msg):
             ser[idx]
 
diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
@@ -197,7 +197,7 @@ def test_setitem_integer_with_missing_raises(self, data, idx, box_in_series):
         # TODO(xfail) this raises KeyError about labels not found (it tries label-based)
         # for list of labels with Series
         if box_in_series:
-            arr = pd.Series(data, index=[tm.rands(4) for _ in range(len(data))])
+            arr = pd.Series(data, index=[chr(100 + i) for i in range(len(data))])
 
         msg = "Cannot index with an integer indexer containing NA values"
         with pytest.raises(ValueError, match=msg):
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -203,7 +203,7 @@ def test_timestamp_compare(self, left, right):
                 "dates2": pd.date_range("20010102", periods=10),
                 "intcol": np.random.default_rng(2).integers(1000000000, size=10),
                 "floatcol": np.random.default_rng(2).standard_normal(10),
-                "stringcol": list(tm.rands(10)),
+                "stringcol": [chr(100 + i) for i in range(10)],
             }
         )
         df.loc[np.random.default_rng(2).random(len(df)) > 0.5, "dates2"] = pd.NaT
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
@@ -265,7 +265,7 @@ def test_str_to_bytes_raises(self):
     def test_very_wide_info_repr(self):
         df = DataFrame(
             np.random.default_rng(2).standard_normal((10, 20)),
-            columns=tm.rands_array(10, 20),
+            columns=np.array(["a" * 10] * 20, dtype=object),
         )
         repr(df)
 
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -1361,7 +1361,7 @@ def test_cython_grouper_series_bug_noncontig():
 
 
 def test_series_grouper_noncontig_index():
-    index = Index(tm.rands_array(10, 100))
+    index = Index(["a" * 10] * 100)
 
     values = Series(np.random.default_rng(2).standard_normal(50), index=index[::2])
     labels = np.random.default_rng(2).integers(0, 5, 50)
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
@@ -31,8 +31,8 @@ def test_rank_unordered_categorical_typeerror():
 
 
 def test_rank_apply():
-    lev1 = tm.rands_array(10, 100)
-    lev2 = tm.rands_array(10, 130)
+    lev1 = np.array(["a" * 10] * 100, dtype=object)
+    lev2 = np.array(["b" * 10] * 130, dtype=object)
     lab1 = np.random.default_rng(2).integers(0, 100, size=500, dtype=int)
     lab2 = np.random.default_rng(2).integers(0, 130, size=500, dtype=int)
 
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
@@ -214,10 +214,9 @@ def test_repr_truncation(self):
                 {
                     "A": np.random.default_rng(2).standard_normal(10),
                     "B": [
-                        tm.rands(
-                            np.random.default_rng(2).integers(max_len - 1, max_len + 1)
-                        )
-                        for i in range(10)
+                        "a"
+                        * np.random.default_rng(2).integers(max_len - 1, max_len + 1)
+                        for _ in range(10)
                     ],
                 }
             )
@@ -1177,7 +1176,7 @@ def test_wide_repr(self):
             20,
         ):
             max_cols = get_option("display.max_columns")
-            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+            df = DataFrame([["a" * 25] * (max_cols - 1)] * 10)
             with option_context("display.expand_frame_repr", False):
                 rep_str = repr(df)
 
@@ -1203,7 +1202,7 @@ def test_wide_repr_wide_columns(self):
     def test_wide_repr_named(self):
         with option_context("mode.sim_interactive", True, "display.max_columns", 20):
             max_cols = get_option("display.max_columns")
-            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+            df = DataFrame([["a" * 25] * (max_cols - 1)] * 10)
             df.index.name = "DataFrame Index"
             with option_context("display.expand_frame_repr", False):
                 rep_str = repr(df)
@@ -1220,9 +1219,9 @@ def test_wide_repr_named(self):
 
     def test_wide_repr_multiindex(self):
         with option_context("mode.sim_interactive", True, "display.max_columns", 20):
-            midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10)))
+            midx = MultiIndex.from_arrays([["a" * 5] * 10] * 2)
             max_cols = get_option("display.max_columns")
-            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)), index=midx)
+            df = DataFrame([["a" * 25] * (max_cols - 1)] * 10, index=midx)
             df.index.names = ["Level 0", "Level 1"]
             with option_context("display.expand_frame_repr", False):
                 rep_str = repr(df)
@@ -1240,10 +1239,10 @@ def test_wide_repr_multiindex(self):
     def test_wide_repr_multiindex_cols(self):
         with option_context("mode.sim_interactive", True, "display.max_columns", 20):
             max_cols = get_option("display.max_columns")
-            midx = MultiIndex.from_arrays(tm.rands_array(5, size=(2, 10)))
-            mcols = MultiIndex.from_arrays(tm.rands_array(3, size=(2, max_cols - 1)))
+            midx = MultiIndex.from_arrays([["a" * 5] * 10] * 2)
+            mcols = MultiIndex.from_arrays([["b" * 3] * (max_cols - 1)] * 2)
             df = DataFrame(
-                tm.rands_array(25, (10, max_cols - 1)), index=midx, columns=mcols
+                [["c" * 25] * (max_cols - 1)] * 10, index=midx, columns=mcols
             )
             df.index.names = ["Level 0", "Level 1"]
             with option_context("display.expand_frame_repr", False):
@@ -1259,7 +1258,7 @@ def test_wide_repr_multiindex_cols(self):
     def test_wide_repr_unicode(self):
         with option_context("mode.sim_interactive", True, "display.max_columns", 20):
             max_cols = 20
-            df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+            df = DataFrame([["a" * 25] * (max_cols - 1)] * 10)
             with option_context("display.expand_frame_repr", False):
                 rep_str = repr(df)
             with option_context("display.expand_frame_repr", True):
@@ -1897,11 +1896,11 @@ def test_repr_html_mathjax(self):
 
     def test_repr_html_wide(self):
         max_cols = 20
-        df = DataFrame(tm.rands_array(25, size=(10, max_cols - 1)))
+        df = DataFrame([["a" * 25] * (max_cols - 1)] * 10)
         with option_context("display.max_rows", 60, "display.max_columns", 20):
             assert "..." not in df._repr_html_()
 
-        wide_df = DataFrame(tm.rands_array(25, size=(10, max_cols + 1)))
+        wide_df = DataFrame([["a" * 25] * (max_cols + 1)] * 10)
         with option_context("display.max_rows", 60, "display.max_columns", 20):
             assert "..." in wide_df._repr_html_()
 
@@ -1911,14 +1910,14 @@ def test_repr_html_wide_multiindex_cols(self):
         mcols = MultiIndex.from_product(
             [np.arange(max_cols // 2), ["foo", "bar"]], names=["first", "second"]
         )
-        df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols)
+        df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols)
         reg_repr = df._repr_html_()
         assert "..." not in reg_repr
 
         mcols = MultiIndex.from_product(
             (np.arange(1 + (max_cols // 2)), ["foo", "bar"]), names=["first", "second"]
         )
-        df = DataFrame(tm.rands_array(25, size=(10, len(mcols))), columns=mcols)
+        df = DataFrame([["a" * 25] * len(mcols)] * 10, columns=mcols)
         with option_context("display.max_rows", 60, "display.max_columns", 20):
             assert "..." in df._repr_html_()
 
diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py
@@ -153,7 +153,7 @@ def test_append_with_diff_col_name_types_raises_value_error(setup_path):
     df5 = DataFrame({("1", 2, object): np.random.default_rng(2).standard_normal(10)})
 
     with ensure_clean_store(setup_path) as store:
-        name = f"df_{tm.rands(10)}"
+        name = "df_diff_valerror"
         store.append(name, df)
 
         for d in (df2, df3, df4, df5):
diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py
@@ -54,9 +54,7 @@ def roundtrip(key, obj, **kwargs):
 
 def test_long_strings(setup_path):
     # GH6166
-    df = DataFrame(
-        {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10)
-    )
+    df = DataFrame({"a": tm.makeStringIndex(10)}, index=tm.makeStringIndex(10))
 
     with ensure_clean_store(setup_path) as store:
         store.append("df", df, data_columns=["a"])
diff --git a/pandas/tests/reshape/merge/test_multi.py b/pandas/tests/reshape/merge/test_multi.py
@@ -193,7 +193,7 @@ def test_merge_multiple_cols_with_mixed_cols_index(self):
 
     def test_compress_group_combinations(self):
         # ~ 40000000 possible unique groups
-        key1 = tm.rands_array(10, 10000)
+        key1 = tm.makeStringIndex(10000)
         key1 = np.tile(key1, 2)
         key2 = key1[::-1]
 
diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py
@@ -69,7 +69,7 @@ def test_getitem_unrecognized_scalar(self):
         assert result == 2
 
     def test_getitem_negative_out_of_bounds(self):
-        ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))
+        ser = Series(["a"] * 10, index=["a"] * 10)
 
         msg = "index -11 is out of bounds for axis 0 with size 10"
         warn_msg = "Series.__getitem__ treating keys as positions is deprecated"
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
@@ -173,7 +173,7 @@ def test_object_series_setitem_dt64array_exact_match(self):
 
 class TestSetitemScalarIndexer:
     def test_setitem_negative_out_of_bounds(self):
-        ser = Series(tm.rands_array(5, 10), index=tm.rands_array(10, 10))
+        ser = Series(["a"] * 10, index=["a"] * 10)
 
         msg = "index -11 is out of bounds for axis 0 with size 10"
         warn_msg = "Series.__setitem__ treating keys as positions is deprecated"
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py

Original file line number	Diff line number	Diff line change
`@@ -203,7 +203,7 @@ def test_timestamp_compare(self, left, right):`
`203`	`203`	`"dates2": pd.date_range("20010102", periods=10),`
`204`	`204`	`"intcol": np.random.default_rng(2).integers(1000000000, size=10),`
`205`	`205`	`"floatcol": np.random.default_rng(2).standard_normal(10),`
`206`		`- "stringcol": list(tm.rands(10)),`
	`206`	`+ "stringcol": [chr(100 + i) for i in range(10)],`
`207`	`207`	`}`
`208`	`208`	`)`
`209`	`209`	`df.loc[np.random.default_rng(2).random(len(df)) > 0.5, "dates2"] = pd.NaT`