diff --git a/docs/tutorials/shape-creation.rst b/docs/tutorials/shape-creation.rst index 9290693f..2cd1d19d 100644 --- a/docs/tutorials/shape-creation.rst +++ b/docs/tutorials/shape-creation.rst @@ -53,7 +53,7 @@ various bits of information from the dataset, such as the automatically-calculat bounds (*e.g.*, :attr:`.Dataset.data_bounds`, which form the bounding box of the starting data, and :attr:`.Dataset.morph_bounds`, which define the limits of where the algorithm can move the points) or percentiles using the data itself (see -:attr:`.Dataset.df`). For example, the :class:`.XLines` shape inherits from +:attr:`.Dataset.data`). For example, the :class:`.XLines` shape inherits from :class:`.LineCollection` and uses the morph bounds (:attr:`.Dataset.morph_bounds`) to calculate its position and scale: diff --git a/pyproject.toml b/pyproject.toml index 6fb0ab8e..af881036 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,8 @@ lint.select = [ "FA", # flake8-future-annotations "I", # isort "N", # pep8-naming + "NPY", # numpy + "PD", # pandas-vet "PTH", # flake8-use-pathlib "RUF", # ruff-specific rules "SIM", # flake8-simplify diff --git a/src/data_morph/data/dataset.py b/src/data_morph/data/dataset.py index efcf3adf..3b2b2e6e 100644 --- a/src/data_morph/data/dataset.py +++ b/src/data_morph/data/dataset.py @@ -32,7 +32,7 @@ class Dataset: ---------- name : str The name to use for the dataset. - df : pandas.DataFrame + data : pandas.DataFrame DataFrame containing columns x and y. scale : numbers.Number, optional The factor to scale the data by (can be used to speed up morphing). @@ -49,10 +49,12 @@ class Dataset: def __init__( self, name: str, - df: pd.DataFrame, + data: pd.DataFrame, scale: Number | None = None, ) -> None: - self.df: pd.DataFrame = self._validate_data(df).pipe(self._scale_data, scale) + self.data: pd.DataFrame = self._validate_data(data).pipe( + self._scale_data, scale + ) """pandas.DataFrame: DataFrame containing columns x and y.""" self.name: str = name @@ -81,7 +83,7 @@ def _derive_data_bounds(self) -> BoundingBox: """ return BoundingBox( *[ - Interval([self.df[dim].min(), self.df[dim].max()], inclusive=False) + Interval([self.data[dim].min(), self.data[dim].max()], inclusive=False) for dim in self._REQUIRED_COLUMNS ] ) @@ -122,13 +124,13 @@ def _derive_plotting_bounds(self) -> BoundingBox: plot_bounds.align_aspect_ratio() return plot_bounds - def _scale_data(self, df: pd.DataFrame, scale: Number) -> pd.DataFrame: + def _scale_data(self, data: pd.DataFrame, scale: Number) -> pd.DataFrame: """ Apply scaling to the data. Parameters ---------- - df : pandas.DataFrame + data : pandas.DataFrame The data to scale. scale : numbers.Number, optional The factor to scale the data by (can be used to speed up morphing). @@ -141,7 +143,7 @@ def _scale_data(self, df: pd.DataFrame, scale: Number) -> pd.DataFrame: """ if scale is None: self._scaled = False - return df + return data if isinstance(scale, bool) or not isinstance(scale, Number): raise TypeError('scale must be a numeric value.') @@ -149,9 +151,9 @@ def _scale_data(self, df: pd.DataFrame, scale: Number) -> pd.DataFrame: if not scale: raise ValueError('scale must be non-zero.') - scaled_df = df.assign(x=df.x.div(scale), y=df.y.div(scale)) + scaled_data = data.assign(x=data.x.div(scale), y=data.y.div(scale)) self._scaled = True - return scaled_df + return scaled_data def _validate_data(self, data: pd.DataFrame) -> pd.DataFrame: """ @@ -211,7 +213,7 @@ def plot( fig.get_layout_engine().set(w_pad=0.2, h_pad=0.2) ax.axis('equal') - ax.scatter(self.df.x, self.df.y, s=2, color='black') + ax.scatter(self.data.x, self.data.y, s=2, color='black') ax.set(xlabel='', ylabel='', title=self if title == 'default' else title) if show_bounds: @@ -236,8 +238,8 @@ def plot( ) ) ax.text( - (self.df.x.max() + self.df.x.min()) / 2, - self.df.y.max() + self.data_bounds.y_bounds.range / scale_base, + (self.data.x.max() + self.data.x.min()) / 2, + self.data.y.max() + self.data_bounds.y_bounds.range / scale_base, 'DATA BOUNDS', color='blue', va='bottom', diff --git a/src/data_morph/data/loader.py b/src/data_morph/data/loader.py index ccb188c3..dfd62f7b 100644 --- a/src/data_morph/data/loader.py +++ b/src/data_morph/data/loader.py @@ -106,18 +106,18 @@ class directly. Path(cls._DATA_PATH) / cls._DATASETS[dataset] ) name = dataset - df = pd.read_csv(filepath) + data = pd.read_csv(filepath) except KeyError: try: name = Path(dataset).stem - df = pd.read_csv(dataset) + data = pd.read_csv(dataset) except FileNotFoundError as err: raise ValueError( f'Unknown dataset "{dataset}". ' 'Provide a valid path to a CSV dataset or use one of ' f'the included datasets: {", ".join(cls.AVAILABLE_DATASETS)}.' ) from err - return Dataset(name=name, df=df, scale=scale) + return Dataset(name=name, data=data, scale=scale) @classmethod @plot_with_custom_style @@ -166,7 +166,7 @@ def plot_available_datasets(cls) -> Axes: elif dataset == 'SDS': dataset += ' logo' - ax.scatter(points.df.x, points.df.y, s=4, color='black') + ax.scatter(points.data.x, points.data.y, s=4, color='black') # tight plot bounds for the grid of datasets in the docs bounds = points.data_bounds.clone() @@ -175,7 +175,7 @@ def plot_available_datasets(cls) -> Axes: bounds.align_aspect_ratio() ax.set( - title=f'{dataset} ({points.df.shape[0]:,d} points)', + title=f'{dataset} ({points.data.shape[0]:,d} points)', xlim=bounds.x_bounds, ylim=bounds.y_bounds, xlabel='', diff --git a/src/data_morph/data/stats.py b/src/data_morph/data/stats.py index d3c52669..8c267ca6 100644 --- a/src/data_morph/data/stats.py +++ b/src/data_morph/data/stats.py @@ -12,13 +12,13 @@ ) -def get_values(df: pd.DataFrame) -> SummaryStatistics: +def get_values(data: pd.DataFrame) -> SummaryStatistics: """ Calculate the summary statistics for the given set of points. Parameters ---------- - df : pandas.DataFrame + data : pandas.DataFrame A dataset with columns x and y. Returns @@ -28,9 +28,9 @@ def get_values(df: pd.DataFrame) -> SummaryStatistics: along with the Pearson correlation coefficient between the two. """ return SummaryStatistics( - df.x.mean(), - df.y.mean(), - df.x.std(), - df.y.std(), - df.corr().x.y, + data.x.mean(), + data.y.mean(), + data.x.std(), + data.y.std(), + data.corr().x.y, ) diff --git a/src/data_morph/morpher.py b/src/data_morph/morpher.py index 67877300..cf945c63 100644 --- a/src/data_morph/morpher.py +++ b/src/data_morph/morpher.py @@ -274,7 +274,7 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool: def _perturb( self, - df: pd.DataFrame, + data: pd.DataFrame, target_shape: Shape, *, shake: Number, @@ -287,7 +287,7 @@ def _perturb( Parameters ---------- - df : pandas.DataFrame + data : pandas.DataFrame The data to perturb. target_shape : Shape The shape to morph the data into. @@ -308,9 +308,8 @@ def _perturb( pandas.DataFrame The input dataset with one point perturbed. """ - row = self._rng.integers(0, len(df)) - initial_x = df.at[row, 'x'] - initial_y = df.at[row, 'y'] + row = self._rng.integers(0, len(data)) + initial_x, initial_y = data.to_numpy()[row] # this is the simulated annealing step, if "do_bad", then we are willing to # accept a new state which is worse than the current one @@ -329,10 +328,10 @@ def _perturb( within_bounds = [new_x, new_y] in bounds done = close_enough and within_bounds - df.loc[row, 'x'] = new_x - df.loc[row, 'y'] = new_y + data.loc[row, 'x'] = new_x + data.loc[row, 'y'] = new_y - return df + return data def morph( self, @@ -434,7 +433,7 @@ def morph( ): raise ValueError('allowed_dist must be a non-negative numeric value.') - morphed_data = start_shape.df.copy() + morphed_data = start_shape.data.copy() # iteration numbers that we will end up writing to file as frames frame_numbers = self._select_frames( @@ -487,7 +486,7 @@ def _tweening( bounds=start_shape.morph_bounds, ) - if self._is_close_enough(start_shape.df, perturbed_data): + if self._is_close_enough(start_shape.data, perturbed_data): morphed_data = perturbed_data frame_number = record_frames( diff --git a/src/data_morph/plotting/static.py b/src/data_morph/plotting/static.py index 6e79ac54..6443f2ff 100644 --- a/src/data_morph/plotting/static.py +++ b/src/data_morph/plotting/static.py @@ -23,7 +23,7 @@ @plot_with_custom_style def plot( - df: pd.DataFrame, + data: pd.DataFrame, x_bounds: Iterable[Number], y_bounds: Iterable[Number], save_to: str | Path, @@ -35,7 +35,7 @@ def plot( Parameters ---------- - df : pandas.DataFrame + data : pandas.DataFrame The dataset to plot. x_bounds, y_bounds : Iterable[numbers.Number] The plotting limits. @@ -57,14 +57,14 @@ def plot( ) fig.get_layout_engine().set(w_pad=1.4, h_pad=0.2, wspace=0) - ax.scatter(df.x, df.y, s=1, alpha=0.7, color='black') + ax.scatter(data.x, data.y, s=1, alpha=0.7, color='black') ax.set(xlim=x_bounds, ylim=y_bounds) tick_formatter = EngFormatter() ax.xaxis.set_major_formatter(tick_formatter) ax.yaxis.set_major_formatter(tick_formatter) - res = get_values(df) + res = get_values(data) labels = ('X Mean', 'Y Mean', 'X SD', 'Y SD', 'Corr.') locs = np.linspace(0.8, 0.2, num=len(labels)) diff --git a/src/data_morph/shapes/circles.py b/src/data_morph/shapes/circles.py index f26db83c..496437e0 100644 --- a/src/data_morph/shapes/circles.py +++ b/src/data_morph/shapes/circles.py @@ -41,10 +41,10 @@ class Circle(Shape): """ def __init__(self, dataset: Dataset, radius: Number | None = None) -> None: - self.center: np.ndarray = dataset.df[['x', 'y']].mean().to_numpy() + self.center: np.ndarray = dataset.data[['x', 'y']].mean().to_numpy() """numpy.ndarray: The (x, y) coordinates of the circle's center.""" - self.radius: Number = radius or dataset.df[['x', 'y']].std().mean() * 1.5 + self.radius: Number = radius or dataset.data[['x', 'y']].std().mean() * 1.5 """numbers.Number: The radius of the circle.""" def __repr__(self) -> str: @@ -125,7 +125,7 @@ def __init__(self, dataset: Dataset, num_rings: int = 4) -> None: if num_rings <= 1: raise ValueError('num_rings must be greater than 1') - stdev = dataset.df.std().mean() + stdev = dataset.data.std().mean() self.circles: list[Circle] = [ Circle(dataset, r) for r in np.linspace(stdev / num_rings * 2, stdev * 2, num_rings) diff --git a/src/data_morph/shapes/lines/diamond.py b/src/data_morph/shapes/lines/diamond.py index 9d92c617..5ce50c89 100644 --- a/src/data_morph/shapes/lines/diamond.py +++ b/src/data_morph/shapes/lines/diamond.py @@ -26,8 +26,8 @@ class Diamond(LineCollection): """ def __init__(self, dataset: Dataset) -> None: - xmin, xmax = dataset.df.x.quantile([0.05, 0.95]) - ymin, ymax = dataset.df.y.quantile([0.05, 0.95]) + xmin, xmax = dataset.data.x.quantile([0.05, 0.95]) + ymin, ymax = dataset.data.y.quantile([0.05, 0.95]) xmid = (xmax + xmin) / 2 ymid = (ymax + ymin) / 2 diff --git a/src/data_morph/shapes/lines/rectangle.py b/src/data_morph/shapes/lines/rectangle.py index 7a5d90f5..8f38f43d 100644 --- a/src/data_morph/shapes/lines/rectangle.py +++ b/src/data_morph/shapes/lines/rectangle.py @@ -26,8 +26,8 @@ class Rectangle(LineCollection): """ def __init__(self, dataset: Dataset) -> None: - xmin, xmax = dataset.df.x.quantile([0.1, 0.9]) - ymin, ymax = dataset.df.y.quantile([0.1, 0.9]) + xmin, xmax = dataset.data.x.quantile([0.1, 0.9]) + ymin, ymax = dataset.data.y.quantile([0.1, 0.9]) super().__init__( [[xmin, ymin], [xmin, ymax]], diff --git a/src/data_morph/shapes/points/dots_grid.py b/src/data_morph/shapes/points/dots_grid.py index ddec20ce..b9327c20 100644 --- a/src/data_morph/shapes/points/dots_grid.py +++ b/src/data_morph/shapes/points/dots_grid.py @@ -29,8 +29,8 @@ class DotsGrid(PointCollection): name = 'dots' def __init__(self, dataset: Dataset) -> None: - xlow, xhigh = dataset.df.x.quantile([0.05, 0.95]).tolist() - ylow, yhigh = dataset.df.y.quantile([0.05, 0.95]).tolist() + xlow, xhigh = dataset.data.x.quantile([0.05, 0.95]).tolist() + ylow, yhigh = dataset.data.y.quantile([0.05, 0.95]).tolist() xmid = (xhigh + xlow) / 2 ymid = (yhigh + ylow) / 2 diff --git a/src/data_morph/shapes/points/scatter.py b/src/data_morph/shapes/points/scatter.py index a8f6ab11..e4e0bb0b 100644 --- a/src/data_morph/shapes/points/scatter.py +++ b/src/data_morph/shapes/points/scatter.py @@ -30,9 +30,9 @@ class Scatter(PointCollection): def __init__(self, dataset: Dataset) -> None: rng = np.random.default_rng(1) - center = (dataset.df.x.mean(), dataset.df.y.mean()) + center = (dataset.data.x.mean(), dataset.data.y.mean()) points = [center] - max_radius = max(dataset.df.x.std(), dataset.df.y.std()) + max_radius = max(dataset.data.x.std(), dataset.data.y.std()) for radius in np.linspace(max_radius // 5, max_radius, num=5): for angle in np.linspace(0, 360, num=50, endpoint=False): points.append( diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py index 38d6e26c..9a3a2f6e 100644 --- a/tests/data/test_dataset.py +++ b/tests/data/test_dataset.py @@ -18,17 +18,17 @@ class TestDataset: def test_scale_data(self, scale, starter_shapes_dir): """Confirm that data scaling is working by checking min and max.""" - original_df = pd.read_csv(starter_shapes_dir / 'dino.csv') - original_min = original_df.min() - original_max = original_df.max() + original_data = pd.read_csv(starter_shapes_dir / 'dino.csv') + original_min = original_data.min() + original_max = original_data.max() dataset = DataLoader.load_dataset('dino', scale=scale) if scale: - assert_equal(dataset.df.min().to_numpy(), original_min / scale) - assert_equal(dataset.df.max().to_numpy(), original_max / scale) + assert_equal(dataset.data.min().to_numpy(), original_min / scale) + assert_equal(dataset.data.max().to_numpy(), original_max / scale) else: - assert_frame_equal(dataset.df, original_df) + assert_frame_equal(dataset.data, original_data) @pytest.mark.input_validation @pytest.mark.parametrize( @@ -52,17 +52,17 @@ def test_scale_data_invalid_scale(self, scale): def test_validate_data_missing_columns(self, starter_shapes_dir): """Confirm that creation of a Dataset validates the DataFrame columns.""" - df = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'a'}) + data = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'a'}) with pytest.raises(ValueError, match='Columns "x" and "y" are required.'): - _ = Dataset('dino', df) + _ = Dataset('dino', data) def test_validate_data_fix_column_casing(self, starter_shapes_dir): """Confirm that creating a Dataset with correct names but in wrong casing works.""" - df = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'X'}) - dataset = Dataset('dino', df) - assert not dataset.df[list(dataset._REQUIRED_COLUMNS)].empty + data = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'X'}) + dataset = Dataset('dino', data) + assert not dataset.data[list(dataset._REQUIRED_COLUMNS)].empty @pytest.mark.bounds @pytest.mark.parametrize( diff --git a/tests/data/test_loader.py b/tests/data/test_loader.py index 35ac091e..cc6a0f99 100644 --- a/tests/data/test_loader.py +++ b/tests/data/test_loader.py @@ -27,7 +27,7 @@ def test_load_dataset(self, name, file, starter_shapes_dir): assert isinstance(dataset_from_pkg, Dataset) assert isinstance(dataset_from_file, Dataset) assert dataset_from_pkg.name == dataset_from_file.name - assert_frame_equal(dataset_from_pkg.df, dataset_from_file.df) + assert_frame_equal(dataset_from_pkg.data, dataset_from_file.data) @pytest.mark.input_validation @pytest.mark.parametrize('dataset', ['does_not_exist', 'does_not_exist.csv']) diff --git a/tests/data/test_stats.py b/tests/data/test_stats.py index c99134ed..43fb6fe0 100644 --- a/tests/data/test_stats.py +++ b/tests/data/test_stats.py @@ -7,7 +7,7 @@ def test_stats(): """Test that summary statistics tuple is correct.""" - data = DataLoader.load_dataset('dino').df + data = DataLoader.load_dataset('dino').data stats = get_values(data) diff --git a/tests/plotting/test_animation.py b/tests/plotting/test_animation.py index fd6d0b00..1eb2b79d 100644 --- a/tests/plotting/test_animation.py +++ b/tests/plotting/test_animation.py @@ -15,10 +15,11 @@ def test_frame_stitching(sample_data, tmp_path): start_shape = 'sample' target_shape = 'circle' bounds = [-5, 105] + rng = np.random.default_rng() for frame in range(10): plot( - df=sample_data + np.random.randn(), + data=sample_data + rng.standard_normal(), x_bounds=bounds, y_bounds=bounds, save_to=(tmp_path / f'{start_shape}-to-{target_shape}-{frame}.png'), diff --git a/tests/plotting/test_static.py b/tests/plotting/test_static.py index 6c970d49..d7a203ba 100644 --- a/tests/plotting/test_static.py +++ b/tests/plotting/test_static.py @@ -15,7 +15,7 @@ def test_plot(sample_data, tmp_path, file_path): save_to = tmp_path / 'another-level' / file_path plot( - df=sample_data, + data=sample_data, x_bounds=bounds, y_bounds=bounds, save_to=save_to, @@ -25,7 +25,7 @@ def test_plot(sample_data, tmp_path, file_path): else: ax = plot( - df=sample_data, x_bounds=bounds, y_bounds=bounds, save_to=None, decimals=2 + data=sample_data, x_bounds=bounds, y_bounds=bounds, save_to=None, decimals=2 ) # confirm that the stylesheet was used diff --git a/tests/test_morpher.py b/tests/test_morpher.py index 28aacc23..187dcf83 100644 --- a/tests/test_morpher.py +++ b/tests/test_morpher.py @@ -170,8 +170,8 @@ def test_no_writing(self, capsys): ) with pytest.raises(AssertionError): - assert_frame_equal(morphed_data, dataset.df) - assert morpher._is_close_enough(dataset.df, morphed_data) + assert_frame_equal(morphed_data, dataset.data) + assert morpher._is_close_enough(dataset.data, morphed_data) _, err = capsys.readouterr() assert f'{target_shape} pattern: 100%' in err @@ -254,7 +254,7 @@ def test_freeze_animation_frames( base_path = 'test-freeze' end_frame = morpher._record_frames( - dataset.df, + dataset.data, dataset.plot_bounds, base_path, freeze_for,