Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/tutorials/shape-creation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ various bits of information from the dataset, such as the automatically-calculat
bounds (*e.g.*, :attr:`.Dataset.data_bounds`, which form the bounding box of the
starting data, and :attr:`.Dataset.morph_bounds`, which define the limits of where
the algorithm can move the points) or percentiles using the data itself (see
:attr:`.Dataset.df`). For example, the :class:`.XLines` shape inherits from
:attr:`.Dataset.data`). For example, the :class:`.XLines` shape inherits from
:class:`.LineCollection` and uses the morph bounds (:attr:`.Dataset.morph_bounds`)
to calculate its position and scale:

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ lint.select = [
"FA", # flake8-future-annotations
"I", # isort
"N", # pep8-naming
"NPY", # numpy
"PD", # pandas-vet
"PTH", # flake8-use-pathlib
"RUF", # ruff-specific rules
"SIM", # flake8-simplify
Expand Down
26 changes: 14 additions & 12 deletions src/data_morph/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
----------
name : str
The name to use for the dataset.
df : pandas.DataFrame
data : pandas.DataFrame
DataFrame containing columns x and y.
scale : numbers.Number, optional
The factor to scale the data by (can be used to speed up morphing).
Expand All @@ -49,10 +49,12 @@
def __init__(
self,
name: str,
df: pd.DataFrame,
data: pd.DataFrame,
scale: Number | None = None,
) -> None:
self.df: pd.DataFrame = self._validate_data(df).pipe(self._scale_data, scale)
self.data: pd.DataFrame = self._validate_data(data).pipe(
self._scale_data, scale
)
"""pandas.DataFrame: DataFrame containing columns x and y."""

self.name: str = name
Expand Down Expand Up @@ -81,7 +83,7 @@
"""
return BoundingBox(
*[
Interval([self.df[dim].min(), self.df[dim].max()], inclusive=False)
Interval([self.data[dim].min(), self.data[dim].max()], inclusive=False)
for dim in self._REQUIRED_COLUMNS
]
)
Expand Down Expand Up @@ -122,13 +124,13 @@
plot_bounds.align_aspect_ratio()
return plot_bounds

def _scale_data(self, df: pd.DataFrame, scale: Number) -> pd.DataFrame:
def _scale_data(self, data: pd.DataFrame, scale: Number) -> pd.DataFrame:
"""
Apply scaling to the data.

Parameters
----------
df : pandas.DataFrame
data : pandas.DataFrame
The data to scale.
scale : numbers.Number, optional
The factor to scale the data by (can be used to speed up morphing).
Expand All @@ -141,17 +143,17 @@
"""
if scale is None:
self._scaled = False
return df
return data

if isinstance(scale, bool) or not isinstance(scale, Number):
raise TypeError('scale must be a numeric value.')

if not scale:
raise ValueError('scale must be non-zero.')

scaled_df = df.assign(x=df.x.div(scale), y=df.y.div(scale))
scaled_data = data.assign(x=data.x.div(scale), y=data.y.div(scale))
self._scaled = True
return scaled_df
return scaled_data

def _validate_data(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -211,7 +213,7 @@
fig.get_layout_engine().set(w_pad=0.2, h_pad=0.2)

ax.axis('equal')
ax.scatter(self.df.x, self.df.y, s=2, color='black')
ax.scatter(self.data.x, self.data.y, s=2, color='black')

Check warning on line 216 in src/data_morph/data/dataset.py

View check run for this annotation

Codecov / codecov/patch

src/data_morph/data/dataset.py#L216

Added line #L216 was not covered by tests
ax.set(xlabel='', ylabel='', title=self if title == 'default' else title)

if show_bounds:
Expand All @@ -236,8 +238,8 @@
)
)
ax.text(
(self.df.x.max() + self.df.x.min()) / 2,
self.df.y.max() + self.data_bounds.y_bounds.range / scale_base,
(self.data.x.max() + self.data.x.min()) / 2,
self.data.y.max() + self.data_bounds.y_bounds.range / scale_base,
'DATA BOUNDS',
color='blue',
va='bottom',
Expand Down
10 changes: 5 additions & 5 deletions src/data_morph/data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,18 +106,18 @@ class directly.
Path(cls._DATA_PATH) / cls._DATASETS[dataset]
)
name = dataset
df = pd.read_csv(filepath)
data = pd.read_csv(filepath)
except KeyError:
try:
name = Path(dataset).stem
df = pd.read_csv(dataset)
data = pd.read_csv(dataset)
except FileNotFoundError as err:
raise ValueError(
f'Unknown dataset "{dataset}". '
'Provide a valid path to a CSV dataset or use one of '
f'the included datasets: {", ".join(cls.AVAILABLE_DATASETS)}.'
) from err
return Dataset(name=name, df=df, scale=scale)
return Dataset(name=name, data=data, scale=scale)

@classmethod
@plot_with_custom_style
Expand Down Expand Up @@ -166,7 +166,7 @@ def plot_available_datasets(cls) -> Axes:
elif dataset == 'SDS':
dataset += ' logo'

ax.scatter(points.df.x, points.df.y, s=4, color='black')
ax.scatter(points.data.x, points.data.y, s=4, color='black')

# tight plot bounds for the grid of datasets in the docs
bounds = points.data_bounds.clone()
Expand All @@ -175,7 +175,7 @@ def plot_available_datasets(cls) -> Axes:
bounds.align_aspect_ratio()

ax.set(
title=f'{dataset} ({points.df.shape[0]:,d} points)',
title=f'{dataset} ({points.data.shape[0]:,d} points)',
xlim=bounds.x_bounds,
ylim=bounds.y_bounds,
xlabel='',
Expand Down
14 changes: 7 additions & 7 deletions src/data_morph/data/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
)


def get_values(df: pd.DataFrame) -> SummaryStatistics:
def get_values(data: pd.DataFrame) -> SummaryStatistics:
"""
Calculate the summary statistics for the given set of points.

Parameters
----------
df : pandas.DataFrame
data : pandas.DataFrame
A dataset with columns x and y.

Returns
Expand All @@ -28,9 +28,9 @@ def get_values(df: pd.DataFrame) -> SummaryStatistics:
along with the Pearson correlation coefficient between the two.
"""
return SummaryStatistics(
df.x.mean(),
df.y.mean(),
df.x.std(),
df.y.std(),
df.corr().x.y,
data.x.mean(),
data.y.mean(),
data.x.std(),
data.y.std(),
data.corr().x.y,
)
19 changes: 9 additions & 10 deletions src/data_morph/morpher.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ def _is_close_enough(self, df1: pd.DataFrame, df2: pd.DataFrame) -> bool:

def _perturb(
self,
df: pd.DataFrame,
data: pd.DataFrame,
target_shape: Shape,
*,
shake: Number,
Expand All @@ -287,7 +287,7 @@ def _perturb(

Parameters
----------
df : pandas.DataFrame
data : pandas.DataFrame
The data to perturb.
target_shape : Shape
The shape to morph the data into.
Expand All @@ -308,9 +308,8 @@ def _perturb(
pandas.DataFrame
The input dataset with one point perturbed.
"""
row = self._rng.integers(0, len(df))
initial_x = df.at[row, 'x']
initial_y = df.at[row, 'y']
row = self._rng.integers(0, len(data))
initial_x, initial_y = data.to_numpy()[row]

# this is the simulated annealing step, if "do_bad", then we are willing to
# accept a new state which is worse than the current one
Expand All @@ -329,10 +328,10 @@ def _perturb(
within_bounds = [new_x, new_y] in bounds
done = close_enough and within_bounds

df.loc[row, 'x'] = new_x
df.loc[row, 'y'] = new_y
data.loc[row, 'x'] = new_x
data.loc[row, 'y'] = new_y

return df
return data

def morph(
self,
Expand Down Expand Up @@ -434,7 +433,7 @@ def morph(
):
raise ValueError('allowed_dist must be a non-negative numeric value.')

morphed_data = start_shape.df.copy()
morphed_data = start_shape.data.copy()

# iteration numbers that we will end up writing to file as frames
frame_numbers = self._select_frames(
Expand Down Expand Up @@ -487,7 +486,7 @@ def _tweening(
bounds=start_shape.morph_bounds,
)

if self._is_close_enough(start_shape.df, perturbed_data):
if self._is_close_enough(start_shape.data, perturbed_data):
morphed_data = perturbed_data

frame_number = record_frames(
Expand Down
8 changes: 4 additions & 4 deletions src/data_morph/plotting/static.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

@plot_with_custom_style
def plot(
df: pd.DataFrame,
data: pd.DataFrame,
x_bounds: Iterable[Number],
y_bounds: Iterable[Number],
save_to: str | Path,
Expand All @@ -35,7 +35,7 @@ def plot(

Parameters
----------
df : pandas.DataFrame
data : pandas.DataFrame
The dataset to plot.
x_bounds, y_bounds : Iterable[numbers.Number]
The plotting limits.
Expand All @@ -57,14 +57,14 @@ def plot(
)
fig.get_layout_engine().set(w_pad=1.4, h_pad=0.2, wspace=0)

ax.scatter(df.x, df.y, s=1, alpha=0.7, color='black')
ax.scatter(data.x, data.y, s=1, alpha=0.7, color='black')
ax.set(xlim=x_bounds, ylim=y_bounds)

tick_formatter = EngFormatter()
ax.xaxis.set_major_formatter(tick_formatter)
ax.yaxis.set_major_formatter(tick_formatter)

res = get_values(df)
res = get_values(data)

labels = ('X Mean', 'Y Mean', 'X SD', 'Y SD', 'Corr.')
locs = np.linspace(0.8, 0.2, num=len(labels))
Expand Down
6 changes: 3 additions & 3 deletions src/data_morph/shapes/circles.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ class Circle(Shape):
"""

def __init__(self, dataset: Dataset, radius: Number | None = None) -> None:
self.center: np.ndarray = dataset.df[['x', 'y']].mean().to_numpy()
self.center: np.ndarray = dataset.data[['x', 'y']].mean().to_numpy()
"""numpy.ndarray: The (x, y) coordinates of the circle's center."""

self.radius: Number = radius or dataset.df[['x', 'y']].std().mean() * 1.5
self.radius: Number = radius or dataset.data[['x', 'y']].std().mean() * 1.5
"""numbers.Number: The radius of the circle."""

def __repr__(self) -> str:
Expand Down Expand Up @@ -125,7 +125,7 @@ def __init__(self, dataset: Dataset, num_rings: int = 4) -> None:
if num_rings <= 1:
raise ValueError('num_rings must be greater than 1')

stdev = dataset.df.std().mean()
stdev = dataset.data.std().mean()
self.circles: list[Circle] = [
Circle(dataset, r)
for r in np.linspace(stdev / num_rings * 2, stdev * 2, num_rings)
Expand Down
4 changes: 2 additions & 2 deletions src/data_morph/shapes/lines/diamond.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ class Diamond(LineCollection):
"""

def __init__(self, dataset: Dataset) -> None:
xmin, xmax = dataset.df.x.quantile([0.05, 0.95])
ymin, ymax = dataset.df.y.quantile([0.05, 0.95])
xmin, xmax = dataset.data.x.quantile([0.05, 0.95])
ymin, ymax = dataset.data.y.quantile([0.05, 0.95])

xmid = (xmax + xmin) / 2
ymid = (ymax + ymin) / 2
Expand Down
4 changes: 2 additions & 2 deletions src/data_morph/shapes/lines/rectangle.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ class Rectangle(LineCollection):
"""

def __init__(self, dataset: Dataset) -> None:
xmin, xmax = dataset.df.x.quantile([0.1, 0.9])
ymin, ymax = dataset.df.y.quantile([0.1, 0.9])
xmin, xmax = dataset.data.x.quantile([0.1, 0.9])
ymin, ymax = dataset.data.y.quantile([0.1, 0.9])

super().__init__(
[[xmin, ymin], [xmin, ymax]],
Expand Down
4 changes: 2 additions & 2 deletions src/data_morph/shapes/points/dots_grid.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ class DotsGrid(PointCollection):
name = 'dots'

def __init__(self, dataset: Dataset) -> None:
xlow, xhigh = dataset.df.x.quantile([0.05, 0.95]).tolist()
ylow, yhigh = dataset.df.y.quantile([0.05, 0.95]).tolist()
xlow, xhigh = dataset.data.x.quantile([0.05, 0.95]).tolist()
ylow, yhigh = dataset.data.y.quantile([0.05, 0.95]).tolist()

xmid = (xhigh + xlow) / 2
ymid = (yhigh + ylow) / 2
Expand Down
4 changes: 2 additions & 2 deletions src/data_morph/shapes/points/scatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ class Scatter(PointCollection):

def __init__(self, dataset: Dataset) -> None:
rng = np.random.default_rng(1)
center = (dataset.df.x.mean(), dataset.df.y.mean())
center = (dataset.data.x.mean(), dataset.data.y.mean())
points = [center]
max_radius = max(dataset.df.x.std(), dataset.df.y.std())
max_radius = max(dataset.data.x.std(), dataset.data.y.std())
for radius in np.linspace(max_radius // 5, max_radius, num=5):
for angle in np.linspace(0, 360, num=50, endpoint=False):
points.append(
Expand Down
22 changes: 11 additions & 11 deletions tests/data/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ class TestDataset:
def test_scale_data(self, scale, starter_shapes_dir):
"""Confirm that data scaling is working by checking min and max."""

original_df = pd.read_csv(starter_shapes_dir / 'dino.csv')
original_min = original_df.min()
original_max = original_df.max()
original_data = pd.read_csv(starter_shapes_dir / 'dino.csv')
original_min = original_data.min()
original_max = original_data.max()

dataset = DataLoader.load_dataset('dino', scale=scale)

if scale:
assert_equal(dataset.df.min().to_numpy(), original_min / scale)
assert_equal(dataset.df.max().to_numpy(), original_max / scale)
assert_equal(dataset.data.min().to_numpy(), original_min / scale)
assert_equal(dataset.data.max().to_numpy(), original_max / scale)
else:
assert_frame_equal(dataset.df, original_df)
assert_frame_equal(dataset.data, original_data)

@pytest.mark.input_validation
@pytest.mark.parametrize(
Expand All @@ -52,17 +52,17 @@ def test_scale_data_invalid_scale(self, scale):
def test_validate_data_missing_columns(self, starter_shapes_dir):
"""Confirm that creation of a Dataset validates the DataFrame columns."""

df = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'a'})
data = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'a'})

with pytest.raises(ValueError, match='Columns "x" and "y" are required.'):
_ = Dataset('dino', df)
_ = Dataset('dino', data)

def test_validate_data_fix_column_casing(self, starter_shapes_dir):
"""Confirm that creating a Dataset with correct names but in wrong casing works."""

df = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'X'})
dataset = Dataset('dino', df)
assert not dataset.df[list(dataset._REQUIRED_COLUMNS)].empty
data = pd.read_csv(starter_shapes_dir / 'dino.csv').rename(columns={'x': 'X'})
dataset = Dataset('dino', data)
assert not dataset.data[list(dataset._REQUIRED_COLUMNS)].empty

@pytest.mark.bounds
@pytest.mark.parametrize(
Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_load_dataset(self, name, file, starter_shapes_dir):
assert isinstance(dataset_from_pkg, Dataset)
assert isinstance(dataset_from_file, Dataset)
assert dataset_from_pkg.name == dataset_from_file.name
assert_frame_equal(dataset_from_pkg.df, dataset_from_file.df)
assert_frame_equal(dataset_from_pkg.data, dataset_from_file.data)

@pytest.mark.input_validation
@pytest.mark.parametrize('dataset', ['does_not_exist', 'does_not_exist.csv'])
Expand Down
Loading