Skip to content

Commit 4458dbf

Browse files
authored
Fix long calculation times when using duration representation. (#98)
* Fix long calculation times when using duration representation. The implementation now uses NumPy operations where possible and minimizes DataFrame manipulations.
1 parent c26b8a1 commit 4458dbf

File tree

4 files changed

+68
-57
lines changed

4 files changed

+68
-57
lines changed

.github/workflows/daily_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ jobs:
2121
strategy:
2222
fail-fast: false
2323
matrix:
24-
os: ["ubuntu-latest","ubuntu-22.04", "macos-latest","macos-13", "windows-latest","windows-2019"]
24+
os: ["ubuntu-latest","ubuntu-22.04", "macos-latest","macos-13", "windows-latest","windows-2022"]
2525
# os: ["ubuntu-latest"]
2626
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13"]
2727

.github/workflows/test_on_push_and_pull.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ jobs:
1717
strategy:
1818
fail-fast: false
1919
matrix:
20-
os: ["ubuntu-latest","ubuntu-22.04", "macos-latest","macos-13", "windows-latest","windows-2019"]
20+
os: ["ubuntu-latest","ubuntu-22.04", "macos-latest","macos-13", "windows-latest","windows-2022"]
2121
python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13"]
2222

2323
steps:
@@ -43,7 +43,7 @@ jobs:
4343
strategy:
4444
fail-fast: false
4545
matrix:
46-
os: ["ubuntu-latest","ubuntu-22.04", "macos-latest","macos-13", "windows-latest","windows-2019"]
46+
os: ["ubuntu-latest","ubuntu-22.04", "macos-latest","macos-13", "windows-latest","windows-2022"]
4747
python-numpy-version: [ {python : 3.9,numpy : 1.25}, {python : 3.9,numpy : 1.26},{python : 3.9,numpy : 2.0}]
4848
steps:
4949
- uses: actions/checkout@v2
@@ -60,4 +60,4 @@ jobs:
6060
working-directory: ./test/
6161
run: |
6262
pytest
63-
codecov
63+
codecov

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
55

66
[project]
77
name = "tsam"
8-
version = "2.3.7"
8+
version = "2.3.8"
99
description = "Time series aggregation module (tsam) to create typical periods"
1010
authors = [
1111
{ name = "Leander Kotzur", email = "leander.kotzur@googlemail.com" },

src/tsam/utils/durationRepresentation.py

Lines changed: 63 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -28,69 +28,80 @@ def durationRepresentation(
2828
:type representMinMax: bool
2929
"""
3030

31-
# make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
32-
# the time steps inside the candidates.
33-
columnTuples = []
34-
for i in range(int(candidates.shape[1] / timeStepsPerPeriod)):
35-
for j in range(timeStepsPerPeriod):
36-
columnTuples.append((i, j))
37-
candidates = pd.DataFrame(
38-
candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
31+
# Convert candidates to numpy array at the beginning if it's a DataFrame
32+
if isinstance(candidates, pd.DataFrame):
33+
candidates_array = candidates.values
34+
else:
35+
candidates_array = candidates
36+
37+
# Build a DataFrame whose two-level columns are (attribute, time step); it is used below for label-based selection
38+
columnTuples = [(i, j) for i in range(int(candidates_array.shape[1] / timeStepsPerPeriod))
39+
for j in range(timeStepsPerPeriod)]
40+
41+
candidates_df = pd.DataFrame(
42+
candidates_array, columns=pd.MultiIndex.from_tuples(columnTuples)
3943
)
40-
41-
# There are two options for the duration representation. Either, the distribution of each cluster is preserved
42-
# (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
43-
# inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
44+
4445
if distributionPeriodWise:
4546
clusterCenters = []
46-
for clusterNum in np.unique(clusterOrder):
47-
indice = np.where(clusterOrder == clusterNum)
48-
noCandidates = len(indice[0])
49-
clean_index = []
50-
51-
clusterCenter = []
52-
# get a clean index depending on the size
53-
for y in candidates.columns.levels[1]:
54-
for x in range(noCandidates):
55-
clean_index.append((x, y))
56-
for a in candidates.columns.levels[0]:
57-
# get all the values of a certain attribute and cluster
58-
candidateValues = candidates.loc[indice[0], a]
59-
# sort all values
60-
sortedAttr = candidateValues.stack(
61-
future_stack=True,
62-
).sort_values()
63-
# reindex and arrange such that every sorted segment gets represented by its mean
64-
sortedAttr.index = pd.MultiIndex.from_tuples(clean_index)
65-
representationValues = sortedAttr.unstack(level=0).mean(axis=1)
66-
# respect max and min of the attributes
47+
unique_clusters = np.unique(clusterOrder)
48+
49+
for clusterNum in unique_clusters:
50+
indice = np.where(clusterOrder == clusterNum)[0]
51+
noCandidates = len(indice)
52+
53+
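# Pre-allocate the cluster center buffer (an upper bound: only timeStepsPerPeriod values per attribute are written; the unused tail is trimmed when appending)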
54+
cluster_values_count = noCandidates * timeStepsPerPeriod * len(candidates_df.columns.levels[0])
55+
clusterCenter = np.zeros(cluster_values_count)
56+
current_idx = 0
57+
58+
for a in candidates_df.columns.levels[0]:
59+
# Get values using numpy indexing when possible
60+
candidateValues = candidates_df.loc[indice, a].values
61+
62+
# Reshape to more easily work with numpy
63+
candidateValues_reshaped = candidateValues.reshape(-1)
64+
65+
# Sort values using numpy
66+
sorted_values = np.sort(candidateValues_reshaped)
67+
68+
# Calculate representative values directly
69+
values_per_timestep = noCandidates
70+
representation_values = np.zeros(timeStepsPerPeriod)
71+
72+
for t in range(timeStepsPerPeriod):
73+
start_idx = t * values_per_timestep
74+
end_idx = start_idx + values_per_timestep
75+
representation_values[t] = np.mean(sorted_values[start_idx:end_idx])
76+
77+
# Handle min/max representation if needed
6778
if representMinMax:
68-
representationValues.loc[0] = sortedAttr.values[0]
69-
representationValues.loc[representationValues.index[-1]] = (
70-
sortedAttr.values[-1]
71-
)
72-
73-
# get the order of the representation values such that euclidean distance to the candidates is minimized
74-
order = candidateValues.mean().sort_values().index
75-
# arrange
76-
representationValues.index = order
77-
representationValues.sort_index(inplace=True)
78-
79-
# add to cluster center
80-
clusterCenter = np.append(clusterCenter, representationValues.values)
81-
82-
clusterCenters.append(clusterCenter)
83-
79+
representation_values[0] = sorted_values[0]
80+
representation_values[-1] = sorted_values[-1]
81+
82+
# Re-order values based on the mean of candidate values
83+
mean_values = np.mean(candidateValues, axis=0)
84+
order_indices = np.argsort(mean_values)
85+
86+
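# Reorder representation values: output position k takes the value of rank order_indices[k]
# NOTE(review): the removed pandas code assigned the k-th smallest value to time step order[k] (the inverse mapping) - confirm which direction is intended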
87+
representation_values_ordered = representation_values[order_indices]
88+
89+
# Add to cluster center
90+
clusterCenter[current_idx:current_idx+len(representation_values)] = representation_values_ordered
91+
current_idx += len(representation_values)
92+
93+
clusterCenters.append(clusterCenter[:current_idx]) # Trim if we didn't use the whole pre-allocation
94+
8495
else:
8596
clusterCentersList = []
86-
for a in candidates.columns.levels[0]:
97+
for a in candidates_df.columns.levels[0]:
8798
meanVals = []
8899
clusterLengths = []
89100
for clusterNum in np.unique(clusterOrder):
90101
indice = np.where(clusterOrder == clusterNum)
91102
noCandidates = len(indice[0])
92103
# get all the values of a certain attribute and cluster
93-
candidateValues = candidates.loc[indice[0], a]
104+
candidateValues = candidates_df.loc[indice[0], a]
94105
# calculate centroid of each cluster and append to list
95106
meanVals.append(candidateValues.mean())
96107
# make a list of weights of each cluster for each time step within the period
@@ -113,7 +124,7 @@ def durationRepresentation(
113124
order = meansAndWeightsSorted.index
114125
# sort all values of the original time series
115126
sortedAttr = (
116-
candidates.loc[:, a]
127+
candidates_df.loc[:, a]
117128
.stack(
118129
future_stack=True,
119130
)

0 commit comments

Comments
 (0)