@@ -28,69 +28,80 @@ def durationRepresentation(
2828 :type representMinMax: bool
2929 """
3030
31- # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
32- # the time steps inside the candidates.
33- columnTuples = []
34- for i in range (int (candidates .shape [1 ] / timeStepsPerPeriod )):
35- for j in range (timeStepsPerPeriod ):
36- columnTuples .append ((i , j ))
37- candidates = pd .DataFrame (
38- candidates , columns = pd .MultiIndex .from_tuples (columnTuples )
31+ # Convert candidates to numpy array at the beginning if it's a DataFrame
32+ if isinstance (candidates , pd .DataFrame ):
33+ candidates_array = candidates .values
34+ else :
35+ candidates_array = candidates
36+
37+ # Create a pandas DataFrame only when necessary
38+ columnTuples = [(i , j ) for i in range (int (candidates_array .shape [1 ] / timeStepsPerPeriod ))
39+ for j in range (timeStepsPerPeriod )]
40+
41+ candidates_df = pd .DataFrame (
42+ candidates_array , columns = pd .MultiIndex .from_tuples (columnTuples )
3943 )
40-
41- # There are two options for the duration representation. Either, the distribution of each cluster is preserved
42- # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
43- # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
44+
4445 if distributionPeriodWise :
4546 clusterCenters = []
46- for clusterNum in np .unique (clusterOrder ):
47- indice = np .where (clusterOrder == clusterNum )
48- noCandidates = len (indice [0 ])
49- clean_index = []
50-
51- clusterCenter = []
52- # get a clean index depending on the size
53- for y in candidates .columns .levels [1 ]:
54- for x in range (noCandidates ):
55- clean_index .append ((x , y ))
56- for a in candidates .columns .levels [0 ]:
57- # get all the values of a certain attribute and cluster
58- candidateValues = candidates .loc [indice [0 ], a ]
59- # sort all values
60- sortedAttr = candidateValues .stack (
61- future_stack = True ,
62- ).sort_values ()
63- # reindex and arrange such that every sorted segment gets represented by its mean
64- sortedAttr .index = pd .MultiIndex .from_tuples (clean_index )
65- representationValues = sortedAttr .unstack (level = 0 ).mean (axis = 1 )
66- # respect max and min of the attributes
47+ unique_clusters = np .unique (clusterOrder )
48+
49+ for clusterNum in unique_clusters :
50+ indice = np .where (clusterOrder == clusterNum )[0 ]
51+ noCandidates = len (indice )
52+
53+ # Pre-allocate the full cluster center array
54+ cluster_values_count = noCandidates * timeStepsPerPeriod * len (candidates_df .columns .levels [0 ])
55+ clusterCenter = np .zeros (cluster_values_count )
56+ current_idx = 0
57+
58+ for a in candidates_df .columns .levels [0 ]:
59+ # Get values using numpy indexing when possible
60+ candidateValues = candidates_df .loc [indice , a ].values
61+
62+ # Reshape to more easily work with numpy
63+ candidateValues_reshaped = candidateValues .reshape (- 1 )
64+
65+ # Sort values using numpy
66+ sorted_values = np .sort (candidateValues_reshaped )
67+
68+ # Calculate representative values directly
69+ values_per_timestep = noCandidates
70+ representation_values = np .zeros (timeStepsPerPeriod )
71+
72+ for t in range (timeStepsPerPeriod ):
73+ start_idx = t * values_per_timestep
74+ end_idx = start_idx + values_per_timestep
75+ representation_values [t ] = np .mean (sorted_values [start_idx :end_idx ])
76+
77+ # Handle min/max representation if needed
6778 if representMinMax :
68- representationValues . loc [0 ] = sortedAttr . values [0 ]
69- representationValues . loc [ representationValues . index [ - 1 ]] = (
70- sortedAttr . values [ - 1 ]
71- )
72-
73- # get the order of the representation values such that euclidean distance to the candidates is minimized
74- order = candidateValues . mean (). sort_values (). index
75- # arrange
76- representationValues . index = order
77- representationValues . sort_index ( inplace = True )
78-
79- # add to cluster center
80- clusterCenter = np . append ( clusterCenter , representationValues . values )
81-
82- clusterCenters .append (clusterCenter )
83-
79+ representation_values [0 ] = sorted_values [0 ]
80+ representation_values [ - 1 ] = sorted_values [ - 1 ]
81+
82+ # Re-order values based on the mean of candidate values
83+ mean_values = np . mean ( candidateValues , axis = 0 )
84+ order_indices = np . argsort ( mean_values )
85+
86+ # Reorder representation values
87+ representation_values_ordered = representation_values [ order_indices ]
88+
89+ # Add to cluster center
90+ clusterCenter [ current_idx : current_idx + len ( representation_values )] = representation_values_ordered
91+ current_idx += len ( representation_values )
92+
93+ clusterCenters .append (clusterCenter [: current_idx ]) # Trim if we didn't use the whole pre-allocation
94+
8495 else :
8596 clusterCentersList = []
86- for a in candidates .columns .levels [0 ]:
97+ for a in candidates_df .columns .levels [0 ]:
8798 meanVals = []
8899 clusterLengths = []
89100 for clusterNum in np .unique (clusterOrder ):
90101 indice = np .where (clusterOrder == clusterNum )
91102 noCandidates = len (indice [0 ])
92103 # get all the values of a certain attribute and cluster
93- candidateValues = candidates .loc [indice [0 ], a ]
104+ candidateValues = candidates_df .loc [indice [0 ], a ]
94105 # calculate centroid of each cluster and append to list
95106 meanVals .append (candidateValues .mean ())
96107 # make a list of weights of each cluster for each time step within the period
@@ -113,7 +124,7 @@ def durationRepresentation(
113124 order = meansAndWeightsSorted .index
114125 # sort all values of the original time series
115126 sortedAttr = (
116- candidates .loc [:, a ]
127+ candidates_df .loc [:, a ]
117128 .stack (
118129 future_stack = True ,
119130 )
0 commit comments