I have an implementation of Mapper, but for some reason I keep getting the error "Error occurred while applying the Mapper algorithm: Visualize requires a mapper with more than 0 nodes." I've changed my clustering parameters without any luck, and I've tested with fairly large synthetic data just to be sure I had enough samples, again with no luck. I'll share the relevant functions below. My apologies if the reason is obvious; I am new to TDA.

```python
import sys

import hdbscan
import kmapper as km
import numpy as np
from kmapper import Cover
from scipy.stats import kurtosis, skew
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler, QuantileTransformer,
                                   RobustScaler, StandardScaler)


def generate_synthetic_histograms(num_samples, num_bins, max_value=5000):
"""
Generate synthetic histogram data for testing.
Args:
num_samples (int): Number of histogram samples to generate.
num_bins (int): Number of bins in each histogram.
max_value (float): Maximum value for the histogram bins.
Returns:
list: Generated synthetic histogram data.
"""
synthetic_histograms = np.random.rand(num_samples, num_bins).astype(np.float32) * max_value
return synthetic_histograms.tolist() def dynamically_standardize_data(input_data, scaler_method='auto'):
"""
Dynamically choose a scikit-learn standardization method based on statistical
properties of the data.
Args:
input_data (np.ndarray): Input data array.
scaler_method (str): Scaling method for standardizing the data.
Returns:
np.ndarray: Standardized data.
"""
    # Assess sparsity
    sparsity_assessment, sparsity_ratio = assess_sparsity(input_data, threshold=0.5)
    if sparsity_assessment == "sparse":
        scaler = MaxAbsScaler()
    else:
        if scaler_method == 'auto':
            data_variance = np.var(input_data, axis=0)
            data_skewness = skew(input_data, axis=0)
            data_kurtosis = kurtosis(input_data, axis=0)
            variance_threshold = np.percentile(data_variance, 75)
            if (np.any(data_variance > variance_threshold) or
                    np.max(data_skewness) > 1.0 or
                    np.max(data_kurtosis) > 3.0):
                scaler = RobustScaler()
            else:
                scaler = StandardScaler()
        elif scaler_method == 'standard':
            scaler = StandardScaler()
        elif scaler_method == 'robust':
            scaler = RobustScaler()
        elif scaler_method == 'minmax':
            scaler = MinMaxScaler()
        elif scaler_method == 'maxabs':
            scaler = MaxAbsScaler()
        elif scaler_method == 'quantile':
            scaler = QuantileTransformer(output_distribution='normal')  # or 'uniform'
        else:
            raise ValueError(f"Unknown scaler method: {scaler_method}")
    standardized_data = scaler.fit_transform(input_data)
    return standardized_data, sparsity_assessment


def assess_sparsity(data, threshold):
"""
Assess if the data is sparse or dense.
Args:
data (list or np.ndarray): Input data.
threshold (float): Cutoff point for separating dense from sparse distributions.
Returns:
tuple:
assessment (str): "sparse" or "dense"
sparsity_ratio (float): Elements with values of zero or NaN, divided by
total elements.
"""
total_elements = data.size
zero_or_nan_elements = np.count_nonzero((data == 0) | np.isnan(data))
sparsity_ratio = zero_or_nan_elements / total_elements
return ("sparse" if sparsity_ratio > threshold else "dense"), sparsity_ratio def apply_mapper_algorithm(input_data, input_type, scaler_method='auto', n_cubes=10,
perc_overlap=0.5, min_samples=None):
"""
Apply the Mapper algorithm on the given input data and save the resulting
visualization.
Args:
input_data (np.ndarray): Input data array.
input_type (str): Type of input data ("images", "images_npy", "histograms",
"features", "features_fisher_vectors",
"histograms_fisher_vectors").
scaler_method (str): Scaling method for standardizing the data.
n_cubes (int, optional): Number of hypercubes to create in the cover. Default is 10.
perc_overlap (float, optional): Amount of overlap between adjacent cubes calculated
only along 1 dimension. Default is 0.5.
min_samples (int, optional): The minimum size of clusters for HDBSCAN. Default is None.
Returns:
str: Generated HTML containing the Mapper visualization.
Raises:
Exception: If an error occurs during the application of the Mapper algorithm.
"""
    try:
        # Verify input data
        if len(input_data) == 0:
            raise ValueError("Input data is empty")
        mapper = km.KeplerMapper(verbose=2)  # 0 disables logging; 2 is most verbose
        # Adjust cover parameters
        cov = Cover(n_cubes=n_cubes, perc_overlap=perc_overlap)
        all_data = prepare_all_data(input_type, input_data)  # helper defined elsewhere in my code
        if all_data is None or len(all_data) == 0:
            raise ValueError("Processed data is empty or None")
        all_data_std, sparsity_assessment = dynamically_standardize_data(all_data, scaler_method)
        print(f"Sparsity assessment: {sparsity_assessment}")
        # Intermediate dimensionality reduction for sparse data
        if sparsity_assessment == "sparse":
            intermediate_components = min(50, all_data_std.shape[0] - 1)  # ensure a valid number of components
            intermediate_svd = TruncatedSVD(n_components=intermediate_components)
            all_data_std = intermediate_svd.fit_transform(all_data_std)
        # Create the projection/lens from the data
        mapper_lens = mapper.project(
            all_data_std, projection=TSNE(),
            scaler=None,  # scaling has already been done
        )
        # Apply the Mapper algorithm, with HDBSCAN clustering
        mapper_graph = mapper.map(
            mapper_lens,
            all_data,
            cover=cov,
            clusterer=hdbscan.HDBSCAN(
                min_samples=min_samples,  # pass the function argument through to HDBSCAN
                min_cluster_size=3, cluster_selection_epsilon=0.0,
                cluster_selection_method='eom'
            )
        )
        # Produce the visualization markup.
        mapper_html = mapper.visualize(
            mapper_graph,
            lens=mapper_lens,
            include_searchbar=False,
            save_file=False
        )
        return mapper_html
    except Exception as e:
        print(f"Error occurred while applying the Mapper algorithm: {e}", file=sys.stderr)
        return None
```

For testing, I call `generate_synthetic_histograms` to produce synthetic image-histogram data, then pass that data forward. I have tried 5000 x 256 as well, without any nodes being generated. Do you see the issue? Thank you in advance for your time and attention.
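For reference, here is a minimal, self-contained sketch of how I am checking the node count before visualizing. The random data and the cheap 1-D "sum" lens are placeholders standing in for my real pipeline, not my actual code:

```python
# Minimal standalone check (placeholder data and lens, not my real pipeline):
# inspect the node count before calling visualize to avoid the zero-node error.
import hdbscan
import kmapper as km
import numpy as np

data = np.random.rand(200, 256).astype(np.float32) * 5000
mapper = km.KeplerMapper(verbose=0)
lens = mapper.project(data, projection="sum", scaler=None)  # cheap 1-D stand-in lens
graph = mapper.map(lens, data,
                   cover=km.Cover(n_cubes=10, perc_overlap=0.5),
                   clusterer=hdbscan.HDBSCAN(min_cluster_size=3))

print(f"Mapper produced {len(graph['nodes'])} nodes")
if graph["nodes"]:  # visualize raises if the graph has zero nodes
    html = mapper.visualize(graph, lens=lens, save_file=False)
```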
---
Oh dear. User error (me, in other words). Embarrassing! Either way, if anyone has any comments about the sanity of the code, I am all ears!
---
I appreciate your prompting me! In this case, it was simply that I was not using the proper PHP test document, so there's really nothing to share.
I will say, though, that in further testing I found that Mapper needs around 150 histograms (shape (150, 256)) before it generates any nodes. I figure this is expected? I have certainly tried a wide variety of parameters.
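For anyone curious, here is a rough sketch of the kind of parameter sweep I mean. My own back-of-envelope intuition (an assumption, not anything from the kepler-mapper docs) is that with the 2-D TSNE lens of my real pipeline, n_cubes=10 means on the order of 10 x 10 = 100 overlapping cover cells, and HDBSCAN with min_cluster_size=3 needs at least 3 points in a cell before it can emit a node, so uniform random histograms spread thin until the sample count grows. The sketch below uses a cheap 1-D "sum" lens instead of TSNE, purely to keep it fast and self-contained:

```python
# Hypothetical sweep: count the nodes Mapper produces as sample size and cover
# granularity vary. The 1-D "sum" lens stands in for my real TSNE lens.
import hdbscan
import kmapper as km
import numpy as np

for num_samples in (50, 100, 150, 500):
    data = np.random.rand(num_samples, 256).astype(np.float32) * 5000
    mapper = km.KeplerMapper(verbose=0)
    lens = mapper.project(data, projection="sum", scaler=None)
    for n_cubes in (5, 10, 20):
        graph = mapper.map(
            lens, data,
            cover=km.Cover(n_cubes=n_cubes, perc_overlap=0.5),
            clusterer=hdbscan.HDBSCAN(min_cluster_size=3),
        )
        print(f"{num_samples:4d} samples, n_cubes={n_cubes:2d} -> "
              f"{len(graph['nodes'])} nodes")
```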