-
Notifications
You must be signed in to change notification settings - Fork 11
Open
Description
Am I doing this incorrectly? The clustering does not seem to be computed in the same way by the Python version and the R version: the two assign different cluster labels to the same samples.
from scipy.cluster.hierarchy import linkage
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
def get_iris_data():
    """Load the iris dataset as a labelled feature table and a target series.

    Returns:
        tuple: ``(X, y)``. ``X`` is a DataFrame built from ``iris.data`` with
        rows labelled ``iris_0``, ``iris_1``, ... and columns taken from
        ``iris.feature_names`` with the " (cm)" suffix dropped and spaces
        replaced by underscores. ``y`` is a Series named "Species" holding
        ``iris.target`` on the same index as ``X``.
    """
    iris = load_iris()
    # Derive the row labels from the data itself instead of hard-coding 150,
    # so the index always matches the number of samples actually loaded.
    sample_ids = [f"iris_{i}" for i in range(len(iris.data))]
    feature_ids = [
        name.split(" (cm)")[0].replace(" ", "_") for name in iris.feature_names
    ]
    # Iris dataset
    X = pd.DataFrame(iris.data, index=sample_ids, columns=feature_ids)
    y = pd.Series(iris.target, index=X.index, name="Species")
    return X, y
# Get data
X, y = get_iris_data()

# Create an adjacency network: absolute pairwise correlation between samples
# (X is transposed so that the samples become the columns corr() compares).
df_adj = np.abs(X.T.corr())
print("Maximum value: {}".format(df_adj.values.ravel().max()), "Minimum value: {}".format(df_adj.values.ravel().min()), sep="\n")

# Distance matrix
df_dism = 1 - df_adj

# Linkage
# scipy's linkage() treats a 2-D array as raw observation vectors and would
# compute Euclidean distances between the rows of the dissimilarity matrix
# instead of using the dissimilarities themselves. Passing the condensed
# (upper-triangle) form makes it cluster on the precomputed distances, which
# is what R's hclust(as.dist(...)) does.
# checks=False skips the symmetry / zero-diagonal validation, which can trip
# on floating-point noise left by ``1 - |corr|``.
from scipy.spatial.distance import squareform

Z = linkage(squareform(df_dism.values, checks=False), method="ward", optimal_ordering=False)
# ==============
# Python version
# ==============
from dynamicTreeCut import cutreeHybrid

# Clustering: cut the scipy dendrogram Z with the Python port of
# dynamicTreeCut, handing it the square dissimilarity matrix as well.
clustering_results = cutreeHybrid(
    Z,
    df_dism.values,
    minClusterSize=20,
    deepSplit=1,
)

# Attach the sample names to the labels returned under the "labels" key.
Se_treecut_from_python = pd.Series(
    clustering_results["labels"],
    index=df_dism.index,
)
Se_treecut_from_python.head()
# Maximum value: 1.0
# Minimum value: 0.35739643082771205
# ..cutHeight not given, setting it to 30.477248373683015 ===> 99% of the (truncated) height range in dendro.
# ..done.
# iris_0 1
# iris_1 1
# iris_2 1
# iris_3 1
# iris_4 1
# dtype: int64
# ==============
# R version
# ==============
from rpy2 import robjects, rinterface
from rpy2.robjects.packages import importr
from rpy2.rinterface import RRuntimeError
from rpy2.robjects import pandas2ri

# Enable automatic pandas <-> R conversion.
pandas2ri.activate()

# NOTE(review): py2ri / ri2py are the rpy2 2.x names; newer rpy2 releases
# appear to expose these conversions under different names. Confirm against
# the installed rpy2 version.
r_dism = pandas2ri.py2ri(df_dism)

fastcluster = importr("fastcluster")
dynamicTreeCut = importr("dynamicTreeCut")

# Look up R's as.dist through robjects.r; the original referenced an
# undefined name ``R`` here, which raises NameError.
Z = fastcluster.hclust(robjects.r["as.dist"](r_dism), method="ward.D2")
treecut_output = dynamicTreeCut.cutreeDynamic(dendro=Z, method="hybrid", distM=r_dism, minClusterSize = 20, deepSplit=1)

# Convert the R result back to Python and attach the sample names.
Se_treecut_from_R = pd.Series(pandas2ri.ri2py(treecut_output), index=df_dism.index).astype(int)
Se_treecut_from_R.head()
# iris_0 2
# iris_1 2
# iris_2 2
# iris_3 2
# iris_4 2
# dtype: int64
Metadata
Assignees
Labels
No labels