Skip to content

R implementation is not consistent with Python version? #9

@jolespin

Description

@jolespin

Am I doing this incorrectly? It doesn't seem that the clustering is computed in the same way.

from scipy.cluster.hierarchy import linkage
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris

def get_iris_data():
    """Load the iris dataset as a labelled feature table plus target vector.

    Returns:
        A tuple ``(X, y)``. ``X`` is a DataFrame built from ``iris.data`` with
        rows labelled ``iris_0`` .. ``iris_149`` and columns taken from
        ``iris.feature_names`` (the " (cm)" suffix dropped, spaces replaced by
        underscores). ``y`` is a Series built from ``iris.target``, named
        "Species" and sharing ``X``'s index.
    """
    iris = load_iris()

    # Row labels: one identifier per sample.
    sample_ids = [f"iris_{i}" for i in range(150)]

    # Column labels: strip the unit suffix and make the names identifier-like.
    feature_ids = [
        name.split(" (cm)")[0].replace(" ", "_")
        for name in iris.feature_names
    ]

    X = pd.DataFrame(iris.data, index=sample_ids, columns=feature_ids)
    y = pd.Series(iris.target, index=X.index, name="Species")
    return X, y

# Get data
X, y = get_iris_data()

# Create an adjacency network: absolute correlation between samples
# (X.T.corr() correlates the columns of X.T, i.e. the rows of X).
df_adj = np.abs(X.T.corr())

adj_values = df_adj.values.ravel()
print(f"Maximum value: {adj_values.max()}", f"Minimum value: {adj_values.min()}", sep="\n")

# Distance matrix
df_dism = 1 - df_adj

# Linkage
# scipy's linkage() treats a 2-D array as raw observation vectors, NOT as
# pairwise distances. Passing the square matrix directly therefore clusters
# the rows of the distance matrix as if they were points, which is not what
# R's hclust(as.dist(...)) does. Condense the square matrix first.
# checks=False skips squareform's symmetry / zero-diagonal validation so that
# floating-point round-off in `1 - |corr|` cannot make it raise.
from scipy.spatial.distance import squareform

Z = linkage(squareform(df_dism.values, checks=False), method="ward", optimal_ordering=False)

# ==============
# Python version
# ==============
from dynamicTreeCut import cutreeHybrid

# Clustering
clustering_results = cutreeHybrid(Z, df_dism.values, minClusterSize=20, deepSplit=1)

Se_treecut_from_python = pd.Series(clustering_results["labels"], index=df_dism.index)
Se_treecut_from_python.head()
# NOTE: the output below was recorded by the issue author with the square
# matrix passed straight to linkage(). The maximum/minimum lines do not depend
# on the linkage; the cutHeight and the labels may differ once the condensed
# matrix is used.
# Maximum value: 1.0
# Minimum value: 0.35739643082771205
# ..cutHeight not given, setting it to 30.477248373683015  ===>  99% of the (truncated) height range in dendro.
# ..done.
# iris_0    1
# iris_1    1
# iris_2    1
# iris_3    1
# iris_4    1
# dtype: int64

# ==============
# R version
# ==============
from rpy2 import robjects, rinterface
from rpy2.robjects.packages import importr
from rpy2.rinterface import RRuntimeError
from rpy2.robjects import pandas2ri
pandas2ri.activate()

# Hand the pandas distance matrix over to R.
# NOTE(review): py2ri / ri2py look like the rpy2 2.x converter names; newer
# rpy2 releases may expose them under different names -- confirm against the
# installed version.
r_dism = pandas2ri.py2ri(df_dism)

fastcluster = importr("fastcluster")
dynamicTreeCut = importr("dynamicTreeCut")

# Look up R's as.dist through robjects.r; the bare name `R` was never defined
# in this script and raised NameError.
# Note that Z is rebound here and no longer refers to the scipy linkage.
Z = fastcluster.hclust(robjects.r["as.dist"](r_dism), method="ward.D2")
treecut_output = dynamicTreeCut.cutreeDynamic(dendro=Z, method="hybrid", distM=r_dism, minClusterSize = 20, deepSplit=1)
Se_treecut_from_R = pd.Series(pandas2ri.ri2py(treecut_output), index=df_dism.index).astype(int)

Se_treecut_from_R.head()
# Output recorded by the issue author:
# iris_0    2
# iris_1    2
# iris_2    2
# iris_3    2
# iris_4    2
# dtype: int64

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions