141 lines
4.7 KiB
R
141 lines
4.7 KiB
R
|
|
#K-means requires that you specify the number of clusters in advance, which is always a fun chicken and egg problem.
|
|
#To solve it we use the within-clusters sum of squares method:
|
|
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
|
|
|
|
wss <- (nrow(som_model$codes[[1]])-1)*sum(apply(data.frame(som_model$codes[[1]]),2,var))
|
|
for (i in 2:12) {
|
|
wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers=i)$withinss)
|
|
}
|
|
|
|
plot(wss,
|
|
pch = 19,
|
|
ylab = 'Within-clusters sum of squares',
|
|
xlab = 'K')
|
|
|
|
# Optimal number of clusters
|
|
# https://statsandr.com/blog/clustering-analysis-k-means-and-hierarchical-clustering-by-hand-and-in-r/
|
|
|
|
|
|
# Install factoextra package if needed
|
|
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
|
|
|
|
# Load factoextra and NbClust libraries
|
|
library(factoextra)
|
|
library(NbClust)
|
|
|
|
# Elbow method
|
|
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") +
|
|
geom_vline(xintercept = 4, linetype = 2) + # add line for better visualisation
|
|
labs(subtitle = "Elbow method") # add subtitle
|
|
|
|
# Silhouette method
|
|
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") +
|
|
labs(subtitle = "Silhouette method")
|
|
|
|
# Gap statistic
|
|
set.seed(5)
|
|
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans,
|
|
nstart = 25,
|
|
method = "gap_stat",
|
|
nboot = 500 # reduce it for lower computation time (but less precise results)
|
|
) +
|
|
labs(subtitle = "Gap statistic method")
|
|
|
|
nbclust_out <- NbClust(
|
|
data = data.frame(som_model$codes[[1]]),
|
|
distance = "euclidean",
|
|
min.nc = 2, # minimum number of clusters
|
|
max.nc = 12, # maximum number of clusters
|
|
method = "kmeans" # one of: "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid", "kmeans"
|
|
)
|
|
|
|
## *** : The Hubert index is a graphical method of determining the number of clusters.
|
|
## In the plot of Hubert index, we seek a significant knee that corresponds to a
|
|
## significant increase of the value of the measure i.e the significant peak in Hubert
|
|
## index second differences plot.
|
|
##
|
|
|
|
## *** : The D index is a graphical method of determining the number of clusters.
|
|
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
|
|
## second differences plot) that corresponds to a significant increase of the value of
|
|
## the measure.
|
|
##
|
|
|
|
|
|
# create a dataframe of the optimal number of clusters
|
|
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
|
|
|
|
# select only indices which select between 2 and 12 clusters
|
|
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 12)
|
|
|
|
# create plot
|
|
ggplot(nbclust_plot) +
|
|
aes(x = clusters) +
|
|
geom_histogram(bins = 30L, fill = "#0c4c8a") +
|
|
labs(x = "Number of clusters", y = "Frequency among all indices", title = "Optimal number of clusters") +
|
|
theme_minimal()
|
|
|
|
# Based on exploration above, set number of clusers here.
|
|
# k <- nbclust_plot$clusters[which.max(nbclust_plot$???)]
|
|
k <- as.numeric(names(table(nbclust_plot))[which.max(table(nbclust_plot))])
|
|
k
|
|
|
|
# K-Means Clustering
|
|
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
|
|
|
|
som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers=k)
|
|
|
|
plot(som_model,
|
|
main = '',
|
|
type = "property",
|
|
property = som.cluster$cluster,
|
|
palette.name = topo.colors)
|
|
add.cluster.boundaries(som_model, som.cluster$cluster)
|
|
|
|
# Hierarchical Clustering
|
|
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
|
|
|
|
som.cluster <- hclust(dist(data.frame(som_model$codes[[1]])), method="single")
|
|
|
|
plot(som_model,
|
|
main = '',
|
|
type = "property",
|
|
property = som.cluster$order,
|
|
palette.name = topo.colors)
|
|
|
|
add.cluster.boundaries(som_model, som.cluster$order)
|
|
|
|
# Hierarchical K-Means Clustering: Optimize Clusters
|
|
# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/
|
|
|
|
# Install factoextra package if needed
|
|
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
|
|
|
|
# Load factoextra library
|
|
library(factoextra)
|
|
|
|
res.hk <-hkmeans(data.frame(som_model$codes[[1]]), k)
|
|
|
|
# Elements returned by hkmeans()
|
|
names(res.hk)
|
|
|
|
# Print the results
|
|
str(res.hk)
|
|
|
|
# Visualize the tree
|
|
fviz_dend(res.hk, cex = 0.6, palette = "jco",
|
|
rect = TRUE, rect_border = "jco", rect_fill = TRUE)
|
|
|
|
# Visualize the hkmeans final clusters
|
|
fviz_cluster(res.hk, palette = "jco", repel = TRUE,
|
|
ggtheme = theme_classic())
|
|
|
|
plot(som_model,
|
|
main = '',
|
|
type = "property",
|
|
property = res.hk$hclust$order,
|
|
palette.name = topo.colors)
|
|
|
|
add.cluster.boundaries(som_model, res.hk$hclust$order)
|