# K-means requires that you specify the number of clusters in advance, which is a classic chicken-and-egg problem.
# To solve it we start with the within-cluster sum of squares (elbow) method:
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
wss <- (nrow(som_model$codes[[1]]) - 1) * sum(apply(data.frame(som_model$codes[[1]]), 2, var))
for (i in 2:12) {
  wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers = i)$withinss)
}
plot(wss, pch = 19, ylab = 'Within-cluster sum of squares', xlab = 'K')

# Optimal number of clusters
# https://statsandr.com/blog/clustering-analysis-k-means-and-hierarchical-clustering-by-hand-and-in-r/

# Install factoextra and NbClust packages if needed
if (!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
if (!require(NbClust)) install.packages("NbClust", repos = "http://cran.us.r-project.org")
# Load factoextra and NbClust libraries
library(factoextra)
library(NbClust)

# Elbow method
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) + # vertical line at the candidate elbow for easier reading
  labs(subtitle = "Elbow method") # add subtitle

# Silhouette method
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

# Gap statistic
set.seed(5)
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans,
             nstart = 25,
             method = "gap_stat",
             nboot = 500 # reduce it for lower computation time (but less precise results)
) +
  labs(subtitle = "Gap statistic method")

nbclust_out <- NbClust(
  data = data.frame(som_model$codes[[1]]),
  distance = "euclidean",
  min.nc = 2,       # minimum number of clusters
  max.nc = 12,      # maximum number of clusters
  method = "kmeans" # one of: "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid", "kmeans"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
##       In the plot of the Hubert index, we seek a significant knee that corresponds to a
##       significant increase of the value of the measure, i.e. the significant peak in the
##       Hubert index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
##       In the plot of the D index, we seek a significant knee (the significant peak in the
##       Dindex second differences plot) that corresponds to a significant increase of the
##       value of the measure.
##

# Create a data frame of the optimal number of clusters chosen by each index
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# Keep only indices that select between 2 and 12 clusters
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 12)

# Plot how often each cluster count is recommended
ggplot(nbclust_plot) +
  aes(x = clusters) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(x = "Number of clusters", y = "Frequency among all indices",
       title = "Optimal number of clusters") +
  theme_minimal()
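# Optional sanity check (a minimal sketch, reusing nbclust_out from above):
# the first row of Best.nc holds the k recommended by each index, so a quick
# table of those votes shows the consensus at a glance.
table(nbclust_out$Best.nc[1, ])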
# Based on the exploration above, set the number of clusters here.
# k <- nbclust_plot$clusters[which.max(nbclust_plot$???)]
# Pick k as the cluster count recommended most often across the NbClust indices
k <- as.numeric(names(which.max(table(nbclust_plot$clusters))))
k

# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers = k)
plot(som_model,
     main = '',
     type = "property",
     property = som.cluster$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster$cluster)

# Hierarchical Clustering
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
# cutree() converts the dendrogram into k cluster memberships, one per SOM node
som.hclust <- hclust(dist(data.frame(som_model$codes[[1]])), method = "single")
som.cluster <- cutree(som.hclust, k)
plot(som_model,
     main = '',
     type = "property",
     property = som.cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster)

# Hierarchical K-Means Clustering: Optimize Clusters
# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/

# Install factoextra package if needed
if (!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
# Load factoextra library
library(factoextra)

res.hk <- hkmeans(data.frame(som_model$codes[[1]]), k)
# Elements returned by hkmeans()
names(res.hk)
# Print the results
str(res.hk)

# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco",
          rect = TRUE, rect_border = "jco", rect_fill = TRUE)

# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE,
             ggtheme = theme_classic())

# hkmeans() returns k-means-style cluster memberships in res.hk$cluster
plot(som_model,
     main = '',
     type = "property",
     property = res.hk$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, res.hk$cluster)
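# Optional follow-up (a sketch, assuming som_model was trained with kohonen::som
# on the original data matrix): unit.classif gives the winning SOM node for each
# observation, so indexing the node-level cluster labels with it attaches a
# cluster to every original record. obs.cluster is just an illustrative name.
obs.cluster <- res.hk$cluster[som_model$unit.classif]
table(obs.cluster)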