# ransomware/scratch/Clusters.R

# K-means requires that you specify the number of clusters in advance, which is always a fun chicken-and-egg problem.
# To solve it we use the within-clusters sum of squares method:
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
# wss[1] is the total sum of squares (the k = 1 case); the rest come from k-means runs for k = 2..12
wss <- (nrow(som_model$codes[[1]]) - 1) * sum(apply(data.frame(som_model$codes[[1]]), 2, var))
for (i in 2:12) {
  wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers = i)$withinss)
}
plot(wss,
     pch = 19,
     ylab = 'Within-clusters sum of squares',
     xlab = 'K')
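# Note (not in the original script): kmeans() uses random starts, so the wss
# values for k >= 2 can wobble slightly between runs; calling set.seed() before
# the loop makes the curve reproducible.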
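# A rough programmatic read of the elbow (a sketch, not part of the original
# workflow): the k with the largest second difference in wss is often where the
# curve bends. Treat it as a hint to compare against the plots below.
wss_second_diff <- diff(diff(wss)) # second differences; element i corresponds to k = i + 1
elbow_candidate <- which.max(wss_second_diff) + 1
elbow_candidate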
# Optimal number of clusters
# https://statsandr.com/blog/clustering-analysis-k-means-and-hierarchical-clustering-by-hand-and-in-r/
# Install factoextra and NbClust packages if needed
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
if(!require(NbClust)) install.packages("NbClust", repos = "http://cran.us.r-project.org")
# Load factoextra and NbClust libraries
library(factoextra)
library(NbClust)
# Elbow method
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) + # add line for better visualisation
  labs(subtitle = "Elbow method") # add subtitle
# Silhouette method
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")
# Gap statistic
set.seed(5)
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans,
             nstart = 25,
             method = "gap_stat",
             nboot = 500 # reduce it for lower computation time (but less precise results)
) +
  labs(subtitle = "Gap statistic method")
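# Note: fviz_nbclust() only scans up to k.max clusters (10 by default); to match
# the 2..12 range used elsewhere in this script, k.max = 12 can be passed, e.g.:
# fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss", k.max = 12)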
nbclust_out <- NbClust(
  data = data.frame(som_model$codes[[1]]),
  distance = "euclidean",
  min.nc = 2, # minimum number of clusters
  max.nc = 12, # maximum number of clusters
  method = "kmeans" # one of: "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid", "kmeans"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
# create a dataframe of the optimal number of clusters
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# select only indices which select between 2 and 12 clusters
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 12)
# create plot
ggplot(nbclust_plot) +
  aes(x = clusters) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(x = "Number of clusters", y = "Frequency among all indices", title = "Optimal number of clusters") +
  theme_minimal()
# Based on the exploration above, set the number of clusters here.
# k <- nbclust_plot$clusters[which.max(nbclust_plot$???)]
k <- as.numeric(names(which.max(table(nbclust_plot$clusters)))) # the k voted for by the most indices
k
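# Quick sanity check (not in the original): tally how many NbClust indices voted
# for each cluster count, to see how strong the majority behind k actually is.
sort(table(nbclust_plot$clusters), decreasing = TRUE)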
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers = k)
plot(som_model,
     main = '',
     type = "property",
     property = som.cluster$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster$cluster)
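# Mapping the node-level clusters back to the original observations (a sketch:
# kohonen stores each sample's winning unit in unit.classif, so indexing the
# per-unit cluster vector by it yields one label per row of the training data).
obs_clusters <- som.cluster$cluster[som_model$unit.classif]
table(obs_clusters)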
# Hierarchical Clustering
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
som.cluster <- hclust(dist(data.frame(som_model$codes[[1]])), method = "single")
# Cut the tree into k clusters; $order is only the dendrogram drawing order,
# so cutree() is what gives per-node cluster memberships.
som.cluster.cut <- cutree(som.cluster, k)
plot(som_model,
     main = '',
     type = "property",
     property = som.cluster.cut,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster.cut)
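# Single linkage can chain on codebook vectors; as a hedged alternative (not in
# the original), compare the cut against Ward linkage. Cluster labels are
# arbitrary, so agreement shows up as one dominant cell per row rather than a
# clean diagonal.
som.cluster.ward <- hclust(dist(data.frame(som_model$codes[[1]])), method = "ward.D2")
table(single = cutree(som.cluster, k), ward = cutree(som.cluster.ward, k))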
# Hierarchical K-Means Clustering: Optimize Clusters
# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/
# Install factoextra package if needed
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
# Load factoextra library
library(factoextra)
res.hk <- hkmeans(data.frame(som_model$codes[[1]]), k)
# Elements returned by hkmeans()
names(res.hk)
# Print the results
str(res.hk)
# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco",
          rect = TRUE, rect_border = "jco", rect_fill = TRUE)
# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE,
             ggtheme = theme_classic())
# Use the final hkmeans cluster assignments ($cluster), not the dendrogram
# drawing order, when colouring the map and drawing boundaries.
plot(som_model,
     main = '',
     type = "property",
     property = res.hk$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, res.hk$cluster)
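# A quick agreement check (an assumption, not in the original script): compare
# the hierarchical-k-means partition with the plain hierarchical cut on the same
# codebook vectors. As above, labels are arbitrary, so look for a near one-to-one
# mapping between rows and columns.
table(hkmeans = res.hk$cluster, hclust = cutree(som.cluster, k))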