# ransomware/scratch/Clusters.R

# K-means requires that you specify the number of clusters in advance, which is always a fun chicken-and-egg problem.
# To solve it we use the within-clusters sum of squares method:
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
# wss[1] is the total sum of squares (the k = 1 case); the rest come from k-means runs for k = 2..12
wss <- (nrow(som_model$codes[[1]]) - 1) * sum(apply(data.frame(som_model$codes[[1]]), 2, var))
for (i in 2:12) {
  wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers = i)$withinss)
}
plot(wss,
     pch = 19,
     ylab = 'Within-clusters sum of squares',
     xlab = 'K')
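# Note (not in the original script): kmeans() uses random starts, so the wss
# values for k >= 2 can wobble slightly between runs; calling set.seed() before
# the loop makes the curve reproducible.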
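# A rough programmatic read of the elbow (a sketch, not part of the original
# workflow): the k with the largest second difference in wss is often where the
# curve bends. Treat it as a hint to compare against the plots below.
wss_second_diff <- diff(diff(wss)) # second differences; element i corresponds to k = i + 1
elbow_candidate <- which.max(wss_second_diff) + 1
elbow_candidate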
# Optimal number of clusters
# https://statsandr.com/blog/clustering-analysis-k-means-and-hierarchical-clustering-by-hand-and-in-r/
# Install factoextra and NbClust packages if needed
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
if(!require(NbClust)) install.packages("NbClust", repos = "http://cran.us.r-project.org")
# Load factoextra and NbClust libraries
library(factoextra)
library(NbClust)
# Elbow method
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") +
  geom_vline(xintercept = 4, linetype = 2) + # add line for better visualisation
  labs(subtitle = "Elbow method") # add subtitle
# Silhouette method
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")
# Gap statistic
set.seed(5)
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans,
             nstart = 25,
             method = "gap_stat",
             nboot = 500 # reduce it for lower computation time (but less precise results)
) +
  labs(subtitle = "Gap statistic method")
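# Note: fviz_nbclust() only scans up to k.max clusters (10 by default); to match
# the 2..12 range used elsewhere in this script, k.max = 12 can be passed, e.g.:
# fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss", k.max = 12)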
nbclust_out <- NbClust(
  data = data.frame(som_model$codes[[1]]),
  distance = "euclidean",
  min.nc = 2, # minimum number of clusters
  max.nc = 12, # maximum number of clusters
  method = "kmeans" # one of: "ward.D", "ward.D2", "single", "complete", "average", "mcquitty", "median", "centroid", "kmeans"
)
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
# create a dataframe of the optimal number of clusters
nbclust_plot <- data.frame(clusters = nbclust_out$Best.nc[1, ])
# select only indices which select between 2 and 12 clusters
nbclust_plot <- subset(nbclust_plot, clusters >= 2 & clusters <= 12)
# create plot
ggplot(nbclust_plot) +
  aes(x = clusters) +
  geom_histogram(bins = 30L, fill = "#0c4c8a") +
  labs(x = "Number of clusters", y = "Frequency among all indices", title = "Optimal number of clusters") +
  theme_minimal()
# Based on the exploration above, set the number of clusters here.
# k <- nbclust_plot$clusters[which.max(nbclust_plot$???)]
k <- as.numeric(names(which.max(table(nbclust_plot$clusters)))) # the k voted for by the most indices
k
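# Quick sanity check (not in the original): tally how many NbClust indices voted
# for each cluster count, to see how strong the majority behind k actually is.
sort(table(nbclust_plot$clusters), decreasing = TRUE)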
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers = k)
plot(som_model,
     main = '',
     type = "property",
     property = som.cluster$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster$cluster)
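# Mapping the node-level clusters back to the original observations (a sketch:
# kohonen stores each sample's winning unit in unit.classif, so indexing the
# per-unit cluster vector by it yields one label per row of the training data).
obs_clusters <- som.cluster$cluster[som_model$unit.classif]
table(obs_clusters)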
# Hierarchical Clustering
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
som.cluster <- hclust(dist(data.frame(som_model$codes[[1]])), method = "single")
# Cut the tree into k clusters; $order is only the dendrogram drawing order,
# so cutree() is what gives per-node cluster memberships.
som.cluster.cut <- cutree(som.cluster, k)
plot(som_model,
     main = '',
     type = "property",
     property = som.cluster.cut,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster.cut)
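# Single linkage can chain on codebook vectors; as a hedged alternative (not in
# the original), compare the cut against Ward linkage. Cluster labels are
# arbitrary, so agreement shows up as one dominant cell per row rather than a
# clean diagonal.
som.cluster.ward <- hclust(dist(data.frame(som_model$codes[[1]])), method = "ward.D2")
table(single = cutree(som.cluster, k), ward = cutree(som.cluster.ward, k))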
# Hierarchical K-Means Clustering: Optimize Clusters
# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/
# Install factoextra package if needed
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
# Load factoextra library
library(factoextra)
res.hk <- hkmeans(data.frame(som_model$codes[[1]]), k)
# Elements returned by hkmeans()
names(res.hk)
# Print the results
str(res.hk)
# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco",
          rect = TRUE, rect_border = "jco", rect_fill = TRUE)
# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE,
             ggtheme = theme_classic())
# Use the final hkmeans cluster assignments ($cluster), not the dendrogram
# drawing order, when colouring the map and drawing boundaries.
plot(som_model,
     main = '',
     type = "property",
     property = res.hk$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model, res.hk$cluster)
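# A quick agreement check (an assumption, not in the original script): compare
# the hierarchical-k-means partition with the plain hierarchical cut on the same
# codebook vectors. As above, labels are arbitrary, so look for a near one-to-one
# mapping between rows and columns.
table(hkmeans = res.hk$cluster, hclust = cutree(som.cluster, k))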