99 lines
3.9 KiB
R
99 lines
3.9 KiB
R
# Install the kohonen package if it is not already available.
# requireNamespace() only checks availability without attaching anything;
# the library() call below does the attaching and stops with an error if the
# package is still missing.
if (!requireNamespace("kohonen", quietly = TRUE)) {
  install.packages("kohonen")
}

# Load kohonen library
library(kohonen)

# parallel ships with base R, so it cannot (and need not) be installed from
# CRAN; attaching it is enough. It is used further down for detectCores().
library(parallel)
|
|
|
|
# Predictor columns fed to the SOM. Dates are left out for now; note that
# `looped` is part of the selection below.
# (insert factor analysis impVar here?)
train_num <- select(
  train_set,
  year, day, length, weight, count, looped, neighbors, income
)

# The SOM functions only work on matrices: standardise each column, then coerce.
# scale() stores the centring/scaling values as attributes on the result.
train_mat <- as.matrix(scale(train_num))
|
|
|
|
# Switching to supervised SOMs: pick the same predictor columns from the test set
test_num <- select(
  test_set,
  year, day, length, weight, count, looped, neighbors, income
)

# The test data must be rescaled with the centre and scale values that were
# computed on the training data, not with its own statistics.
test_mat <- as.matrix(
  scale(
    test_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)
|
|
|
|
## Treat as binary first, then maybe switch to categorical?

# Binary outputs (black = ransomware, white = non-ransomware) for the train set,
# converted from a class vector into a class matrix
train_grey <- classvec2classmat(train_set$grey)

# Same for the test set
test_grey <- classvec2classmat(test_set$grey)

# Create data list for supervised SOM: predictors plus class matrix
train_list <- list(independent = train_mat, dependent = train_grey)
|
|
|
|
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Method 1 (formula): aim for about 5 * sqrt(N) nodes for N training rows, so
# the side of a square grid is sqrt(5 * sqrt(N)), rounded to a whole number
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))

# Method 2 (based on the number of categories), kept for reference:
# grid_size = ceiling(sqrt(length(unique(ransomware$grey))))

# Show the chosen side length
grid_size
|
|
|
|
# Create SOM grid: grid_size x grid_size units, hexagonal topology, toroidal
train_grid <- somgrid(
  xdim = grid_size,
  ydim = grid_size,
  topo = "hexagonal",
  toroidal = TRUE
)

# Set magic seed for reproducibility (must run before the model is trained)
set.seed(5)
|
|
|
|
## Now build the model: a supervised SOM trained on the scaled predictors
## (train_mat) and the class matrix (train_grey) over the grid defined above.
som_model <- xyf(
  train_mat,
  train_grey,
  grid = train_grid,
  rlen = 100,
  mode = "pbatch", # or: alpha = c(0.05, 0.01),
  # detectCores() returns NA when the number of cores cannot be determined;
  # fall back to a single core in that case instead of passing NA on.
  # Use detectCores() - 1 if the system locks during calculation.
  cores = max(1L, detectCores(), na.rm = TRUE),
  keep.data = TRUE
)
|
|
|
|
# Mapping plot: where the training observations land on the map
plot(
  som_model,
  type = "mapping",
  pch = 19,
  palette.name = topo.colors
)

# Quality plot: mean distance of the observations mapped to a unit to that
# unit's codebook vector (smaller is better)
plot(
  som_model,
  type = "quality",
  pch = 19,
  palette.name = topo.colors
)

# Counts plot: number of observations mapped to each unit
plot(
  som_model,
  type = "counts",
  pch = 19,
  palette.name = topo.colors
)

# Codes plot: fan diagram of the codebook vectors
plot(
  som_model,
  type = "codes",
  pch = 19,
  palette.name = topo.colors
)
|
|
|
|
# Visualize one heatmap per predictor variable.
# The first codebook layer (som_model$codes[[1]]) belongs to the predictor
# matrix passed as the first argument to xyf(), so its columns line up with the
# columns of train_num. Looping over those columns replaces the eight
# copy-pasted calls and keeps the plots in sync if the feature list changes.
for (var_idx in seq_len(ncol(train_num))) {
  plot(
    som_model,
    type = "property",
    property = som_model$codes[[1]][, var_idx],
    main = colnames(train_num)[var_idx],
    pch = 19,
    palette.name = topo.colors
  )
}
|
|
|
|
##Different cluster methods branch off here...
|