2021-09-30 09:19:18 +02:00
|
|
|
# Install the kohonen package if it is not already installed.
# requireNamespace() only checks availability without attaching the package;
# attaching is done explicitly by library() below.
if (!requireNamespace("kohonen", quietly = TRUE)) {
  install.packages("kohonen")
}

# Load kohonen library (xyf, somgrid, classvec2classmat, SOM plot methods)
library(kohonen)

# No install step for parallel: it is a base package distributed with R and
# cannot be installed from CRAN. library() stops with an error if it is
# unexpectedly missing.

# Load parallel library (detectCores)
library(parallel)
|
|
|
|
|
|
|
|
# Training predictors: keep only the five numeric columns, ignoring the
# date columns and `looped`.
train_num <- select(train_set, length, weight, count, neighbors, income)

# The SOM functions only work on matrices, so standardise and convert.
# The centre and scale used here are read back below from the
# "scaled:center" / "scaled:scale" attributes of train_mat.
train_mat <- as.matrix(scale(train_num))

# Same five numeric columns for the test set.
test_num <- select(test_set, length, weight, count, neighbors, income)

# The test data must be rescaled with the centre and scale of the TRAINING
# data, not with its own, so both sets share one standardised space.
test_mat <- as.matrix(
  scale(
    test_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)
|
|
|
|
|
|
|
|
# Binary outputs for the training set, encoded as a class matrix
# (black = ransomware, white = non-ransomware).
train_grey <- classvec2classmat(train_set$grey)

# Same encoding for the test set.
test_grey <- classvec2classmat(test_set$grey)

# Data list for the supervised SOM: predictors plus class matrix.
train_list <- list(
  independent = train_mat,
  dependent = train_grey
)
|
|
|
|
|
|
|
|
# Calculate the ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Method 1 (formula): roughly 5 * sqrt(n) units in total for n training
# rows, i.e. a side length of sqrt(5 * sqrt(n)) for a square grid.
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))

# Method 2 (based on the number of categories), kept for reference:
# grid_size <- ceiling(sqrt(length(unique(ransomware$grey))))

# Show the chosen side length.
grid_size

# Create the SOM grid: square, hexagonal topology, toroidal.
train_grid <- somgrid(
  xdim = grid_size,
  ydim = grid_size,
  topo = "hexagonal",
  toroidal = TRUE
)
|
|
|
|
|
|
|
|
# Set a fixed seed for reproducibility.
set.seed(5)

# Build the supervised SOM model from the scaled predictors (train_mat) and
# the class matrix (train_grey) on the grid created above.
som_model <- xyf(
  train_mat, train_grey,
  grid = train_grid,
  rlen = 100,
  mode = "pbatch", # or: alpha = c(0.05, 0.01),
  # detectCores() may return NA when the number of cores cannot be
  # determined; fall back to a single core in that case. Use
  # detectCores() - 1 if the system becomes unresponsive during training.
  cores = max(1L, detectCores(), na.rm = TRUE),
  keep.data = TRUE
)
|
|
|
|
|
|
|
|
# Mapping plot: where the training objects land on the map.
plot(som_model, type = "mapping", pch = 19, palette.name = topo.colors)

# Quality plot: per unit, the mean distance of the mapped objects to that
# unit's codebook vector (smaller = better represented). This is not the
# neighbour-distance map (U-matrix), which would be type = "dist.neighbours".
plot(som_model, type = "quality", pch = 19, palette.name = topo.colors)

# Counts plot: number of training objects mapped to each unit.
plot(som_model, type = "counts", pch = 19, palette.name = topo.colors)

# Codes plot: the codebook vectors of the units (fan diagram).
plot(som_model, type = "codes", pch = 19, palette.name = topo.colors)
|
|
|
|
|
|
|
|
# Heatmaps for variables 1 to 5: one property plot per predictor column,
# coloured by that column of the first codebook layer and titled with the
# matching column name of train_num.
# lapply() with a local index is used so no loop variable is left behind in
# the global environment; invisible() suppresses printing of the result list.
invisible(lapply(seq_len(5), function(var_idx) {
  plot(
    som_model,
    type = "property",
    property = som_model$codes[[1]][, var_idx],
    main = colnames(train_num)[var_idx],
    pch = 19,
    palette.name = topo.colors
  )
}))
|
|
|
|
|
|
|
|
# Now test predictions on the test set.
# https://clarkdatalabs.github.io/soms/SOM_NBA

# Test data list, using the same element names as train_list.
test_list <- list(
  independent = test_mat,
  dependent = test_grey
)

# NOTE(review): newdata also contains the true labels (dependent layer). If
# predict() uses that layer when mapping objects to units, the accuracy
# below is optimistic -- confirm, and consider passing the predictors only
# (e.g. whatmap = 1).
ransomware.prediction <- predict(som_model, newdata = test_list)

# Cross-tabulate the true labels against the predictions for layer 2.
table(test_set$grey, ransomware.prediction$prediction[[2]])

# Confusion matrix: predictions first, true labels second.
# confusionMatrix() is not loaded in this part of the script -- presumably
# it comes from caret, attached earlier; verify.
cm_grey <- confusionMatrix(
  ransomware.prediction$prediction[[2]],
  test_set$grey
)
cm_grey$overall["Accuracy"]
cm_grey
|
|
|
|
|
|
|
|
# Now test predictions on the validation set.

# Keep only the five numeric predictor columns of the validation set.
valid_num <- validation %>% select(length, weight, count, neighbors, income)

# The validation data must be rescaled with the centre and scale of the
# TRAINING data, exactly as was done for the test set.
valid_mat <- as.matrix(
  scale(
    valid_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# Encode the validation labels as a class matrix, the same way train_grey
# and test_grey are built, so the dependent layer has the same form for
# every data set.
valid_grey <- classvec2classmat(validation$grey)

# Validation data list, using the same element names as train_list.
valid_list <- list(independent = valid_mat, dependent = valid_grey)

# Requires up to 16GB of RAM, skip if resources are limited
ransomware.prediction.validation <- predict(som_model, newdata = valid_list)

# Cross-tabulate the true labels against the predictions for layer 2.
table(validation$grey, ransomware.prediction.validation$prediction[[2]])

# Confusion matrix: predictions first, true labels second.
cm_grey.validation <- confusionMatrix(
  ransomware.prediction.validation$prediction[[2]],
  validation$grey
)
cm_grey.validation$overall["Accuracy"]
cm_grey.validation
|
|
|
|
|
|
|
|
# Clean up environment
|
2021-09-30 10:34:15 +02:00
|
|
|
# Remove temporary objects. dest_file, url and temp are not defined in this
# part of the script -- presumably created earlier; verify.
rm(list = c("dest_file", "url", "temp", "grid_size"))
|