ransomware/Binary_SOM.R

# Install the kohonen package only when it is missing.
# requireNamespace() checks availability without attaching anything, so the
# library() call below is the single, explicit attach and fails loudly if the
# installation did not succeed.
if (!requireNamespace("kohonen", quietly = TRUE)) {
  install.packages("kohonen")
}

# Load kohonen library (somgrid, xyf, classvec2classmat, predict/plot methods)
library(kohonen)

# Load parallel library (detectCores).
# parallel is shipped with base R, so there is nothing to install for it.
library(parallel)

# Training predictors: numeric columns only (dates and `looped` are left out).
train_num <- select(train_set, length, weight, count, neighbors, income)

# The SOM functions work on matrices, so centre/scale and convert.
train_mat <- as.matrix(scale(train_num))

# Same predictor columns for the test set.
test_num <- select(test_set, length, weight, count, neighbors, income)

# Test data must be rescaled with the centre and scale that were computed on
# the training data, not with its own.
test_mat <- as.matrix(
  scale(
    test_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# Binary outputs (black = ransomware, white = non-ransomware) as a class
# matrix, first for the training set ...
train_grey <- classvec2classmat(train_set$grey)

# ... and likewise for the test set.
test_grey <- classvec2classmat(test_set$grey)

# Data list for the supervised SOM: predictors plus class matrix.
train_list <- list(independent = train_mat, dependent = train_grey)

# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Method 1 (formula): side length of a square grid derived from the number of
# training rows.
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))
# Method 2 (disabled alternative): based on the number of categories.
#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
grid_size

# Square, hexagonal, toroidal SOM grid of grid_size x grid_size units.
train_grid <- somgrid(
  xdim = grid_size,
  ydim = grid_size,
  topo = "hexagonal",
  toroidal = TRUE
)

# Fixed seed so that the trained map is reproducible.
set.seed(5)

## Train the supervised SOM: predictors as X, class matrix as Y.
som_model <- xyf(
  X = train_mat,
  Y = train_grey,
  grid = train_grid,
  rlen = 100,
  # Parallel batch mode; the alternative would be: alpha = c(0.05,0.01)
  mode = "pbatch",
  # Use detectCores() - 1 if the system becomes unresponsive during training.
  cores = detectCores(),
  keep.data = TRUE
)

# Visualize clusters
plot(som_model, type = "mapping", pch = 19, palette.name = topo.colors)

# Distance map
plot(som_model, type = "quality", pch = 19, palette.name = topo.colors)

# Visualize counts
plot(som_model, type = "counts", pch = 19, palette.name = topo.colors)

# Visualize fan diagram
plot(som_model, type = "codes", pch = 19, palette.name = topo.colors)

# Visualize one heatmap per input variable.
# Looping over the columns of train_num (instead of five copy-pasted calls
# with hard-coded indices) keeps the plots in step with the feature selection.
# codes[[1]] is the codebook of the first layer, i.e. the predictors.
invisible(lapply(seq_len(ncol(train_num)), function(var_idx) {
  plot(
    som_model,
    type = "property",
    property = som_model$codes[[1]][, var_idx],
    main = colnames(train_num)[var_idx],
    pch = 19,
    palette.name = topo.colors
  )
}))

# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA

# Test data in the same two-layer layout as the training data.
test_list <- list(independent = test_mat, dependent = test_grey)

# NOTE(review): newdata contains the dependent layer and no `whatmap` is
# given, so the class layer may take part in mapping test rows to units.
# Consider `whatmap = 1` to map on the predictors only - confirm against the
# predict.kohonen documentation before changing.
ransomware.prediction <- predict(som_model, newdata = test_list)

# The result element is spelled out in full (`predictions`) instead of relying
# on partial `$` matching; [[2]] is the second layer, i.e. the class layer
# passed as Y to xyf().
table(test_set$grey, ransomware.prediction$predictions[[2]])

# Confusion Matrix
cm_grey <- confusionMatrix(ransomware.prediction$predictions[[2]], test_set$grey)
cm_grey$overall["Accuracy"]
cm_grey

# Now test predictions of validation set

# Same predictor columns as used for training.
valid_num <- validation %>% select(length, weight, count, neighbors, income)

# Validation data must be rescaled with the centre and scale that were
# computed on the training data, not with its own.
valid_mat <- as.matrix(
  scale(
    valid_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# Class matrix for the validation labels, built the same way as for the
# training and test sets so all three data lists have the same layout.
valid_grey <- validation$grey %>% classvec2classmat()

valid_list <- list(independent = valid_mat, dependent = valid_grey)

# Requires up to 16GB of RAM, skip if resources are limited
# NOTE(review): as no `whatmap` is given, the class layer in newdata may take
# part in the mapping; consider `whatmap = 1` - confirm against the
# predict.kohonen documentation before changing.
ransomware.prediction.validation <- predict(som_model, newdata = valid_list)

# `predictions` is spelled out in full instead of relying on partial `$`
# matching; [[2]] is the second layer, i.e. the class layer.
table(validation$grey, ransomware.prediction.validation$predictions[[2]])

# Confusion Matrix
cm_grey.validation <- confusionMatrix(
  ransomware.prediction.validation$predictions[[2]],
  validation$grey
)
cm_grey.validation$overall["Accuracy"]
cm_grey.validation

# Clean up environment
# Only remove the helper objects that actually exist: rm() warns for every
# name it cannot find, and dest_file, url and temp are not created in this
# script.
rm(list = intersect(c("dest_file", "url", "temp", "grid_size"), ls()))
Code is in mostly working state. Need to fix how the categorical data set is generated. Need to clean up and fix some of the images and graphs. THEN, the report can be written. 2021-09-30 09:19:18 +02:00			`# Install kohonen package if needed`
			`if(!require(kohonen)) install.packages("kohonen")`

			`# Load kohonen library`
			`library(kohonen)`

			`# Install kohonen package if needed`
			`if(!require(parallel)) install.packages("parallel")`

			`# Load parallel library`
			`library(parallel)`

			`# Keep only numeric columns, ignoring dates and looped.`
			`train_num <- train_set %>% select(length, weight, count, neighbors, income)`

			`# SOM function can only work on matrices`
			`train_mat <- as.matrix(scale(train_num))`

			`# Switching to supervised SOMs`
			`test_num <- test_set %>% select(length, weight, count, neighbors, income)`

			`# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.`
			`test_mat <- as.matrix(scale(test_num, center = attr(train_mat,`
			`"scaled:center"), scale = attr(train_mat, "scaled:scale")))`

			`# Binary outputs, black=ransomware, white=non-ransomware, train set`
			`train_grey <- train_set$grey %>% classvec2classmat()`

			`# Same for test set`
			`test_grey <- test_set$grey %>% classvec2classmat()`

			`# Create Data list for supervised SOM`
			`#`
			`train_list <- list(independent = train_mat, dependent = train_grey)`

			`# Calculate idea grid size according to:`
			`# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps`

			`# Formulaic method 1`
			`grid_size <- round(sqrt(5*sqrt(nrow(train_set))))`
			`# Based on categorical number, method 2`
			`#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))`
			`grid_size`

			`# Create SOM grid`
			`train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)`

			`# Set magic seed for reproducibility`
			`set.seed(5)`

			`## Now build the model.`
			`som_model <- xyf(train_mat, train_grey,`
			`grid = train_grid,`
			`rlen = 100,`
			`mode="pbatch", # or: alpha = c(0.05,0.01),`
			`cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training`
			`keep.data = TRUE`
			`)`

			`# Visualize clusters`
			`plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)`

			`# Distance map`
			`plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)`

			`# Visualize counts`
			`plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)`

			`# Visualize fan diagram`
			`plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)`

			`# Visualize heatmap for variable 1`
			`plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)`

			`# Visualize heatmap for variable 2`
			`plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)`

			`# Visualize heatmap for variable 3`
			`plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)`

			`# Visualize heatmap for variable 4`
			`plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)`

			`# Visualize heatmap for variable 5`
			`plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)`

			`# Now test predictions`
			`# https://clarkdatalabs.github.io/soms/SOM_NBA`

			`test_list <- list(independent = test_mat, dependent = test_grey)`

			`ransomware.prediction <- predict(som_model, newdata = test_list)`
			`table(test_set$grey, ransomware.prediction$prediction[[2]])`

			`# Confusion Matrix`
			`cm_grey <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$grey)`
			`cm_grey$overall["Accuracy"]`
			`cm_grey`

			`# Now test predictions of validation set`

			`# Switching to supervised SOMs`
			`valid_num <- validation %>% select(length, weight, count, neighbors, income)`

			`# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.`
			`valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,`
			`"scaled:center"), scale = attr(train_mat, "scaled:scale")))`

			`valid_grey <- validation$grey`

			`valid_list <- list(independent = valid_mat, dependent = valid_grey)`

			`# Requires up to 16GB of RAM, skip if resources are limited`
			`ransomware.prediction.validation <- predict(som_model, newdata = valid_list)`
			`table(validation$grey, ransomware.prediction.validation$prediction[[2]])`

			`# Confusion Matrix`
			`cm_grey.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$grey)`
			`cm_grey.validation$overall["Accuracy"]`
			`cm_grey.validation`

			`# Clean up environment`
remove grid_size from categorical som script 2021-09-30 10:34:15 +02:00			`rm(dest_file, url, temp, grid_size)`