# Install kohonen package if needed
if(!require(kohonen)) install.packages("kohonen")

# Load kohonen library
library(kohonen)

# Install parallel package if needed
if(!require(parallel)) install.packages("parallel")

# Load parallel library
library(parallel)

# Keep only the numeric predictor columns, ignoring the date fields and looped.
train_num <- train_set %>% select(length, weight, count, neighbors, income)

# The SOM functions only work on matrices
train_mat <- as.matrix(scale(train_num))

# Switching to supervised SOMs
test_num <- test_set %>% select(length, weight, count, neighbors, income)

# Note that the test data must be rescaled using the center and scale
# of the training data, not its own.
test_mat <- as.matrix(scale(test_num,
                            center = attr(train_mat, "scaled:center"),
                            scale = attr(train_mat, "scaled:scale")))

# Binary outputs, black=ransomware, white=non-ransomware, train set
train_bw <- train_set$bw %>% classvec2classmat()

# Same for test set
test_bw <- test_set$bw %>% classvec2classmat()

# Create data list for supervised SOM
# train_list <- list(independent = train_mat, dependent = train_bw)

# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Formulaic method 1
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))

# Based on categorical number, method 2
#grid_size = ceiling(sqrt(length(unique(ransomware$bw))))

grid_size
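# As a rough illustration of the heuristic above (the row count is assumed
# for the example, not taken from the actual training set): with about
# 10,000 training rows, the "5 * sqrt(N)" rule of thumb suggests roughly
# 500 nodes in total, i.e. a square grid of side
# round(sqrt(5 * sqrt(10000))) = 22.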
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)

# Set magic seed for reproducibility
set.seed(5)

## Now build the model.
som_model <- xyf(train_mat, train_bw,
                 grid = train_grid,
                 rlen = 100,
                 mode = "pbatch", # or: alpha = c(0.05,0.01),
                 cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training
                 keep.data = TRUE)

# Visualize mapping of training samples onto the grid
plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)

# Quality plot: mean distance of mapped objects to their codebook vectors
plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)

# Visualize counts
plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)

# Visualize fan diagram of the codebook vectors
plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)

# Visualize heatmap for variable 1
plot(som_model, type = 'property', property = som_model$codes[[1]][,1],
     main = colnames(train_num)[1], pch = 19, palette.name = topo.colors)

# Visualize heatmap for variable 2
plot(som_model, type = 'property', property = som_model$codes[[1]][,2],
     main = colnames(train_num)[2], pch = 19, palette.name = topo.colors)

# Visualize heatmap for variable 3
plot(som_model, type = 'property', property = som_model$codes[[1]][,3],
     main = colnames(train_num)[3], pch = 19, palette.name = topo.colors)

# Visualize heatmap for variable 4
plot(som_model, type = 'property', property = som_model$codes[[1]][,4],
     main = colnames(train_num)[4], pch = 19, palette.name = topo.colors)

# Visualize heatmap for variable 5
plot(som_model, type = 'property', property = som_model$codes[[1]][,5],
     main = colnames(train_num)[5], pch = 19, palette.name = topo.colors)

# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA

test_list <- list(independent = test_mat, dependent = test_bw)

ransomware.prediction <- predict(som_model, newdata = test_list)

table(test_set$bw, ransomware.prediction$prediction[[2]])

# Confusion Matrix
cm_bw <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$bw)
cm_bw$overall["Accuracy"]
cm_bw

# Now test predictions on the validation set

# Switching to supervised SOMs
valid_num <- validation %>% select(length, weight, count, neighbors, income)

# Note that the validation data must also be rescaled using the center and
# scale of the training data.
valid_mat <- as.matrix(scale(valid_num,
                             center = attr(train_mat, "scaled:center"),
                             scale = attr(train_mat, "scaled:scale")))

valid_bw <- validation$bw

valid_list <- list(independent = valid_mat, dependent = valid_bw)

# Requires up to 16GB of RAM, skip if resources are limited
ransomware.prediction.validation <- predict(som_model, newdata = valid_list)

table(validation$bw, ransomware.prediction.validation$prediction[[2]])

# Confusion Matrix
cm_bw.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$bw)
cm_bw.validation$overall["Accuracy"]
cm_bw.validation

# Clean up environment
rm(dest_file, url, temp, grid_size)
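# Optional wrap-up (not part of the original analysis, sketched here only as
# an illustration): collect the two accuracy estimates computed above into a
# single data frame for a quick side-by-side comparison. Only applicable if
# the validation step was not skipped.
#accuracy_summary <- data.frame(
#  set = c("test", "validation"),
#  accuracy = c(cm_bw$overall["Accuracy"],
#               cm_bw.validation$overall["Accuracy"]))
#accuracy_summary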