# Try categorical SOMs on black-only addresses....
#!! This is NOT right, is it?
#!! It would be even MORE impressive if I removed all the PREDICTED whites from
#!! the test set instead and started there.

library(tidyverse)
library(caret)
library(randomForest)
library(kohonen)
library(parallel)
library(matrixStats)

# Restrict the data to ransomware ("black") addresses by dropping every row
# whose label is "white". `ransomware` must already exist in the environment.
blacks <- ransomware %>%
  filter(label != "white")

# Validation set made from 50% of the black-only data, stratified on the
# categorical label; reduce later if possible.
set.seed(5)
test_index <- createDataPartition(
  blacks$label,
  p = 0.5,
  times = 1,
  list = FALSE
)
temp <- blacks[test_index, ]
workset_blacks <- blacks[-test_index, ]

# Make sure addresses in validation set are also in working set...
# validation <- temp %>%
#   semi_join(workset, by = "address")

# Add rows removed from validation set back into working set...
# removed <- anti_join(temp, validation)
# workset <- rbind(workset, removed)

# ... Or not: the sampled half is used as the validation set unchanged.
validation_blacks <- temp

# Split the working set into a training set and a test set @ 50%, again
# stratified on the label; reduce later if possible.
# (The original note called this the "binary outcomes (bw)" split.)
set.seed(5)
test_index <- createDataPartition(
  workset_blacks$label,
  p = 0.5,
  times = 1,
  list = FALSE
)

# Split the working set into a training set and a test set @ 50%, reduce
# later if possible. Categorical outcomes
# test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)

temp <- workset_blacks[test_index, ]
train_set <- workset_blacks[-test_index, ]

# Make sure addresses in validation set are also in working set....
# test_set <- temp %>%
#   semi_join(train_set, by = "address")

# Add rows removed from validation set back into working set....
# removed <- anti_join(temp, test_set)
# train_set <- rbind(train_set, removed)

# ....Or not: the sampled half is used as the test set unchanged.
test_set <- temp

##!! Data preparation is done, now focusing on Self Organizing Maps as our method
##!! Start here after reworking the data prep steps above.

# Keep only numeric columns, ignoring dates and looped for now
# (insert factor analysis impVar here?)
# Select the five numeric predictors used as SOM inputs.
pick_som_features <- function(df) {
  select(df, length, weight, count, neighbors, income)
}

train_num <- pick_som_features(train_set)

# The SOM functions only work on matrices, so scale and convert.
train_mat <- as.matrix(scale(train_num))

# Switching to supervised SOMs
test_num <- pick_som_features(test_set)

# The test data must be rescaled with the centring and scaling values that
# were computed on the training data, not with its own.
test_mat <- as.matrix(
  scale(
    test_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# Categorical outcome: convert the training labels to a class matrix.
train_label <- classvec2classmat(train_set$label)

# Same for test set
test_label <- classvec2classmat(test_set$label)

# Create Data list for supervised SOM
# train_list <- list(independent = train_mat, dependent = train_label)

# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Formulaic method 1
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))

# Based on categorical number, method 2
# grid_size = ceiling(sqrt(length(unique(ransomware$label))))
grid_size

# Create SOM grid: square, hexagonal topology, wrapped at the edges.
train_grid <- somgrid(
  xdim = grid_size,
  ydim = grid_size,
  topo = "hexagonal",
  toroidal = TRUE
)

# Set magic seed for reproducibility
set.seed(5)

## Now build the model.
# Fit a supervised SOM (xyf) on the scaled training features and the class
# matrix of training labels, using the grid built above.
som_model2 <- xyf(
  train_mat,
  train_label,
  grid = train_grid,
  rlen = 100,
  mode = "pbatch", # or: alpha = c(0.05,0.01),
  cores = detectCores(), # detectCores() - 1 if system locks during calculation
  keep.data = TRUE
)

# Visualize clusters
plot(som_model2, type = "mapping", pch = 19, palette.name = topo.colors)

# Distance map
plot(som_model2, type = "quality", pch = 19, palette.name = topo.colors)

# Visualize counts
plot(som_model2, type = "counts", pch = 19, palette.name = topo.colors)

# Visualize fan diagram
plot(som_model2, type = "codes", pch = 19, palette.name = topo.colors)

# Visualize one heatmap per input variable. The first codebook layer holds the
# feature weights, one column per column of train_num, in the same order.
for (feature_idx in seq_along(colnames(train_num))) {
  plot(
    som_model2,
    type = "property",
    property = som_model2$codes[[1]][, feature_idx],
    main = colnames(train_num)[feature_idx],
    pch = 19,
    palette.name = topo.colors
  )
}

# Now test predictions of test set
# https://clarkdatalabs.github.io/soms/SOM_NBA
# NOTE(review): newdata contains the true labels as its second layer. If
# predict() uses every trained layer when mapping new objects, the labels leak
# into the prediction and accuracy is optimistic. Confirm against the kohonen
# documentation whether `whatmap = 1` is what is intended here.
test_list <- list(independent = test_mat, dependent = test_label)

ransomware_group.prediction <- predict(som_model2, newdata = test_list)

# `predictions` is spelled out in full: `$prediction` only worked through
# partial matching of list element names.
table(test_set$label, ransomware_group.prediction$predictions[[2]])

# Confusion Matrix
cm_labels <- confusionMatrix(
  ransomware_group.prediction$predictions[[2]],
  test_set$label
)
cm_labels$overall["Accuracy"]
cm_labels

# Now test predictions of validation set

# Switching to supervised SOMs
valid_num <- validation_blacks %>%
  select(length, weight, count, neighbors, income)

# The validation data must be rescaled with the centring and scaling values
# that were computed on the training data.
valid_mat <- as.matrix(
  scale(
    valid_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# Convert the labels to a class matrix, the same way as for the training and
# test labels, so all three sets are handed to the model in the same form.
valid_label <- classvec2classmat(validation_blacks$label)

# NOTE(review): same possible label-leak concern as for test_list above.
valid_list <- list(independent = valid_mat, dependent = valid_label)

ransomware_group.prediction.validation <- predict(
  som_model2,
  newdata = valid_list
)

table(
  validation_blacks$label,
  ransomware_group.prediction.validation$predictions[[2]]
)

# Confusion Matrix
cm_labels.validation <- confusionMatrix(
  ransomware_group.prediction.validation$predictions[[2]],
  validation_blacks$label
)
cm_labels.validation$overall["Accuracy"]
cm_labels.validation

# Set number of clusters to be equal to number of known ransomware groups
# (ignoring the whites)
n_groups <- length(unique(ransomware$label)) - 1
n_groups

# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
# kmeans() picks its starting centres at random; seed it like the other random
# steps in this script so the cluster boundaries are reproducible when this
# section is re-run on its own.
set.seed(5)
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers = n_groups)

plot(
  som_model2,
  main = "K-Means Clustering",
  type = "property",
  property = som.cluster$cluster,
  palette.name = topo.colors
)
add.cluster.boundaries(som_model2, som.cluster$cluster)

# Clean up environment
rm(grid_size, blacks, test_list, valid_list, temp, som.cluster)