###############################################################################
## Ransomware Detection on the Bitcoin Blockchain
## using Random Forests and Self Organizing Maps
##
## Kaylee Robert Tejeda
## November 11, 2021
##
## Submitted as part of final CYO project for
## HarvardX PH125.9x Capstone Course
##
###############################################################################

# Uncomment next line to time script
#tic()

# Set the repository mirror to "1: 0-Cloud" for maximum availability
# (https so that package downloads are not fetched over plain http)
r <- getOption("repos")
r["CRAN"] <- "https://cran.rstudio.com"
options(repos = r)
rm(r)

# Install necessary packages if not already present.
# requireNamespace() only checks availability; it does not attach the package,
# so the library() calls below remain the single place where packages load.
required_pkgs <- c("tidyverse", "caret", "randomForest", "kohonen",
                   "parallel", "matrixStats")
is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
rm(required_pkgs, is_installed)

# Load Libraries
library(tidyverse)
library(caret)
library(randomForest)
library(kohonen)
library(parallel)
library(matrixStats)

# Download data (binary mode, since the payload is a zip archive)
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if (!dir.exists("data")) {
  dir.create("data")
}
if (!file.exists(dest_file)) {
  download.file(url, destfile = dest_file, mode = "wb")
}

# Unzip as CSV
if (!file.exists("data/BitcoinHeistData.csv")) {
  unzip(dest_file, "BitcoinHeistData.csv", exdir = "data")
}

# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")

# Turn labels into factors, "bw" is binary factor for ransomware/non-ransomware
ransomware <- ransomware %>%
  mutate(label = as.factor(label),
         bw = as.factor(ifelse(label == "white", "white", "black")))

# Validation set made from 50% of BitcoinHeist data, for RAM considerations
test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5,
                                  list = FALSE)
workset <- ransomware[-test_index, ]
validation <- ransomware[test_index, ]

# Split the working set into a training set
# and a test set @ 50%, RAM dictated
test_index <- createDataPartition(y = workset$bw, times = 1, p = .5,
                                  list = FALSE)
train_set <- workset[-test_index, ]
test_set <- workset[test_index, ]

###############################################################################
## Data preparation is now done
## Separate into "black" and "white" groups using Random Forests predictions
###############################################################################

# Keep only numeric columns, ignoring temporal features
ransomware_num <- ransomware %>%
  select(length, weight, count, looped, neighbors, income)

# Check for variation across numerical columns using coefficients of variation
#
# Calculate standard deviations for each column
sds <- ransomware_num %>% as.matrix() %>% colSds()

# Calculate means for each column
means <- ransomware_num %>% as.matrix() %>% colMeans()

# Calculate CVs for each column: CV = sd / mean.
# This must be true division; integer division (%/%) would truncate every
# ratio to a whole number and make the ranking below meaningless.
coeff_vars <- sds / means
# Name the ratios explicitly so the ranking does not depend on whether
# colSds() carried the column names through.
names(coeff_vars) <- colnames(ransomware_num)

# Select the two features with the highest coefficients of variation
selected_features <- names(sort(coeff_vars, decreasing = TRUE))[1:2]

message("The features with the highest coefficients of variation are ",
        selected_features[1], " and ", selected_features[2],
        ", which will be used to train the binary model.")

# Sample every 100th row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]

# Keep only numeric columns with highest coefficients of variation
train_num <- train_samp %>%
  select(selected_features[1], selected_features[2])

# Binary labels, black = ransomware, white = non-ransomware, train set
train_bw <- train_samp$bw

# Sample every 100th row due to memory constraints to make test sample same
# size. The upper bound is the smaller of the two set sizes so that the index
# can never run past the end of test_set (which would produce all-NA rows).
test_samp <- test_set[seq(1, min(nrow(train_set), nrow(test_set)), 100), ]

# Dimension reduction again, selecting features with highest CVs
test_num <- test_samp %>%
  select(selected_features[1], selected_features[2])

# Binary labels for test set
test_bw <- test_samp$bw

# Cross Validation, ten fold
control <- trainControl(method = "cv", number = 10)
# Control grid with variation on mtry.
# mtry cannot exceed the number of predictors, so the grid is derived from
# the width of train_num instead of a fixed list of larger values.
grid <- data.frame(mtry = seq_len(ncol(train_num)))

# Run Cross Validation using control and grid set above
rf_model <- train(train_num, train_bw, method = "rf",
                  trControl = control, tuneGrid = grid)

# Supervised fit of model using cross validated optimization.
# Fit on the selected numeric features only: train_samp still contains the
# label and bw columns, so using it as predictors would leak the response.
# The tuned value is passed as mtry (randomForest has no minNode argument).
fit_rf <- randomForest(train_num, train_bw,
                       mtry = min(rf_model$bestTune$mtry, ncol(train_num)))

# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_num)
cm_test <- confusionMatrix(y_hat_rf, test_bw)
message("Overall accuracy for the binary separation is ",
        cm_test$overall["Accuracy"])
cm_test

# Measure accuracy of model against full ransomware set,
# using the same feature columns the forest was trained on
ransomware_selected <- ransomware %>%
  select(selected_features[1], selected_features[2])
ransomware_y_hat_rf <- predict(fit_rf, ransomware_selected)
cm_ransomware <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
message("Overall accuracy for the full data set is ",
        cm_ransomware$overall["Accuracy"])
cm_ransomware

##############################################################################
## Now we use the Random Forest model to exclude the "white" addresses from
## the full ransomware set, to categorize the "black" addresses into families.
##############################################################################

# Now use this prediction to reduce the original set to only "black" addresses
# First append the full set of predictions to the original set
ransomware$prediction <- ransomware_y_hat_rf

# Filter out all the predicted "white" addresses,
# leaving only predicted "black" addresses
black_addresses <- ransomware %>% filter(prediction == "black")

# Split the reduced black-predictions into a training set and a test set @ 50%
# (every remaining row has prediction == "black", so this split is not
# stratified by ransomware family)
test_index <- createDataPartition(y = black_addresses$prediction,
                                  times = 1, p = .5, list = FALSE)
train_set <- black_addresses[-test_index, ]
test_set <- black_addresses[test_index, ]

# Keep only numeric columns, ignoring temporal variables
train_num <- train_set %>%
  select(length, weight, count, looped, neighbors, income)

# SOM function can only work on matrices
train_mat <- as.matrix(scale(train_num))

# Select non-temporal numerical features only
test_num <- test_set %>%
  select(length, weight, count, looped, neighbors, income)

# Testing data is scaled according to how we scaled our training data
test_mat <- as.matrix(scale(test_num,
                            center = attr(train_mat, "scaled:center"),
                            scale = attr(train_mat, "scaled:scale")))

# Categorical labels for training set
train_label <- train_set$label %>% classvec2classmat()

# Same for test set
test_label <- test_set$label %>% classvec2classmat()

# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Formulaic method 1, makes a larger graph in this case
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))

# Based on categorical number, method 2, smaller graph with less cells
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))

message("A grid size of ", grid_size, " has been chosen.")

# Create SOM grid
train_grid <- somgrid(xdim = grid_size, ydim = grid_size,
                      topo = "hexagonal", toroidal = TRUE)

## Now build the SOM model using the supervised method xyf()
som_model2 <- xyf(train_mat, train_label,
                  grid = train_grid,
                  rlen = 100,
                  mode = "pbatch",
                  cores = detectCores(), # Use all cores
                  # cores = detectCores() - 1, # Leave one core for system
                  keep.data = TRUE
)

# Now test predictions of test set, create data list for test set
test_list <- list(independent = test_mat, dependent = test_label)

# Generate predictions.
# NOTE(review): newdata carries the true labels as its second layer. If
# predict() maps new objects using every trained layer (believed to be the
# kohonen default - TODO confirm), the accuracy reported below is optimistic;
# passing `whatmap = 1` would map on the numeric features only.
ransomware_group.prediction <- predict(som_model2, newdata = test_list)

# Predicted family labels (second layer); the element is spelled out in full
# rather than relying on partial matching of `$prediction`.
predicted_labels <- ransomware_group.prediction$predictions[[2]]
table(test_set$label, predicted_labels)

# Confusion Matrix
cm_labels <- confusionMatrix(predicted_labels, test_set$label)
cm_labels

#############################################################################
## K-Means Clustering to visualize the categorization of the SOM
## For a good tutorial, visit:
## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
#############################################################################

# Set number of clusters to be equal to number of known ransomware groups
# (all distinct labels minus one, i.e. excluding the "white" label)
n_groups <- length(unique(ransomware$label)) - 1

# Generate k-means clustering on the codebook vectors of the first layer
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers = n_groups)

# Plot clustering results
plot(som_model2,
     main = "K-Means Clustering",
     type = "property",
     property = som.cluster$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)

message("Overall accuracy is ", cm_labels$overall["Accuracy"])

# End timer
#toc(quiet=FALSE)