diff --git a/Detecting_Bitcoin_Ransomware.R b/Detecting_Bitcoin_Ransomware.R
new file mode 100644
index 0000000..f4f81e1
--- /dev/null
+++ b/Detecting_Bitcoin_Ransomware.R
@@ -0,0 +1,270 @@
+###############################################################################
+## Ransomware Detection on the Bitcoin Blockchain
+## using Random Forests and Self Organizing Maps
+##
+## Kaylee Robert Tejeda
+## October 31, 2021
+##
+## Submitted as part of final CYO project for
+## HarvardX PH125.9x Capstone Course
+##
+###############################################################################
+
+# Start timer (stopped by the matching toc() call at the end of the script)
+library(tictoc)
+tic(quiet = FALSE)
+
+# Install necessary packages if not already present
+if(!require(tidyverse)) install.packages("tidyverse")
+if(!require(caret)) install.packages("caret")
+if(!require(randomForest)) install.packages("randomForest")
+if(!require(kohonen)) install.packages("kohonen")
+if(!require(parallel)) install.packages("parallel")
+if(!require(matrixStats)) install.packages("matrixStats")
+
+# Load Libraries
+library(tidyverse)
+library(caret)
+library(randomForest)
+library(kohonen)
+library(parallel)
+library(matrixStats)
+
+# Download data
+url <-
+  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
+dest_file <- "data/data.zip"
+if(!dir.exists("data"))dir.create("data")
+if(!file.exists(dest_file))download.file(url, destfile = dest_file)
+
+# Unzip into CSV
+if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
+                                                   "BitcoinHeistData.csv",
+                                                   exdir="data")
+
+# Import data from CSV
+ransomware <- read_csv("data/BitcoinHeistData.csv")
+
+# Turn labels into factors, "bw" is binary factor for ransomware/non-ransomware
+ransomware <- ransomware %>%
+  mutate(label=as.factor(label),
+         bw=as.factor(ifelse(label=="white", "white", "black")))
+
+# Validation set made from 50% of BitcoinHeist data
+test_index <- createDataPartition(y = ransomware$bw,
+                                  times = 1, p = .5, list = FALSE)
+
+workset <- ransomware[-test_index,]
+validation <- ransomware[test_index,]
+
+# Split the working set into a training set and a test set @ 50%
+test_index <- createDataPartition(y = workset$bw,
+                                  times = 1, p = .5, list = FALSE)
+
+train_set <- workset[-test_index,]
+test_set <- workset[test_index,]
+
+###############################################################################
+## Data preparation is now done
+## Separate into "black" and "white" groups using Random Forests predictions
+###############################################################################
+
+# Keep only numeric columns, ignoring temporal features
+ransomware_num <- ransomware %>%
+  select(length, weight, count, looped, neighbors, income)
+
+# Check for variation across numerical columns using coefficients of variation
+#
+# Calculate standard deviations for each column
+sds <- ransomware_num %>% as.matrix() %>% colSds()
+
+# Calculate means for each column
+means <- ransomware_num %>% as.matrix() %>% colMeans()
+
+# Calculate CVs for each column (coefficient of variation = sd/mean)
+coeff_vars <- sds / means
+
+# Select the two features with the highest coefficients of variation
+selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
+
+message("The features with the highest coefficients of variation are ",
+        selected_features[1], " and ", selected_features[2],
+        ", which will be used to train the binary model.")
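+
+# Aside (illustration only): the coefficient of variation is just sd/mean,
+# a scale-free measure of spread. For example, with a hypothetical toy frame:
+#   toy <- data.frame(a = c(1, 5, 9), b = c(100, 101, 102))
+#   colSds(as.matrix(toy)) / colMeans(toy)   # "a" varies far more than "b"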
+
+# Sample every 100th row due to memory constraints
+train_samp <- train_set[seq(1, nrow(train_set), 100), ]
+
+# Keep only numeric columns with highest coefficients of variation
+train_num <- train_samp %>% select(all_of(selected_features))
+
+# Binary outputs, black = ransomware, white = non-ransomware, train set
+train_bw <- train_samp$bw
+
+# Sample every 100th row due to memory constraints
+test_samp <- test_set[seq(1, nrow(test_set), 100), ]
+
+# Dimension reduction again
+test_num <- test_samp %>% select(all_of(selected_features))
+
+# Same for test set
+test_bw <- test_samp$bw
+
+# Set up 10-fold cross-validation and a tuning grid for mtry
+control <- trainControl(method="cv", number = 10)
+grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
+
+# Train Random Forests model
+rf_model <- train(train_num, train_bw, method="rf",
+                  trControl = control, tuneGrid=grid)
+
+# Fit final model on the selected features, using the best tuned mtry
+fit_rf <- randomForest(train_num, train_bw,
+                       mtry = rf_model$bestTune$mtry)
+
+# Measure accuracy of model against test sample
+y_hat_rf <- predict(fit_rf, test_samp)
+cm <- confusionMatrix(y_hat_rf, test_bw)
+message("Overall accuracy for the binary separation is ",
+        cm$overall["Accuracy"])
+cm
+
+##############################################################################
+## Now we use the Random Forest model to exclude the "white" addresses from
+## the full ransomware set, to categorize the "black" addresses into families.
+message("Now we further categorize black addresses into ransomware families.")
+##############################################################################
+
+# Measure accuracy of model against full ransomware set
+ransomware_y_hat_rf <- predict(fit_rf, ransomware)
+cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
+message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
+cm
+
+# Now use this prediction to reduce the original set to only "black" addresses
+# First append the full set of predictions to the original set.
+ransomware$predictions <- ransomware_y_hat_rf
+
+# Filter out all the predicted "white" addresses,
+# leaving only predicted "black" addresses
+black_addresses <- ransomware %>% filter(predictions=="black")
+
+# Split the reduced black-predictions into a training set and a test set @ 50%
+test_index <- createDataPartition(y = black_addresses$predictions,
+                                  times = 1, p = .5, list = FALSE)
+
+train_set <- black_addresses[-test_index,]
+test_set <- black_addresses[test_index,]
+
+
+# Keep only numeric columns, ignoring temporal variables.
+train_num <- train_set %>%
+  select(length, weight, count, looped, neighbors, income)
+
+# SOM function can only work on matrices
+train_mat <- as.matrix(scale(train_num))
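+
+# Aside (illustration only): scale() records the values it used as
+# attributes of its result, which is how we reuse them on the test set below:
+#   m <- scale(matrix(1:6, ncol = 2))
+#   attr(m, "scaled:center")   # column means used for centering
+#   attr(m, "scaled:scale")    # column standard deviations used for scaling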
+
+# Switching to supervised SOMs
+test_num <- test_set %>%
+  select(length, weight, count, looped, neighbors, income)
+
+# Testing data is scaled according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num,
+                            center = attr(train_mat, "scaled:center"),
+                            scale = attr(train_mat, "scaled:scale")))
+
+# Categorical labels, one-hot encoded for the SOM
+train_label <- train_set$label %>% classvec2classmat()
+
+# Same for test set
+test_label <- test_set$label %>% classvec2classmat()
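+
+# Aside (illustration only): classvec2classmat() turns a factor into an
+# indicator matrix with one column per class, e.g.
+#   classvec2classmat(factor(c("a", "b", "a")))
+#   # gives a 3x2 matrix with rows (1,0), (0,1), (1,0)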
+
+# Create Data list for supervised SOM
+train_list <- list(independent = train_mat, dependent = train_label)
+
+# Calculate ideal grid size according to:
+# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
+
+# Formulaic method 1
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+
+# Based on number of categories, method 2
+#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
+
+grid_size
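+
+# For example, a training set of 10,000 rows would give
+# round(sqrt(5*sqrt(10000))) = round(sqrt(500)) = 22, i.e. a 22x22 grid.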
+
+# Create SOM grid
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
+                      topo="hexagonal", toroidal = TRUE)
+
+## Now build the model.
+som_model2 <- xyf(train_mat, train_label,
+                  grid = train_grid,
+                  rlen = 100,
+                  mode="pbatch",
+                  cores = detectCores(),
+                  keep.data = TRUE
+)
+
+# Now test predictions of test set
+
+test_list <- list(independent = test_mat, dependent = test_label)
+
+ransomware_group.prediction <- predict(som_model2, newdata = test_list)
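+
+# The prediction object holds one element per SOM layer: element 1 is the
+# independent (numeric) layer and element 2 is the dependent (label) layer,
+# i.e. the predicted ransomware family for each test row.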
+table(test_set$label, ransomware_group.prediction$prediction[[2]])
+
+# Confusion Matrix
+cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
+                             test_set$label)
+message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
+cm_labels
+
+
+#############################################################################
+## K-Means Clustering to visualize the categorization of the SOM
+## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
+#############################################################################
+
+# Set number of clusters equal to the number of known ransomware families
+# (subtracting 1 drops the "white" level from the label count)
+n_groups <- length(unique(ransomware$label)) - 1
+n_groups
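+
+# Note that k-means runs on the SOM's codebook vectors (one per map node,
+# grid_size^2 in total) rather than on the raw addresses, so each node is
+# assigned to one of n_groups clusters for plotting.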
+
+som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
+
+plot(som_model2,
+     main = 'K-Means Clustering',
+     type = "property",
+     property = som.cluster$cluster,
+     palette.name = topo.colors)
+add.cluster.boundaries(som_model2, som.cluster$cluster)
+
+# End timer
+toc()
+
diff --git a/Final_method.R b/Final_method.R
index 0449f59..43b00a8 100644
--- a/Final_method.R
+++ b/Final_method.R
@@ -4,8 +4,10 @@
 ##
 ## Kaylee Robert Tejeda
 ## October 31, 2021
+##
+## Submitted as part of final CYO project for
+## HarvardX PH125.9x Capstone Course
 ##
-## Make this header better!!!!
 #################################################
 
 # Timer command, uncomment following lines to time script
@@ -18,6 +20,7 @@ if(!require(caret)) install.packages("caret")
 if(!require(randomForest)) install.packages("randomForest")
 if(!require(kohonen)) install.packages("kohonen")
 if(!require(parallel)) install.packages("parallel")
+if(!require(matrixStats)) install.packages("matrixStats")
 
 # Load Libraries
 library(tidyverse)
@@ -25,53 +28,80 @@ library(caret)
 library(randomForest)
 library(kohonen)
 library(parallel)
+library(matrixStats)
 
 # Download data
-url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
+url <-
+  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
 dest_file <- "data/data.zip"
 if(!dir.exists("data"))dir.create("data")
 if(!file.exists(dest_file))download.file(url, destfile = dest_file)
 
 # Unzip into CSV
-if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
+if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
+                                                   "BitcoinHeistData.csv",
+                                                   exdir="data")
 
 # Import data from CSV
 ransomware <- read_csv("data/BitcoinHeistData.csv")
 
 # Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
-ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
+ransomware <- ransomware %>%
+  mutate(label=as.factor(label),
+         bw=as.factor(ifelse(label=="white", "white", "black")))
 
-# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
-test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
+# Validation set made from 50% of BitcoinHeist data
+test_index <- createDataPartition(y = ransomware$bw,
+                                  times = 1, p = .5, list = FALSE)
 
 workset <- ransomware[-test_index,]
 validation <- ransomware[test_index,]
 
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
-test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
+# Split the working set into a training set and a test set @ 50%
+test_index <- createDataPartition(y = workset$bw,
+                                  times = 1, p = .5, list = FALSE)
 
 train_set <- workset[-test_index,]
 test_set <- workset[test_index,]
 
-# Separate into Black and White groups using Random Forests
+## Separate into "black" and "white" groups using Random Forests
 
-message("First to separate in to black and white groups.")
+# Keep only numeric columns, ignoring temporal features
+ransomware_num <- ransomware %>%
+  select(length, weight, count, looped, neighbors, income)
 
-#Sample every nth row due to memory constraints
+# Check for variation across numerical columns using coefficients of variation
+
+# Calculate standard deviations for each column
+sds <- ransomware_num %>% as.matrix() %>% colSds()
+
+# Calculate means for each column
+means <- ransomware_num %>% as.matrix() %>% colMeans()
+
+# Calculate CVs for each column (coefficient of variation = sd/mean)
+coeff_vars <- sds / means
+
+# Select the two features with the highest coefficients of variation
+selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
+
+message("The features with the highest coefficients of variation are ",
+        selected_features[1], " and ", selected_features[2],
+        ", which will be used to train the binary model.")
+
+# Sample every 100th row due to memory constraints
 train_samp <- train_set[seq(1, nrow(train_set), 100), ]
 
-# Keep only numeric columns with highest coefficients of variation for dimension reduction
-train_num <- train_samp %>% select(neighbors, income)
+# Keep only numeric columns with highest coefficients of variation
+train_num <- train_samp %>% select(all_of(selected_features))
 
-# Binary outputs, black=ransomware, white=non-ransomware, train set
+# Binary outputs, black = ransomware, white = non-ransomware, train set
 train_bw <- train_samp$bw
 
 #Sample every nth row due to memory constraints
-set.seed(23)
 test_samp <- test_set[seq(1, nrow(train_set), 100), ]
 
 # Dimension reduction again
-test_num <- test_samp %>% select(neighbors, income)
+test_num <- test_samp %>% select(all_of(selected_features))
 
 # Same for test set
 test_bw <- test_samp$bw
 
@@ -81,7 +111,8 @@ control <- trainControl(method="cv", number = 10)
 grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
 
 # Train Random Forests model
-rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
+rf_model <- train(train_num, train_bw, method="rf",
+                  trControl = control, tuneGrid=grid)
 
 # Fit model
 fit_rf <- randomForest(train_samp, train_bw,
@@ -90,81 +121,53 @@ fit_rf <- randomForest(train_samp, train_bw,
 # Measure accuracy of model against test sample
 y_hat_rf <- predict(fit_rf, test_samp)
 cm <- confusionMatrix(y_hat_rf, test_bw)
-message("Overall accuracy for the test set is ", cm$overall["Accuracy"])
+message("Overall accuracy for the binary separation is ",
+        cm$overall["Accuracy"])
 cm
 
-# Measure accuracy of model against full validation set
-
-y_hat_rf <- predict(fit_rf, validation)
-cm <- confusionMatrix(y_hat_rf, validation$bw)
-message("Overall accuracy for the validation set is ", cm$overall["Accuracy"])
-cm
 
 # From here, trim down set to ONLY the black addresses and apply SOMs...
 
 message("Now we further categorize black address into ransomware families.")
 
-# Try categorical SOMs on black-only addresses....
-#!! This is NOT right, is it?
-#!! It would be even MORE impressive if I removed all the PREDICTED whites from
-#!! the test set instead and started there.
+# Measure accuracy of model against full ransomware set
+ransomware_y_hat_rf <- predict(fit_rf, ransomware)
+cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
+message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
+cm
 
-blacks <- ransomware %>% filter(!label=="white")
+# Now use this prediction to reduce the original set to only "black" addresses
 
-# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
-set.seed(23)
-test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
+ransomware$predictions <- ransomware_y_hat_rf
 
-workset_blacks <- blacks[-test_index,]
-temp <- blacks[test_index,]
+black_addresses <- ransomware %>% filter(predictions=="black")
 
-# Make sure addresses in validation set are also in working set...
-# validation <- temp %>%
-#   semi_join(workset, by = "address")
+# Keep the validation set's black addresses for validating the SOM below
+validation_blacks <- validation %>% filter(label != "white")
 
-# Add rows removed from validation set back into working set...
-#removed <- anti_join(temp, validation)
-#workset <- rbind(workset, removed)
+# Split the reduced black-predictions into a training set and a test set @ 50%
+test_index <- createDataPartition(y = black_addresses$predictions,
+                                  times = 1, p = .5, list = FALSE)
 
-# ... Or not
-validation_blacks <- temp
+train_set <- black_addresses[-test_index,]
+test_set <- black_addresses[test_index,]
 
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
-set.seed(5)
-test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
-
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
-#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
-
-train_set <- workset_blacks[-test_index,]
-temp <- workset_blacks[test_index,]
-
-# Make sure addresses in validation set are also in working set....
-#test_set <- temp %>%
-#  semi_join(train_set, by = "address")
-
-# Add rows removed from validation set back into working set....
-#removed <- anti_join(temp, test_set)
-#train_set <- rbind(train_set, removed)
-
-# ....Or not
-test_set <- temp
-
-##!! Data preparation is done, now focusing on Self Organizing Maps as our method
-##!! Start here after reworking the data prep steps above.
-
-# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
-train_num <- train_set %>% select(length, weight, count, neighbors, income)
+# Keep only numeric columns, ignoring temporal variables.
+train_num <- train_set %>%
+  select(length, weight, count, looped, neighbors, income)
 
 # SOM function can only work on matrices
 train_mat <- as.matrix(scale(train_num))
 
 # Switching to supervised SOMs
-test_num <- test_set %>% select(length, weight, count, neighbors, income)
+test_num <- test_set %>%
+  select(length, weight, count, looped, neighbors, income)
 
-# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
-test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
-  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+# Testing data is scaled according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num,
+                            center = attr(train_mat, "scaled:center"),
+                            scale = attr(train_mat, "scaled:scale")))
 
 # Categorical
 train_label <- train_set$label %>% classvec2classmat()
@@ -186,17 +189,15 @@ grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
 grid_size
 
 # Create SOM grid
-train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
-
-# Set magic seed number
-set.seed(23)
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
+                      topo="hexagonal", toroidal = TRUE)
 
 ## Now build the model.
 som_model2 <- xyf(train_mat, train_label,
                   grid = train_grid,
                   rlen = 100,
-                  mode="pbatch", # or: alpha = c(0.05,0.01),
-                  cores = detectCores(), # detectCores() - 1 if system locks during calculation
+                  mode="pbatch",
+                  cores = detectCores(),
                   keep.data = TRUE
 )
 
@@ -209,32 +210,40 @@ ransomware_group.prediction <- predict(som_model2, newdata = test_list)
 #table(test_set$label, ransomware_group.prediction$prediction[[2]])
 
 # Confusion Matrix
-cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
+cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
+                             test_set$label)
 message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
 #cm_labels
 
 # Now test predictions of validation set
 
 # Switching to supervised SOMs
-valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
+valid_num <- validation_blacks %>%
+  select(length, weight, count, looped, neighbors, income)
 
-# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
-valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
-  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+# Validation data is scaled using center and spread of training data.
+valid_mat <- as.matrix(scale(valid_num,
+                             center = attr(train_mat, "scaled:center"),
+                             scale = attr(train_mat, "scaled:scale")))
 
 valid_label <- validation_blacks$label
 
 valid_list <- list(independent = valid_mat, dependent = valid_label)
 
-ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
-#table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
+ransomware_group.prediction.validation <- predict(som_model2,
                                                   newdata = valid_list)
+#table(validation_blacks$label,
+#      ransomware_group.prediction.validation$prediction[[2]])
 
 # Confusion Matrix
-cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
-message("Overall accuracy for the validation set is ",cm_labels.validation$overall["Accuracy"])
+cm_labels.validation <-
+  confusionMatrix(ransomware_group.prediction.validation$prediction[[2]],
+                  validation_blacks$label)
+message("Overall accuracy for the validation set is ",
+        cm_labels.validation$overall["Accuracy"])
 #cm_labels.validation
 
-# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
+# Set number of clusters to be equal to number of known ransomware groups
 n_groups <- length(unique(ransomware$label)) - 1
 n_groups