260 lines
8.6 KiB
R
260 lines
8.6 KiB
R
##################################################
##
## Ransomware Detection on the Bitcoin Blockchain
## using Random Forests and Self Organizing Maps
##
## Kaylee Robert Tejeda
## October 31, 2021
##
## Submitted as part of final CYO project for
## HarvardX PH125.9x Capstone Course
##
##################################################
|
|
|
|
# Timer: comment out the tic()/toc() lines to disable script timing
library(tictoc)
tic(quiet = FALSE)
|
|
|
|
# Install necessary packages if not already present
|
|
if(!require(tidyverse)) install.packages("tidyverse")
|
|
if(!require(caret)) install.packages("caret")
|
|
if(!require(randomForest)) install.packages("randomForest")
|
|
if(!require(kohonen)) install.packages("kohonen")
|
|
if(!require(parallel)) install.packages("parallel")
|
|
if(!require(matrixStats)) install.packages("matrixStats")
|
|
|
|
# Load Libraries
|
|
library(tidyverse)
|
|
library(caret)
|
|
library(randomForest)
|
|
library(kohonen)
|
|
library(parallel)
|
|
library(matrixStats)
|
|
|
|
# Download data
|
|
url <-
|
|
"https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
|
|
dest_file <- "data/data.zip"
|
|
if(!dir.exists("data"))dir.create("data")
|
|
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
|
|
|
|
# Unzip into CSV
|
|
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
|
|
"BitcoinHeistData.csv",
|
|
exdir="data")
|
|
|
|
# Import data from CSV
|
|
ransomware <- read_csv("data/BitcoinHeistData.csv")
|
|
|
|
# Convert labels to factors; bw is a binary factor marking each address as
# ransomware ("black") or non-ransomware ("white").
ransomware <- ransomware %>%
  mutate(
    label = as.factor(label),
    bw = as.factor(ifelse(label == "white", "white", "black"))
  )
|
|
|
|
# Hold out 50% of the BitcoinHeist data as a validation set.
# NOTE(review): there is no set.seed() before createDataPartition(), so the
# splits differ between runs — confirm whether that is intentional.
test_index <- createDataPartition(y = ransomware$bw,
                                  times = 1, p = .5, list = FALSE)

workset <- ransomware[-test_index, ]
validation <- ransomware[test_index, ]

# Split the working set 50/50 into a training set and a test set
test_index <- createDataPartition(y = workset$bw,
                                  times = 1, p = .5, list = FALSE)

train_set <- workset[-test_index, ]
test_set <- workset[test_index, ]
|
|
|
|
## Separate into "black" and "white" groups using Random Forests

# Keep only numeric columns, ignoring temporal features
ransomware_num <- ransomware %>%
  select(length, weight, count, looped, neighbors, income)

# Check for variation across numeric columns using coefficients of
# variation (CV = sd / mean, computed per column).

# Standard deviation of each column
sds <- ransomware_num %>% as.matrix() %>% colSds()

# Mean of each column
means <- ransomware_num %>% as.matrix() %>% colMeans()

# BUG FIX: the original used integer division (%/%), which truncates every
# coefficient of variation to a whole number and can tie or misorder the
# feature ranking. Ordinary division gives the true CV.
coeff_vars <- sds / means

# Select the two features with the highest coefficients of variation
selected_features <- names(sort(coeff_vars, decreasing = TRUE))[1:2]
|
|
|
|
# Report the selected features.
# BUG FIX: the original pasted the two feature names together with no
# separator, printing e.g. "incomeneighbors" instead of "income and neighbors".
message("The features with the highest coefficients of variation are ",
        selected_features[1], " and ", selected_features[2],
        ", which will be used to train the binary model.")
|
|
|
|
# Sample every 100th row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]

# Keep only the numeric columns with the highest coefficients of variation.
# all_of() marks the external character vector explicitly for tidyselect.
train_num <- train_samp %>% select(all_of(selected_features))

# Binary outcomes for the training sample: black = ransomware, white = benign
train_bw <- train_samp$bw

# Sample every 100th row of the test set.
# BUG FIX: the original built this sequence from nrow(train_set); when the
# training set is larger than the test set the indices run past the end and
# produce all-NA rows in test_samp.
test_samp <- test_set[seq(1, nrow(test_set), 100), ]

# Same dimension reduction for the test sample
test_num <- test_samp %>% select(all_of(selected_features))

# Binary outcomes for the test sample
test_bw <- test_samp$bw
|
|
|
|
# Lower CV numbers
|
|
control <- trainControl(method="cv", number = 10)
|
|
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
|
|
|
|
# Train Random Forests model
|
|
rf_model <- train(train_num, train_bw, method="rf",
|
|
trControl = control, tuneGrid=grid)
|
|
|
|
# Fit model
|
|
fit_rf <- randomForest(train_samp, train_bw,
|
|
minNode = rf_model$bestTune$mtry)
|
|
|
|
# Measure accuracy of model against test sample
|
|
y_hat_rf <- predict(fit_rf, test_samp)
|
|
cm <- confusionMatrix(y_hat_rf, test_bw)
|
|
message("Overall accuracy for the binary separation is ",
|
|
cm$overall["Accuracy"])
|
|
cm
|
|
|
|
|
|
# From here, trim the data down to ONLY the black addresses and apply SOMs

message("Now we further categorize black address into ransomware families.")

# Evaluate the binary model against the full ransomware set
ransomware_y_hat_rf <- predict(fit_rf, ransomware)
cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
cm

# Use these predictions to reduce the original set to "black" addresses only
ransomware$predictions <- ransomware_y_hat_rf
black_addresses <- ransomware %>% filter(predictions == "black")
|
|
|
|
# Split the reduced black-predictions into a training set and a test set @ 50%
|
|
test_index <- createDataPartition(y = black_addresses$predictions,
|
|
times = 1, p = .5, list = FALSE)
|
|
|
|
train_set <- workset[-test_index,]
|
|
test_set <- workset[test_index,]
|
|
|
|
|
|
# Keep only numeric columns, ignoring temporal variables.
|
|
train_num <- train_set %>%
|
|
select(length, weight, count, looped, neighbors, income)
|
|
|
|
# SOM function can only work on matrices
|
|
train_mat <- as.matrix(scale(train_num))
|
|
|
|
# Switching to supervised SOMs
|
|
test_num <- test_set %>%
|
|
select(length, weight, count, looped, neighbors, income)
|
|
|
|
# Testing data is scaled according to how we scaled our training data.
|
|
test_mat <- as.matrix(scale(test_num,
|
|
center = attr(train_mat, "scaled:center"),
|
|
scale = attr(train_mat, "scaled:scale")))
|
|
|
|
# Categorical
|
|
train_label <- train_set$label %>% classvec2classmat()
|
|
|
|
# Same for test set
|
|
test_label <- test_set$label %>% classvec2classmat()
|
|
|
|
# Create Data list for supervised SOM
|
|
#
|
|
train_list <- list(independent = train_mat, dependent = train_label)
|
|
|
|
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Formulaic method 1: roughly 5 * sqrt(N) map units on a square grid
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))

# Method 2 (unused): size the grid by the number of known categories
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))

grid_size
|
|
|
|
# Create SOM grid
|
|
train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
|
|
topo="hexagonal", toroidal = TRUE)
|
|
|
|
## Now build the model.
|
|
som_model2 <- xyf(train_mat, train_label,
|
|
grid = train_grid,
|
|
rlen = 100,
|
|
mode="pbatch",
|
|
cores = detectCores(),
|
|
keep.data = TRUE
|
|
)
|
|
|
|
# Now test predictions of test set
|
|
# https://clarkdatalabs.github.io/soms/SOM_NBA
|
|
|
|
test_list <- list(independent = test_mat, dependent = test_label)
|
|
|
|
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
|
|
#table(test_set$label, ransomware_group.prediction$prediction[[2]])
|
|
|
|
# Confusion Matrix
|
|
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
|
|
test_set$label)
|
|
message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
|
|
#cm_labels
|
|
|
|
# Now test predictions of the validation set

# BUG FIX: validation_blacks was used below but never defined anywhere in
# the script. Build it the same way black_addresses was built: run the
# binary Random Forests model over the held-out validation set and keep the
# addresses predicted "black".
validation$predictions <- predict(fit_rf, validation)
validation_blacks <- validation %>% filter(predictions == "black")

# Keep the same numeric feature columns used to train the SOM
valid_num <- validation_blacks %>%
  select(length, weight, count, looped, neighbors, income)

# Scale validation data using the center and spread of the TRAINING data
valid_mat <- as.matrix(scale(valid_num,
                             center = attr(train_mat, "scaled:center"),
                             scale = attr(train_mat, "scaled:scale")))

# CONSISTENCY FIX: encode the labels with classvec2classmat() as was done
# for train_label and test_label; the original passed the raw factor.
valid_label <- validation_blacks$label %>% classvec2classmat()

valid_list <- list(independent = valid_mat, dependent = valid_label)
|
|
|
|
# Predict ransomware families for the validation blacks
ransomware_group.prediction.validation <- predict(som_model2,
                                                  newdata = valid_list)
#table(validation_blacks$label,
#ransomware_group.prediction.validation$prediction[[2]])

# Confusion matrix for the validation set
cm_labels.validation <-
  confusionMatrix(ransomware_group.prediction.validation$prediction[[2]],
                  validation_blacks$label)
message("Overall accuracy for the validation set is ",
        cm_labels.validation$overall["Accuracy"])
#cm_labels.validation
|
|
|
|
# Set number of clusters to be equal to number of known ransomware groups
|
|
n_groups <- length(unique(ransomware$label)) - 1
|
|
n_groups
|
|
|
|
# K-Means Clustering
|
|
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
|
|
|
|
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
|
|
|
|
# Plot the SOM colored by k-means cluster, with cluster boundaries overlaid
plot(som_model2,
     main = 'K-Means Clustering',
     type = "property",
     property = som.cluster$cluster,
     palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)
|
|
|
|
# End timer and report elapsed script time
toc()