##################################################
## ransomware/Final_method.R
##
## Ransomware Detection on the Bitcoin Blockchain
## using Random Forests and Self-Organizing Maps
##
## Kaylee Robert Tejeda
## October 31, 2021
##################################################
# Timer commands; comment out these lines (and toc() at the end) to skip timing the script
if(!require(tictoc)) install.packages("tictoc")
library(tictoc)
tic(quiet = FALSE)
# Install necessary packages if not already present
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
if(!require(randomForest)) install.packages("randomForest")
if(!require(kohonen)) install.packages("kohonen")
if(!require(parallel)) install.packages("parallel")
# Load Libraries
library(tidyverse)
library(caret)
library(randomForest)
library(kohonen)
library(parallel)
# Download data
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if(!dir.exists("data")) dir.create("data")
if(!file.exists(dest_file)) download.file(url, destfile = dest_file)
# Unzip to extract the CSV
if(!file.exists("data/BitcoinHeistData.csv")) unzip(dest_file, "BitcoinHeistData.csv", exdir = "data")
# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")
# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
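# Optional sanity check (not part of the original pipeline): the data set is
# heavily imbalanced, with far more white than black addresses, which is worth
# keeping in mind when reading the accuracy figures reported below.
print(prop.table(table(ransomware$bw)))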
# Validation set made from 50% of the BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
# Set seed so the splits below are reproducible
set.seed(23)
test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
# Separate into Black and White groups using Random Forests
message("First, separate the addresses into black (ransomware) and white (non-ransomware) groups.")
# Sample every 100th row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# Keep only numeric columns with highest coefficients of variation for dimension reduction
train_num <- train_samp %>% select(neighbors, income)
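# Illustrative check (an assumption, not part of the original method): compare
# coefficients of variation (sd/mean) across the candidate numeric features to
# confirm that neighbors and income, kept above, show the largest spread.
print(train_samp %>%
        summarize(across(c(length, weight, count, looped, neighbors, income),
                         ~ sd(.x) / mean(.x))))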
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_bw <- train_samp$bw
# Sample every 100th row due to memory constraints
set.seed(23)
test_samp <- test_set[seq(1, nrow(test_set), 100), ]
# Dimension reduction again
test_num <- test_samp %>% select(neighbors, income)
# Same for test set
test_bw <- test_samp$bw
# 10-fold cross-validation control (reduce the number of folds to speed this up)
control <- trainControl(method="cv", number = 10)
# Tuning grid for mtry; with only two predictors, the valid values are 1 and 2
grid <- data.frame(mtry = c(1, 2))
# Train Random Forests model
rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
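# Optional diagnostic (not part of the original script): inspect the
# cross-validated tuning results and the mtry value that was selected.
print(rf_model$bestTune)
# plot(rf_model)  # uncomment to visualize accuracy across the mtry grid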
# Fit the final model on the selected predictors using the tuned mtry value
fit_rf <- randomForest(train_num, train_bw,
                       mtry = rf_model$bestTune$mtry)
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_samp)
cm <- confusionMatrix(y_hat_rf, test_bw)
message("Overall accuracy for the test set is ", cm$overall["Accuracy"])
cm
# Measure accuracy of model against full validation set
y_hat_rf <- predict(fit_rf, validation)
cm <- confusionMatrix(y_hat_rf, validation$bw)
message("Overall accuracy for the validation set is ", cm$overall["Accuracy"])
cm
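# Because white addresses vastly outnumber black ones, overall accuracy can look
# high even for weak classifiers; the class-wise metrics below give a fuller
# picture (an added diagnostic, not part of the original output).
print(cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")])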
# From here, trim the data down to ONLY the black addresses and apply SOMs...
message("Now we further categorize black addresses into ransomware families.")
# Categorical SOMs on the black-only addresses.
# Note: a stronger version of this test would start from the addresses the
# random forest PREDICTED to be black, rather than the ground-truth black
# labels; that variation is left for later.
blacks <- ransomware %>% filter(label != "white")
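# Optional diagnostic (an addition): per-family address counts, dropping the
# now-empty "white" level, to see which families dominate the black subset.
print(sort(table(droplevels(blacks$label)), decreasing = TRUE))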
# Validation set made from 50% of the black (ransomware) addresses, reduce later if possible. Categorical outcomes (label)
set.seed(23)
test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
workset_blacks <- blacks[-test_index,]
temp <- blacks[test_index,]
# Optionally, make sure addresses in the validation set are also in the working
# set (semi_join on "address"), adding any removed rows back into the working
# set (anti_join). That step is skipped here and the raw split is used as-is:
# validation <- temp %>% semi_join(workset, by = "address")
# removed <- anti_join(temp, validation)
# workset <- rbind(workset, removed)
validation_blacks <- temp
# Split the black working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes (label)
set.seed(5)
test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
train_set <- workset_blacks[-test_index,]
temp <- workset_blacks[test_index,]
# Optionally, make sure addresses in the test set are also in the training set
# (semi_join on "address"), adding any removed rows back into the training set
# (anti_join). As above, that step is skipped and the raw split is used as-is:
# test_set <- temp %>% semi_join(train_set, by = "address")
# removed <- anti_join(temp, test_set)
# train_set <- rbind(train_set, removed)
test_set <- temp
## Data preparation is done; from here on the method is Self-Organizing Maps.
# Keep only numeric feature columns, ignoring year, day, and looped for now
train_num <- train_set %>% select(length, weight, count, neighbors, income)
# The SOM functions work on matrices, so convert and scale the training data
train_mat <- as.matrix(scale(train_num))
# Prepare the test set the same way for the supervised SOM
test_num <- test_set %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num,
                            center = attr(train_mat, "scaled:center"),
                            scale = attr(train_mat, "scaled:scale")))
# One-hot encode the categorical labels into class-indicator matrices
train_label <- train_set$label %>% classvec2classmat()
# Same for the test set
test_label <- test_set$label %>% classvec2classmat()
# Create the data list for the supervised SOM
train_list <- list(independent = train_mat, dependent = train_label)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1: roughly 5*sqrt(N) map units for N training rows
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Method 2 (alternative, unused): base the grid size on the number of categories
#grid_size <- ceiling(sqrt(length(unique(ransomware$label))))
grid_size
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set seed for reproducibility
set.seed(23)
## Now build the model.
som_model2 <- xyf(train_mat, train_label,
                  grid = train_grid,
                  rlen = 100,
                  mode = "pbatch",       # or: alpha = c(0.05, 0.01)
                  cores = detectCores(), # detectCores() - 1 if system locks during calculation
                  keep.data = TRUE
)
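# Optional diagnostics (not part of the original script): kohonen's built-in
# plots can be used to check that the map has converged and how the training
# observations spread across the map nodes.
plot(som_model2, type = "changes")
plot(som_model2, type = "counts")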
# Now test predictions of test set
# https://clarkdatalabs.github.io/soms/SOM_NBA
test_list <- list(independent = test_mat, dependent = test_label)
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
#table(test_set$label, ransomware_group.prediction$prediction[[2]])
# Confusion Matrix
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
#cm_labels
# Now test predictions on the validation set
# Prepare the validation set the same way as the training and test sets
valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
# As before, rescale the validation data using the training set's centering and scaling
valid_mat <- as.matrix(scale(valid_num,
                             center = attr(train_mat, "scaled:center"),
                             scale = attr(train_mat, "scaled:scale")))
valid_label <- validation_blacks$label %>% classvec2classmat()
valid_list <- list(independent = valid_mat, dependent = valid_label)
ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
#table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
# Confusion Matrix
cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
message("Overall accuracy for the validation set is ",cm_labels.validation$overall["Accuracy"])
#cm_labels.validation
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
n_groups <- length(unique(ransomware$label)) - 1
n_groups
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
plot(som_model2,
main = 'K-Means Clustering',
type = "property",
property = som.cluster$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)
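# Optional check (an assumption, not part of the original method): an elbow
# curve of total within-cluster sum of squares over a range of k values,
# computed on the SOM codebook vectors, as a sanity check on the fixed
# k = n_groups choice above.
wss <- sapply(2:n_groups, function(k) {
  kmeans(data.frame(som_model2$codes[[1]]), centers = k)$tot.withinss
})
plot(2:n_groups, wss, type = "b", xlab = "k", ylab = "Total within-cluster SS")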
# End timer
toc()