Final script is done. Now just flesh it out into the report. Skip binary SOMs as they are not needed, although it might be good to include how they were tried and discarded. Keep them in, but at a limited level? Use THIS final version of the script as the template for the report. Include the development sequence if it fits.

This commit is contained in:
shelldweller 2021-10-18 23:43:36 -06:00
parent be6bd62bbc
commit b0b78b546c
2 changed files with 336 additions and 88 deletions

View File

@ -0,0 +1,242 @@
###############################################################################
## Ransomware Detection on the Bitcoin Blockchain
## using Random Forests and Self Organizing Maps
##
## Kaylee Robert Tejeda
## October 31, 2021
##
## Submitted as part of final CYO project for
## HarvardX PH125.9x Capstone Course
##
###############################################################################
# Install any missing packages BEFORE attaching anything, so the script also
# runs on a machine that has never seen them. tictoc is included here because
# it is attached first; "parallel" ships with R itself, so it is only attached
# below and never installed.
required_packages <- c("tictoc", "tidyverse", "caret", "randomForest",
                       "kohonen", "matrixStats")
# requireNamespace() only tests availability; it does not attach the package
is_installed <- vapply(required_packages, requireNamespace, logical(1),
                       quietly = TRUE)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
# Start the timer; the toc() call at the end of the script reports the
# elapsed time. Comment out tic() here and toc() there to run untimed.
library(tictoc)
tic(quiet = FALSE)
# Load Libraries
library(tidyverse)
library(caret)
library(randomForest)
library(kohonen)
library(parallel)
library(matrixStats)
# Location of the zipped data set and of the local copies kept under data/
url <-
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
csv_path <- "data/BitcoinHeistData.csv"
# Create the data directory on the first run only
if (!dir.exists("data")) {
  dir.create("data")
}
# Download the archive unless a copy is already present
if (!file.exists(dest_file)) {
  download.file(url, destfile = dest_file)
}
# Extract the CSV from the archive unless it has already been extracted
if (!file.exists(csv_path)) {
  unzip(zipfile = dest_file, files = "BitcoinHeistData.csv", exdir = "data")
}
# Import data from the extracted CSV
ransomware <- read_csv(csv_path)
# Store the labels as a factor, then derive "bw": "white" where the label is
# "white" and "black" for every other label (ransomware / non-ransomware)
ransomware$label <- as.factor(ransomware$label)
ransomware$bw <- as.factor(
  ifelse(ransomware$label == "white", "white", "black")
)
# Hold out 50% of the BitcoinHeist data as the validation set; the split is
# made on the binary "bw" outcome
test_index <- createDataPartition(
  y = ransomware$bw,
  times = 1,
  p = 0.5,
  list = FALSE
)
validation <- ransomware[test_index, ]
workset <- ransomware[-test_index, ]
# Split the remaining working set 50/50 into a test set and a training set,
# again on the binary outcome
test_index <- createDataPartition(
  y = workset$bw,
  times = 1,
  p = 0.5,
  list = FALSE
)
test_set <- workset[test_index, ]
train_set <- workset[-test_index, ]
###############################################################################
## Data preparation is now done
## Separate into "black" and "white" groups using Random Forests predictions
###############################################################################
# Keep only numeric columns, ignoring temporal features
ransomware_num <- ransomware %>%
  select(length, weight, count, looped, neighbors, income)
# Check for variation across numerical columns using coefficients of
# variation (CV = standard deviation / mean), which puts columns measured on
# different scales on a comparable footing.
feature_matrix <- as.matrix(ransomware_num)
# Standard deviation of each column. Names are set explicitly so the ranking
# below can always report column names, whatever colSds() returns.
sds <- setNames(colSds(feature_matrix), colnames(feature_matrix))
# Mean of each column
means <- colMeans(feature_matrix)
# CV of each column. This must be ordinary division: %/% is integer division
# and would floor every ratio, producing ties and a distorted ranking.
coeff_vars <- sds / means
# Select the two features with the highest coefficients of variation
selected_features <- names(sort(coeff_vars, decreasing = TRUE))[1:2]
message("The features with the highest coefficients of variation are ",
        selected_features[1], " and ", selected_features[2],
        ", which will be used to train the binary model.")
# Sample every 100th row of the training set due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), by = 100), ]
# Keep only the numeric columns with the highest coefficients of variation.
# all_of() makes it explicit that the names come from a character vector.
train_num <- train_samp %>% select(all_of(selected_features))
# Binary outputs, black = ransomware, white = non-ransomware, train set
train_bw <- train_samp$bw
# Sample every 100th row of the test set. The sequence is bounded by the
# test set's own row count (not the training set's) so that no index can
# point past the last row of test_set.
test_samp <- test_set[seq(1, nrow(test_set), by = 100), ]
# Same dimension reduction as for the training sample
test_num <- test_samp %>% select(all_of(selected_features))
# Binary outputs for the test sample
test_bw <- test_samp$bw
# 10-fold cross-validation for tuning
control <- trainControl(method = "cv", number = 10)
# mtry is the number of predictors tried at each split, so it cannot usefully
# exceed the number of predictor columns; tune over every valid value.
grid <- data.frame(mtry = seq_len(ncol(train_num)))
# Tune the Random Forests model on the selected numeric features
rf_model <- train(train_num, train_bw, method = "rf",
                  trControl = control, tuneGrid = grid)
# Fit the final model on the SAME predictors that were tuned. Fitting on
# train_samp would hand the model the "label" and "bw" columns, i.e. the
# answer itself, and make every accuracy figure below meaningless.
# The tuned value is passed as mtry; randomForest() has no "minNode" argument,
# so under that name the value was silently ignored.
fit_rf <- randomForest(train_num, train_bw,
                       mtry = rf_model$bestTune$mtry)
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_num)
cm <- confusionMatrix(y_hat_rf, test_bw)
message("Overall accuracy for the binary separation is ",
        cm$overall["Accuracy"])
cm
##############################################################################
## Now we use the Random Forest model to exclude the "white" addresses from
## the full ransomware set, to categorize the "black" addresses into families.
message("Now we further categorize black address into ransomware families.")
##############################################################################
# Run the binary model over the full ransomware set and compare the result
# with the known "bw" outcome
ransomware_y_hat_rf <- predict(fit_rf, newdata = ransomware)
cm <- confusionMatrix(data = ransomware_y_hat_rf, reference = ransomware$bw)
message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
cm
# Store the predictions alongside the original columns ...
ransomware[["predictions"]] <- ransomware_y_hat_rf
# ... and drop every address predicted "white", which leaves only the
# addresses predicted "black"
black_addresses <- filter(ransomware, predictions == "black")
# Split the predicted-black addresses 50/50 into a test set and a training set
test_index <- createDataPartition(
  y = black_addresses$predictions,
  times = 1,
  p = 0.5,
  list = FALSE
)
test_set <- black_addresses[test_index, ]
train_set <- black_addresses[-test_index, ]
# Numeric predictors for the SOM; the temporal variables are left out
train_num <- select(train_set,
                    length, weight, count, looped, neighbors, income)
test_num <- select(test_set,
                   length, weight, count, looped, neighbors, income)
# The SOM functions work on matrices, so standardise and convert. The
# centring and scaling values are read back from the attributes of the result.
train_mat <- as.matrix(scale(train_num))
train_center <- attr(train_mat, "scaled:center")
train_scale <- attr(train_mat, "scaled:scale")
# The test data is standardised with the training centre and spread, not
# its own, so both sets live on the same scale
test_mat <- as.matrix(scale(test_num,
                            center = train_center,
                            scale = train_scale))
# Convert the categorical labels with classvec2classmat() for use as the
# dependent layer, for the training set and the test set alike
train_label <- classvec2classmat(train_set$label)
test_label <- classvec2classmat(test_set$label)
# Data list for the supervised SOM
train_list <- list(independent = train_mat, dependent = train_label)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Method 1, formulaic: side length of a square grid with about
# 5 * sqrt(number of training rows) nodes
n_train <- nrow(train_set)
grid_size <- round(sqrt(5 * sqrt(n_train)))
# Method 2, based on the number of categories (kept for reference)
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
grid_size
# Square, hexagonal, toroidal SOM grid with grid_size nodes per side
train_grid <- somgrid(
  xdim = grid_size,
  ydim = grid_size,
  topo = "hexagonal",
  toroidal = TRUE
)
# Data list for the test set, mirroring the layers used for training
# NOTE(review): this list carries the true labels in its "dependent" layer;
# confirm that predict() does not use that layer when mapping the test rows
# (kohonen's `whatmap` argument controls which layers are used) - TODO verify
test_list <- list(independent = test_mat, dependent = test_label)
## Now build the model: 100 iterations in parallel batch mode, using every
## core that detectCores() reports
n_cores <- detectCores()
som_model2 <- xyf(
  train_mat, train_label,
  grid = train_grid,
  rlen = 100,
  mode = "pbatch",
  cores = n_cores,
  keep.data = TRUE
)
# Predict the test set and cross-tabulate known labels against predictions
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
table(test_set$label, ransomware_group.prediction$prediction[[2]])
# Confusion matrix of predicted against known labels
cm_labels <- confusionMatrix(
  data = ransomware_group.prediction$prediction[[2]],
  reference = test_set$label
)
message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
cm_labels
#############################################################################
## K-Means Clustering to visualize the categorization of the SOM
## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
#############################################################################
# Use one cluster per distinct label, minus one (the labels include "white",
# which is not a ransomware group)
n_groups <- n_distinct(ransomware$label) - 1
n_groups
# Cluster the codebook vectors of the SOM's first layer
codebook <- data.frame(som_model2$codes[[1]])
som.cluster <- kmeans(codebook, centers = n_groups)
# Colour each SOM unit by its cluster, then outline the clusters
plot(
  som_model2,
  type = "property",
  property = som.cluster$cluster,
  main = "K-Means Clustering",
  palette.name = topo.colors
)
add.cluster.boundaries(som_model2, som.cluster$cluster)
# End timer
toc()

View File

@ -4,8 +4,10 @@
##
## Kaylee Robert Tejeda
## October 31, 2021
##
## Submitted as part of final CYO project for
## HarvardX PH125.9x Capstone Course
##
## Make this header better!!!!
#################################################
# Timer command, uncomment following lines to time script
@ -18,6 +20,7 @@ if(!require(caret)) install.packages("caret")
if(!require(randomForest)) install.packages("randomForest")
if(!require(kohonen)) install.packages("kohonen")
if(!require(parallel)) install.packages("parallel")
if(!require(matrixStats)) install.packages("matrixStats")
# Load Libraries
library(tidyverse)
@ -25,53 +28,80 @@ library(caret)
library(randomForest)
library(kohonen)
library(parallel)
library(matrixStats)
# Download data
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
url <-
"https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if(!dir.exists("data"))dir.create("data")
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
# Unzip into CSV
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
"BitcoinHeistData.csv",
exdir="data")
# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")
# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
ransomware <- ransomware %>%
mutate(label=as.factor(label),
bw=as.factor(ifelse(label=="white", "white", "black")))
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
# Validation set made from 50% of BitcoinHeist data
test_index <- createDataPartition(y = ransomware$bw,
times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%,
test_index <- createDataPartition(y = workset$bw,
times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
# Separate into Black and White groups using Random Forests
## Separate into "black" and "white" groups using Random Forests
message("First to separate in to black and white groups.")
# Keep only numeric columns, ignoring temporal features
ransomware_num <- ransomware %>%
select(length, weight, count, looped, neighbors, income)
#Sample every nth row due to memory constraints
# Check for variation across numerical columns using coefficients of variation
# Calculate standard deviations for each column
sds <- ransomware_num %>% as.matrix() %>% colSds()
# Calculate means for each column
means <- ransomware_num %>% as.matrix() %>% colMeans()
# Calculate CVs for each column
coeff_vars <- sds %/% means
# Select the two features with the highest coefficients of variation
selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
message("The features with the highest coefficients of variation are ",
selected_features[1], selected_features[2],
", which will be used to train the binary model.")
#Sample every 100th row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# Keep only numeric columns with highest coefficients of variation for dimension reduction
train_num <- train_samp %>% select(neighbors, income)
# Keep only numeric columns with highest coefficients of variation
train_num <- train_samp %>% select(selected_features[1], selected_features[2])
# Binary outputs, black=ransomware, white=non-ransomware, train set
# Binary outputs, black = ransomware, white = non-ransomware, train set
train_bw <- train_samp$bw
#Sample every nth row due to memory constraints
set.seed(23)
test_samp <- test_set[seq(1, nrow(train_set), 100), ]
# Dimension reduction again
test_num <- test_samp %>% select(neighbors, income)
test_num <- test_samp %>% select(selected_features[1], selected_features[2])
# Same for test set
test_bw <- test_samp$bw
@ -81,7 +111,8 @@ control <- trainControl(method="cv", number = 10)
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
# Train Random Forests model
rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
rf_model <- train(train_num, train_bw, method="rf",
trControl = control, tuneGrid=grid)
# Fit model
fit_rf <- randomForest(train_samp, train_bw,
@ -90,81 +121,50 @@ fit_rf <- randomForest(train_samp, train_bw,
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_samp)
cm <- confusionMatrix(y_hat_rf, test_bw)
message("Overall accuracy for the test set is ", cm$overall["Accuracy"])
message("Overall accuracy for the binary separation is ",
cm$overall["Accuracy"])
cm
# Measure accuracy of model against full validation set
y_hat_rf <- predict(fit_rf, validation)
cm <- confusionMatrix(y_hat_rf, validation$bw)
message("Overall accuracy for the validation set is ", cm$overall["Accuracy"])
cm
# From here, trim down set to ONLY the black addresses and apply SOMs...
message("Now we further categorize black address into ransomware families.")
# Try categorical SOMs on black-only addresses....
#!! This is NOT right, is it?
#!! It would be even MORE impressive if I removed all the PREDICTED whites from
#!! the test set instead and started there.
# Measure accuracy of model against full ransomware set
ransomware_y_hat_rf <- predict(fit_rf, ransomware)
cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
cm
blacks <- ransomware %>% filter(!label=="white")
# Now use this prediction to reduce the original set to only "black" addresses
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
set.seed(23)
test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
ransomware$predictions <- ransomware_y_hat_rf
workset_blacks <- blacks[-test_index,]
temp <- blacks[test_index,]
black_addresses <- ransomware %>% filter(predictions=="black")
# Make sure addresses in validation set are also in working set...
# validation <- temp %>%
# semi_join(workset, by = "address")
# Split the reduced black-predictions into a training set and a test set @ 50%
test_index <- createDataPartition(y = black_addresses$predictions,
times = 1, p = .5, list = FALSE)
# Add rows removed from validation set back into working set...
#removed <- anti_join(temp, validation)
#workset <- rbind(workset, removed)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
# ... Or not
validation_blacks <- temp
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
set.seed(5)
test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
train_set <- workset_blacks[-test_index,]
temp <- workset_blacks[test_index,]
# Make sure addresses in validation set are also in working set....
#test_set <- temp %>%
# semi_join(train_set, by = "address")
# Add rows removed from validation set back into working set....
#removed <- anti_join(temp, test_set)
#train_set <- rbind(train_set, removed)
# ....Or not
test_set <- temp
##!! Data preparation is done, now focusing on Self Organizing Maps as our method
##!! Start here after reworking the data prep steps above.
# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
train_num <- train_set %>% select(length, weight, count, neighbors, income)
# Keep only numeric columns, ignoring temporal variables.
train_num <- train_set %>%
select(length, weight, count, looped, neighbors, income)
# SOM function can only work on matrices
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(length, weight, count, neighbors, income)
test_num <- test_set %>%
select(length, weight, count, looped, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Testing data is scaled according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num,
center = attr(train_mat, "scaled:center"),
scale = attr(train_mat, "scaled:scale")))
# Categorical
train_label <- train_set$label %>% classvec2classmat()
@ -186,17 +186,15 @@ grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
grid_size
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed number
set.seed(23)
train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
topo="hexagonal", toroidal = TRUE)
## Now build the model.
som_model2 <- xyf(train_mat, train_label,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
cores = detectCores(), # detectCores() - 1 if system locks during calculation
mode="pbatch",
cores = detectCores(),
keep.data = TRUE
)
@ -209,32 +207,40 @@ ransomware_group.prediction <- predict(som_model2, newdata = test_list)
#table(test_set$label, ransomware_group.prediction$prediction[[2]])
# Confusion Matrix
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
test_set$label)
message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
#cm_labels
# Now test predictions of validation set
# Switching to supervised SOMs
valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
valid_num <- validation_blacks %>%
select(length, weight, count, looped, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Validation data is scaled using center and spread of training data.
valid_mat <- as.matrix(scale(valid_num,
center = attr(train_mat, "scaled:center"),
scale = attr(train_mat, "scaled:scale")))
valid_label <- validation_blacks$label
valid_list <- list(independent = valid_mat, dependent = valid_label)
ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
#table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
ransomware_group.prediction.validation <- predict(som_model2,
newdata = valid_list)
#table(validation_blacks$label,
#ransomware_group.prediction.validation$prediction[[2]])
# Confusion Matrix
cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
message("Overall accuracy for the validation set is ",cm_labels.validation$overall["Accuracy"])
cm_labels.validation <-
confusionMatrix(ransomware_group.prediction.validation$prediction[[2]],
validation_blacks$label)
message("Overall accuracy for the validation set is ",
cm_labels.validation$overall["Accuracy"])
#cm_labels.validation
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
# Set number of clusters to be equal to number of known ransomware groups
n_groups <- length(unique(ransomware$label)) - 1
n_groups