###############################################################################
## Ransomware Detection on the Bitcoin Blockchain
## using Random Forests and Self Organizing Maps
##
## Kaylee Robert Tejeda
## November 11, 2021
##
## Submitted as part of final CYO project for
## HarvardX PH125.9x Capstone Course
## 
###############################################################################

# Uncomment next line to time script (tic()/toc() come from the tictoc
# package, which must be installed and loaded first)
#tic()

# Point the CRAN entry of the repository list at the “1: 0-Cloud” mirror for
# maximum availability; any other configured repositories are left as they are
r <- replace(getOption("repos"), "CRAN", "http://cran.rstudio.com")
options(repos = r)
rm(r)

# Install necessary packages if not already present
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
if(!require(randomForest)) install.packages("randomForest")
if(!require(kohonen)) install.packages("kohonen")
if(!require(parallel)) install.packages("parallel")
if(!require(matrixStats)) install.packages("matrixStats")

# Load Libraries
library(tidyverse)
library(caret)
library(randomForest)
library(kohonen)
library(parallel)
library(matrixStats)

# Fetch the zipped data set into the local "data" directory. The directory is
# created and the archive downloaded only when they are not already present.
url <- 
  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if (!dir.exists("data")) {
  dir.create("data")
}
if (!file.exists(dest_file)) {
  download.file(url, destfile = dest_file)
}

# Extract only the CSV from the archive, skipping this if it was already done
if (!file.exists("data/BitcoinHeistData.csv")) {
  unzip(dest_file, files = "BitcoinHeistData.csv", exdir = "data")
}

# Read the extracted CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")

# Convert "label" to a factor, then derive "bw": a two-level factor that is
# "white" for non-ransomware rows and "black" for every other label
ransomware <- ransomware %>%
  mutate(label = as.factor(label)) %>%
  mutate(bw = as.factor(ifelse(label == "white", "white", "black")))

# Hold out 50% of the BitcoinHeist data (partitioned on "bw") as a validation
# set; the remaining half is the working set. The 50% figure is RAM dictated.
test_index <- createDataPartition(
  y = ransomware$bw,
  times = 1,
  p = .5,
  list = FALSE
)

validation <- ransomware[test_index, ]
workset <- ransomware[-test_index, ]

# Halve the working set the same way into a test set and a training set
test_index <- createDataPartition(
  y = workset$bw,
  times = 1,
  p = .5,
  list = FALSE
)

test_set <- workset[test_index, ]
train_set <- workset[-test_index, ]

###############################################################################
## Data preparation is now done
## Separate into "black" and "white" groups using Random Forests predictions
###############################################################################

# Keep only numeric columns, ignoring temporal features
ransomware_num <- ransomware %>% 
  select(length, weight, count, looped, neighbors, income)

# Rank the numeric columns by coefficient of variation (CV = sd / mean)
#
# Standard deviation of each column
sds <- ransomware_num %>% as.matrix() %>% colSds()

# Mean of each column
means <- ransomware_num %>% as.matrix() %>% colMeans()

# CV of each column. This has to be true division: integer division (%/%)
# floors every ratio to a whole number, which collapses distinct CVs into ties
# and makes the ranking below meaningless. The column names are attached
# explicitly so the ranking can always be mapped back to feature names.
coeff_vars <- setNames(sds / means, colnames(ransomware_num))

# Select the two features with the highest coefficients of variation
selected_features <- names(sort(coeff_vars, decreasing = TRUE))[1:2]

message("The features with the highest coefficients of variation are ", 
        selected_features[1], " and ", selected_features[2], 
        ", which will be used to train the binary model.")

# Sample every 100th row of the training set due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]

# Keep only the numeric columns with the highest coefficients of variation
train_num <- train_samp %>% select(all_of(selected_features))

# Binary labels, black = ransomware, white = non-ransomware, train set
train_bw <- train_samp$bw 

# Sample every 100th row of the test set as well. The sequence is bounded by
# the test set's own row count so that an index can never point past its last
# row (which would yield rows of NA if train_set were the longer of the two).
test_samp <- test_set[seq(1, nrow(test_set), 100), ]

# Same dimension reduction as for the training sample
test_num <- test_samp %>% select(all_of(selected_features))

# Binary labels for test set 
test_bw <- test_samp$bw 

# Cross Validation, ten fold
control <- trainControl(method = "cv", number = 10)

# Tuning grid for mtry (number of predictors tried at each split). mtry cannot
# exceed the number of predictors, so the grid covers 1..ncol(train_num);
# larger values would all be reset to the same model by randomForest.
grid <- data.frame(mtry = seq_len(ncol(train_num)))

# Run Cross Validation using control and grid set above
rf_model <- train(train_num, train_bw, method = "rf", 
                  trControl = control, tuneGrid = grid)

# Supervised fit of the model using the cross validated mtry.
# - The predictors are train_num (the selected features only). Fitting on
#   train_samp would hand the forest the "label" and "bw" columns, i.e. the
#   answer itself, and the resulting accuracy would be meaningless.
# - The tuned value is passed as `mtry`; randomForest() has no `minNode`
#   argument, so a value passed under that name is silently ignored.
fit_rf <- randomForest(train_num, train_bw,
                       mtry = rf_model$bestTune$mtry)

# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_num)
cm_test <- confusionMatrix(y_hat_rf, test_bw)
message("Overall accuracy for the binary separation is ",
        cm_test$overall["Accuracy"])
cm_test

# Apply the fitted forest to every row of the full ransomware set (which
# includes the rows the training sample was drawn from) and score the result
# against the true binary labels
ransomware_y_hat_rf <- predict(fit_rf, newdata = ransomware)
cm_ransomware <- confusionMatrix(
  data = ransomware_y_hat_rf,
  reference = ransomware$bw
)
message(
  "Overall accuracy for the full data set is ",
  cm_ransomware$overall[["Accuracy"]]
)
cm_ransomware

##############################################################################
## Now we use the Random Forest model to exclude the "white" addresses from
## the full ransomware set, to categorize the "black" addresses into families.
##############################################################################

# Attach the full-set predictions to the original data as a new column so the
# set can be reduced to the addresses the forest considers "black"
ransomware[["prediction"]] <- ransomware_y_hat_rf

# Drop every row predicted "white"; what remains is predicted "black"
black_addresses <- filter(ransomware, prediction == "black")

# Split the predicted-black rows 50/50 into a test set and a training set.
# The partition is made on "prediction", which holds only "black" after the
# filter above.
test_index <- createDataPartition(
  y = black_addresses$prediction,
  times = 1,
  p = .5,
  list = FALSE
)

test_set <- black_addresses[test_index, ]
train_set <- black_addresses[-test_index, ]

# Non-temporal numeric features of the training and test sets
train_num <- select(train_set, length, weight, count, looped, neighbors, income)
test_num <- select(test_set, length, weight, count, looped, neighbors, income)

# The SOM functions work on matrices: centre and scale the training features
train_mat <- as.matrix(scale(train_num))

# Scale the test features with the centre and scale values that scale()
# recorded on the training matrix, so both sets share one transformation
test_mat <- as.matrix(
  scale(
    test_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# Family labels of both sets converted to class matrices
train_label <- classvec2classmat(train_set$label)
test_label <- classvec2classmat(test_set$label)

# Training data bundled as a list for the supervised SOM
train_list <- list(independent = train_mat, dependent = train_label)

# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Method 1 (formula): side length = sqrt(5 * sqrt(number of training rows)),
# which gives the larger map in this case
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))

# Method 2 (number of categories): a smaller map with fewer cells
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))

message("A grid size of ", grid_size, " has been chosen.")

# Square, hexagonal, toroidal SOM grid with grid_size cells per side
train_grid <- somgrid(
  xdim = grid_size,
  ydim = grid_size,
  topo = "hexagonal",
  toroidal = TRUE
)

## Build the SOM model with the supervised method xyf()
som_model2 <- xyf(
  X = train_mat,
  Y = train_label,
  grid = train_grid,
  rlen = 100,
  mode = "pbatch",
  cores = detectCores(), # Use all cores
  # cores = detectCores() - 1,  # Leave one core for system
  keep.data = TRUE
)

# Bundle the test matrices the same way the training data was bundled
# NOTE(review): this list also carries the true labels ("dependent"); confirm
# that predict() does not use that layer when mapping the test rows
test_list <- list(independent = test_mat, dependent = test_label)

# Predict families for the test set and cross-tabulate true vs. predicted
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
table(
  test_set$label,
  ransomware_group.prediction$prediction[[2]]
)

# Confusion matrix of predicted vs. true family labels
cm_labels <- confusionMatrix(
  data = ransomware_group.prediction$prediction[[2]],
  reference = test_set$label
)
cm_labels

#############################################################################
## K-Means Clustering to visualize the categorization of the SOM
## For a good tutorial, visit:
## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
#############################################################################

# One cluster per known ransomware group: the number of distinct labels
# minus one
n_groups <- length(unique(ransomware$label)) - 1

# K-means over the codebook vectors of the SOM's first layer
som.cluster <- kmeans(
  data.frame(som_model2$codes[[1]]),
  centers = n_groups
)

# Colour each SOM cell by its cluster and outline the cluster boundaries
plot(
  som_model2,
  type = "property",
  property = som.cluster$cluster,
  main = "K-Means Clustering",
  palette.name = topo.colors
)
add.cluster.boundaries(som_model2, som.cluster$cluster)

# Report the overall accuracy taken from cm_labels
message("Overall accuracy is ", cm_labels$overall[["Accuracy"]])

# End timer
#toc(quiet=FALSE)