From b0b78b546cb459e8cafa0cf2dabd8cf6c5a1b500 Mon Sep 17 00:00:00 2001
From: shelldweller <shelldweller@beauxbead.com>
Date: Mon, 18 Oct 2021 23:43:36 -0600
Subject: [PATCH] Final script is done.  Now just flesh it out into the report.
  Skip binary SOMs as they are not needed, although it might be good to include
 how they were tried and discarded.  Keep them in, but at a limited level?
 Use THIS final version of the script as the template for the report.  Include
 the development sequence if it fits.

---
 Detecting_Bitcoin_Ransomware.R | 242 +++++++++++++++++++++++++++++++++
 Final_method.R                 | 182 +++++++++++++------------
 2 files changed, 336 insertions(+), 88 deletions(-)
 create mode 100644 Detecting_Bitcoin_Ransomware.R

diff --git a/Detecting_Bitcoin_Ransomware.R b/Detecting_Bitcoin_Ransomware.R
new file mode 100644
index 0000000..f4f81e1
--- /dev/null
+++ b/Detecting_Bitcoin_Ransomware.R
@@ -0,0 +1,242 @@
+###############################################################################
+## Ransomware Detection on the Bitcoin Blockchain
+## using Random Forests and Self Organizing Maps
+##
+## Kaylee Robert Tejeda
+## October 31, 2021
+##
+## Submitted as part of final CYO project for
+## HarvardX PH125.9x Capstone Course
+## 
+###############################################################################
+
+# Time the script (tic here, toc at the end); comment out both to disable
+library(tictoc)
+tic(quiet = FALSE)
+
+# Install necessary packages if not already present
+if(!require(tidyverse)) install.packages("tidyverse")
+if(!require(caret)) install.packages("caret")
+if(!require(randomForest)) install.packages("randomForest")
+if(!require(kohonen)) install.packages("kohonen")
+if(!require(parallel)) install.packages("parallel")
+if(!require(matrixStats)) install.packages("matrixStats")
+
+# Load Libraries
+library(tidyverse)
+library(caret)
+library(randomForest)
+library(kohonen)
+library(parallel)
+library(matrixStats)
+
+# Download the zipped data set from the UCI archive unless already present
+url <- 
+  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
+dest_file <- "data/data.zip"
+if(!dir.exists("data"))dir.create("data")
+if(!file.exists(dest_file))download.file(url, destfile = dest_file)
+
+# Extract BitcoinHeistData.csv into data/ unless it was already unzipped
+if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, 
+                                                   "BitcoinHeistData.csv", 
+                                                   exdir="data")
+
+# Import the extracted CSV as the full `ransomware` data set
+ransomware <- read_csv("data/BitcoinHeistData.csv")
+
+# Turn labels into factors; "bw" is "white" for white labels, else "black"
+ransomware <- ransomware %>%
+  mutate(label=as.factor(label), 
+         bw=as.factor(ifelse(label=="white", "white", "black")))
+
+# Validation set made from 50% of BitcoinHeist data, partitioned on bw
+test_index <- createDataPartition(y = ransomware$bw, 
+                                  times = 1, p = .5, list = FALSE)
+# Rows outside the partition become the working set; the rest are held out
+workset <- ransomware[-test_index,]
+validation <- ransomware[test_index,]
+
+# Split the working set 50/50 on bw; test_index is reused for the new split
+test_index <- createDataPartition(y = workset$bw,
+                                  times = 1, p = .5, list = FALSE)
+
+train_set <- workset[-test_index,]
+test_set <- workset[test_index,]
+
+###############################################################################
+## Data preparation is now done
+## Separate into "black" and "white" groups using Random Forests predictions
+###############################################################################
+
+# Keep only numeric columns, ignoring temporal features
+ransomware_num <- ransomware %>% 
+  select(length, weight, count, looped, neighbors, income)
+
+# Check for variation across numerical columns using coefficients of variation
+# (CV = standard deviation / mean, one value per column)
+# Calculate standard deviations for each column
+sds <- ransomware_num %>% as.matrix() %>% colSds()
+
+# Calculate means for each column
+means <- ransomware_num %>% as.matrix() %>% colMeans()
+
+# Calculate CVs for each column (true division; %/% would floor the ratios)
+coeff_vars <- sds / means
+
+# Select the two features with the highest coefficients of variation
+selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
+
+message("The features with the highest coefficients of variation are ", 
+        selected_features[1], " and ", selected_features[2], 
+        ", which will be used to train the binary model.")
+
+# Sample every 100th row of the training set due to memory constraints
+train_samp <- train_set[seq(1, nrow(train_set), 100), ]
+
+# Keep only numeric columns with highest coefficients of variation
+train_num <- train_samp %>% select(all_of(selected_features))
+
+# Binary outputs, black = ransomware, white = non-ransomware, train set
+train_bw <- train_samp$bw
+
+# Sample every 100th row of the test set, bounded by its own row count
+test_samp <- test_set[seq(1, nrow(test_set), 100), ]
+
+# Dimension reduction again, same selected features as the training sample
+test_num <- test_samp %>% select(all_of(selected_features))
+
+# Same for test set
+test_bw <- test_samp$bw
+
+# 10-fold cross-validation over every mtry the selected predictors allow
+control <- trainControl(method="cv", number = 10)
+grid <- data.frame(mtry = seq_len(ncol(train_num)))
+
+# Tune mtry on the selected numeric predictors only
+rf_model <- train(train_num, train_bw, method="rf", 
+                  trControl = control, tuneGrid=grid)
+
+# Fit on the same predictors (not train_samp, which contains label and bw)
+fit_rf <- randomForest(train_num, train_bw,
+                       mtry = rf_model$bestTune$mtry)
+
+# Measure accuracy of model against test sample, using the same predictors
+y_hat_rf <- predict(fit_rf, test_num)
+cm <- confusionMatrix(y_hat_rf, test_bw)
+message("Overall accuracy for the binary separation is ",
+        cm$overall["Accuracy"])
+cm
+
+##############################################################################
+## Now we use the Random Forest model to exclude the "white" addresses from
+## the full ransomware set, to categorize the "black" addresses into families.
+message("Now we further categorize black address into ransomware families.")
+##############################################################################
+
+# Measure accuracy on the full set, which also contains the training rows
+ransomware_y_hat_rf <- predict(fit_rf, ransomware)
+cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
+message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
+cm
+
+# Now use this prediction to reduce the original set to only "black" addresses
+# First append the full set of predictions to the original set.
+ransomware$predictions <- ransomware_y_hat_rf
+
+# Filter out all the predicted "white" addresses, 
+# leaving only predicted "black" addresses (true "white" rows may remain)
+black_addresses <- ransomware %>% filter(predictions=="black")
+
+# Split the predicted-black rows 50/50; every row has predictions == "black"
+test_index <- createDataPartition(y = black_addresses$predictions,
+                                  times = 1, p = .5, list = FALSE)
+# train_set and test_set are overwritten here for the family-level stage
+train_set <- black_addresses[-test_index,]
+test_set <- black_addresses[test_index,]
+
+
+# Keep only numeric columns, ignoring temporal variables.
+train_num <- train_set %>% 
+  select(length, weight, count, looped, neighbors, income)
+
+# SOM function can only work on matrices; scale() centers and scales columns
+train_mat <- as.matrix(scale(train_num))
+
+# Same numeric columns for the test set
+test_num <- test_set %>% 
+  select(length, weight, count, looped, neighbors, income)
+
+# Testing data is scaled with the center and scale stored on train_mat.
+test_mat <- as.matrix(scale(test_num, 
+                            center = attr(train_mat, "scaled:center"),
+                            scale = attr(train_mat, "scaled:scale")))
+
+# Convert the training labels (ransomware families) into a class matrix
+train_label <- train_set$label %>% classvec2classmat()
+
+# Same for test set
+test_label <- test_set$label %>% classvec2classmat()
+
+# Data list for supervised SOM (the xyf() call below takes the parts directly)
+train_list <- list(independent = train_mat, dependent = train_label)
+
+# Calculate ideal grid size according to:
+# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
+
+# Formulaic method 1: side length = sqrt(5 * sqrt(number of training rows))
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+
+# Based on categorical number, method 2 (disabled alternative)
+#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
+
+grid_size
+
+# Create a square, hexagonal, toroidal SOM grid of grid_size x grid_size units
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, 
+                      topo="hexagonal", toroidal = TRUE)
+
+## Now build the supervised (xyf) model on all detected cores.
+som_model2 <- xyf(train_mat, train_label,
+                  grid = train_grid, 
+                  rlen = 100,
+                  mode="pbatch", 
+                  cores = detectCores(), 
+                  keep.data = TRUE
+)
+
+# Now test predictions of test set
+
+test_list <- list(independent = test_mat, dependent = test_label)
+# NOTE(review): newdata includes true labels; confirm predict() ignores them
+ransomware_group.prediction <- predict(som_model2, newdata = test_list)
+table(test_set$label, ransomware_group.prediction$prediction[[2]])
+
+# Confusion Matrix of the second prediction element against the true labels
+cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
+                             test_set$label)
+message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
+cm_labels
+
+
+#############################################################################
+## K-Means Clustering to visualize the categorization of the SOM
+## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
+#############################################################################
+
+# Number of clusters = number of distinct labels minus one (the "white" label)
+n_groups <- length(unique(ransomware$label)) - 1
+n_groups
+# Cluster the first element of the SOM's codes into n_groups groups
+som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
+# Colour each map unit by its k-means cluster and outline the clusters
+plot(som_model2,
+     main = 'K-Means Clustering',
+     type = "property",
+     property = som.cluster$cluster,
+     palette.name = topo.colors)
+add.cluster.boundaries(som_model2, som.cluster$cluster)
+
+# End the timer started by tic() at the top of the script
+toc()
+
diff --git a/Final_method.R b/Final_method.R
index 0449f59..43b00a8 100644
--- a/Final_method.R
+++ b/Final_method.R
@@ -4,8 +4,10 @@
 ##
 ## Kaylee Robert Tejeda
 ## October 31, 2021
+##
+## Submitted as part of final CYO project for
+## HarvardX PH125.9x Capstone Course
 ## 
-## Make this header better!!!!
 #################################################
 
 # Timer command, uncomment following lines to time script
@@ -18,6 +20,7 @@ if(!require(caret)) install.packages("caret")
 if(!require(randomForest)) install.packages("randomForest")
 if(!require(kohonen)) install.packages("kohonen")
 if(!require(parallel)) install.packages("parallel")
+if(!require(matrixStats)) install.packages("matrixStats")
 
 # Load Libraries
 library(tidyverse)
@@ -25,53 +28,80 @@ library(caret)
 library(randomForest)
 library(kohonen)
 library(parallel)
+library(matrixStats)
 
 # Download data
-url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
+url <- 
+  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
 dest_file <- "data/data.zip"
 if(!dir.exists("data"))dir.create("data")
 if(!file.exists(dest_file))download.file(url, destfile = dest_file)
 
 # Unzip into CSV
-if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
+if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, 
+                                                   "BitcoinHeistData.csv", 
+                                                   exdir="data")
 
 # Import data from CSV
 ransomware <- read_csv("data/BitcoinHeistData.csv")
 
 # Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
-ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
+ransomware <- ransomware %>%
+  mutate(label=as.factor(label), 
+         bw=as.factor(ifelse(label=="white", "white", "black")))
 
-# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
-test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
+# Validation set made from 50% of BitcoinHeist data
+test_index <- createDataPartition(y = ransomware$bw, 
+                                  times = 1, p = .5, list = FALSE)
 
 workset <- ransomware[-test_index,]
 validation <- ransomware[test_index,]
 
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
-test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
+# Split the working set into a training set and a test set @ 50%,
+test_index <- createDataPartition(y = workset$bw,
+                                  times = 1, p = .5, list = FALSE)
 
 train_set <- workset[-test_index,]
 test_set <- workset[test_index,]
 
-# Separate into Black and White groups using Random Forests
+## Separate into "black" and "white" groups using Random Forests
 
-message("First to separate in to black and white groups.")
+# Keep only numeric columns, ignoring temporal features
+ransomware_num <- ransomware %>% 
+  select(length, weight, count, looped, neighbors, income)
 
-#Sample every nth row due to memory constraints
+# Check for variation across numerical columns using coefficients of variation
+
+# Calculate standard deviations for each column
+sds <- ransomware_num %>% as.matrix() %>% colSds()
+
+# Calculate means for each column
+means <- ransomware_num %>% as.matrix() %>% colMeans()
+
+# Calculate CVs for each column (true division; %/% would floor the ratios)
+coeff_vars <- sds / means
+
+# Select the two features with the highest coefficients of variation
+selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
+
+message("The features with the highest coefficients of variation are ", 
+        selected_features[1], " and ", selected_features[2], 
+        ", which will be used to train the binary model.")
+
+#Sample every 100th row due to memory constraints
 train_samp <- train_set[seq(1, nrow(train_set), 100), ]
 
-# Keep only numeric columns with highest coefficients of variation for dimension reduction
-train_num <- train_samp %>% select(neighbors, income)
+# Keep only numeric columns with highest coefficients of variation
+train_num <- train_samp %>% select(selected_features[1], selected_features[2])
 
-# Binary outputs, black=ransomware, white=non-ransomware, train set
+# Binary outputs, black = ransomware, white = non-ransomware, train set
 train_bw <- train_samp$bw 
 
 #Sample every nth row due to memory constraints
-set.seed(23)
 test_samp <- test_set[seq(1, nrow(train_set), 100), ]
 
 # Dimension reduction again
-test_num <- test_samp %>% select(neighbors, income)
+test_num <- test_samp %>% select(selected_features[1], selected_features[2])
 
 # Same for test set
 test_bw <- test_samp$bw 
@@ -81,7 +111,8 @@ control <- trainControl(method="cv", number = 10)
 grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
 
 # Train Random Forests model 
-rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
+rf_model <- train(train_num, train_bw, method="rf", 
+                  trControl = control, tuneGrid=grid)
 
 # Fit model
 fit_rf <- randomForest(train_samp, train_bw,
@@ -90,81 +121,50 @@ fit_rf <- randomForest(train_samp, train_bw,
 # Measure accuracy of model against test sample
 y_hat_rf <- predict(fit_rf, test_samp)
 cm <- confusionMatrix(y_hat_rf, test_bw)
-message("Overall accuracy for the test set is ", cm$overall["Accuracy"])
+message("Overall accuracy for the binary separation is ",
+        cm$overall["Accuracy"])
 cm
 
-# Measure accuracy of model against full validation set
-
-y_hat_rf <- predict(fit_rf, validation)
-cm <- confusionMatrix(y_hat_rf, validation$bw)
-message("Overall accuracy for the validation set is ", cm$overall["Accuracy"])
-cm
 
 # From here, trim down set to ONLY the black addresses and apply SOMs...
 
 message("Now we further categorize black address into ransomware families.")
 
-# Try categorical SOMs on black-only addresses....
-#!! This is NOT right, is it? 
-#!! It would be even MORE impressive if I removed all the PREDICTED whites from 
-#!! the test set instead and started there.
+# Measure accuracy of model against full ransomware set
+ransomware_y_hat_rf <- predict(fit_rf, ransomware)
+cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
+message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
+cm
 
-blacks <- ransomware %>% filter(!label=="white")
+# Now use this prediction to reduce the original set to only "black" addresses
 
-# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
-set.seed(23)
-test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
+ransomware$predictions <- ransomware_y_hat_rf
 
-workset_blacks <- blacks[-test_index,]
-temp <- blacks[test_index,]
+black_addresses <- ransomware %>% filter(predictions=="black")
 
-# Make sure addresses in validation set are also in working set...
-# validation <- temp %>% 
-#  semi_join(workset, by = "address")
+# Split the reduced black-predictions into a training set and a test set @ 50%
+test_index <- createDataPartition(y = black_addresses$predictions,
+                                  times = 1, p = .5, list = FALSE)
 
-# Add rows removed from validation set back into working set...
-#removed <- anti_join(temp, validation)
-#workset <- rbind(workset, removed)
+train_set <- black_addresses[-test_index,]  # index built on black_addresses
+test_set <- black_addresses[test_index,]
 
-# ... Or not
-validation_blacks <- temp
 
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
-set.seed(5)
-test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
-
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
-#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
-
-train_set <- workset_blacks[-test_index,]
-temp <- workset_blacks[test_index,]
-
-# Make sure addresses in validation set are also in working set....
-#test_set <- temp %>% 
-#  semi_join(train_set, by = "address")
-
-# Add rows removed from validation set back into working set....
-#removed <- anti_join(temp, test_set)
-#train_set <- rbind(train_set, removed)
-
-# ....Or not
-test_set <- temp
-
-##!! Data preparation is done, now focusing on Self Organizing Maps as our method
-##!! Start here after reworking the data prep steps above.
-
-# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
-train_num <- train_set %>% select(length, weight, count, neighbors, income)
+# Keep only numeric columns, ignoring temporal variables.
+train_num <- train_set %>% 
+  select(length, weight, count, looped, neighbors, income)
 
 # SOM function can only work on matrices
 train_mat <- as.matrix(scale(train_num))
 
 # Switching to supervised SOMs
-test_num <- test_set %>% select(length, weight, count, neighbors, income)
+test_num <- test_set %>% 
+  select(length, weight, count, looped, neighbors, income)
 
-# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
-test_mat <- as.matrix(scale(test_num, center = attr(train_mat, 
-                                                    "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+# Testing data is scaled according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num, 
+                            center = attr(train_mat, "scaled:center"),
+                            scale = attr(train_mat, "scaled:scale")))
 
 # Categorical
 train_label <- train_set$label %>% classvec2classmat()
@@ -186,17 +186,15 @@ grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
 grid_size
 
 # Create SOM grid
-train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
-
-# Set magic seed number
-set.seed(23)
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, 
+                      topo="hexagonal", toroidal = TRUE)
 
 ## Now build the model.
 som_model2 <- xyf(train_mat, train_label,
                   grid = train_grid, 
                   rlen = 100,
-                  mode="pbatch", # or: alpha = c(0.05,0.01),
-                  cores = detectCores(), # detectCores() - 1 if system locks during calculation
+                  mode="pbatch", 
+                  cores = detectCores(), 
                   keep.data = TRUE
 )
 
@@ -209,32 +207,40 @@ ransomware_group.prediction <- predict(som_model2, newdata = test_list)
 #table(test_set$label, ransomware_group.prediction$prediction[[2]])
 
 # Confusion Matrix
-cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
+cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
+                             test_set$label)
 message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
 #cm_labels
 
 # Now test predictions of validation set
 
 # Switching to supervised SOMs
-valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
+valid_num <- validation_blacks %>%
+  select(length, weight, count, looped, neighbors, income)
 
-# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
-valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat, 
-                                                      "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+# Validation data is scaled using center and spread of training data.
+valid_mat <- as.matrix(scale(valid_num, 
+                             center = attr(train_mat, "scaled:center"), 
+                             scale = attr(train_mat, "scaled:scale")))
 
 valid_label <- validation_blacks$label
 
 valid_list <- list(independent = valid_mat, dependent = valid_label)
 
-ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
-#table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
+ransomware_group.prediction.validation <- predict(som_model2, 
+                                                  newdata = valid_list)
+#table(validation_blacks$label, 
+#ransomware_group.prediction.validation$prediction[[2]])
 
 # Confusion Matrix
-cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
-message("Overall accuracy for the validation set is ",cm_labels.validation$overall["Accuracy"])
+cm_labels.validation <- 
+  confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], 
+                  validation_blacks$label)
+message("Overall accuracy for the validation set is ", 
+        cm_labels.validation$overall["Accuracy"])
 #cm_labels.validation
 
-# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
+# Set number of clusters to be equal to number of known ransomware groups
 n_groups <- length(unique(ransomware$label)) - 1
 n_groups