The script is complete. The report has all the code from the script, and it compiles. I still need to add the textual parts for chunks 3 and 4. Final step is still the visuals, although I am thinking less is more on that one.

This commit is contained in:
shelldweller 2021-10-20 00:33:26 -06:00
parent b0b78b546c
commit b1c41d674c
3 changed files with 235 additions and 454 deletions

View File

@ -14,6 +14,12 @@
library(tictoc)
tic(quiet = FALSE)
# Set the repository
r = getOption("repos")
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)
rm(r)
# Install necessary packages if not already present
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
@ -37,7 +43,7 @@ dest_file <- "data/data.zip"
if(!dir.exists("data"))dir.create("data")
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
# Unzip into CSV
# Unzip as CSV
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
"BitcoinHeistData.csv",
exdir="data")
@ -50,25 +56,25 @@ ransomware <- ransomware %>%
mutate(label=as.factor(label),
bw=as.factor(ifelse(label=="white", "white", "black")))
# Validation set made from 50% of BitcoinHeist data
# Validation set made from 50% of BitcoinHeist data, for RAM considerations
test_index <- createDataPartition(y = ransomware$bw,
times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%
# Split the working set into a training set and a test set @ 50%, RAM dictated
test_index <- createDataPartition(y = workset$bw,
times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
###############################################################################
## Data preparation is now done
## Separate into "black" and "white" groups using Random Forests predictions
###############################################################################
# Keep only numeric columns, ignoring temporal features
ransomware_num <- ransomware %>%
select(length, weight, count, looped, neighbors, income)
@ -97,59 +103,61 @@ train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# Keep only numeric columns with highest coefficients of variation
train_num <- train_samp %>% select(selected_features[1], selected_features[2])
# Binary outputs, black = ransomware, white = non-ransomware, train set
# Binary labels, black = ransomware, white = non-ransomware, train set
train_bw <- train_samp$bw
#Sample every nth row due to memory constraints
# Sample every 100th row due to memory constraints, making the test sample the same size as the training sample
test_samp <- test_set[seq(1, nrow(train_set), 100), ]
# Dimension reduction again
# Dimension reduction again, selecting features with highest CVs
test_num <- test_samp %>% select(selected_features[1], selected_features[2])
# Same for test set
# Binary labels for test set
test_bw <- test_samp$bw
# Lower CV numbers
# Cross Validation, ten fold
control <- trainControl(method="cv", number = 10)
# Control grid with variation on mtry
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
# Train Random Forests model
# Run Cross Validation using control and grid set above
rf_model <- train(train_num, train_bw, method="rf",
trControl = control, tuneGrid=grid)
# Fit model
# Supervised fit of model using cross validated optimization
fit_rf <- randomForest(train_num, train_bw,
mtry = rf_model$bestTune$mtry)
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_samp)
cm <- confusionMatrix(y_hat_rf, test_bw)
cm_test <- confusionMatrix(y_hat_rf, test_bw)
message("Overall accuracy for the binary separation is ",
cm_test$overall["Accuracy"])
cm_test
# Measure accuracy of model against full ransomware set
ransomware_y_hat_rf <- predict(fit_rf, ransomware)
cm_ransomware <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
message("Overall accuracy for the full data set is ",
cm_ransomware$overall["Accuracy"])
cm_ransomware
##############################################################################
## Now we use the Random Forest model to exclude the "white" addresses from
## the full ransomware set, to categorize the "black" addresses into families.
message("Now we further categorize black address into ransomware families.")
##############################################################################
# Now use this prediction to reduce the original set to only "black" addresses
# First append the full set of predictions to the original set.
ransomware$predictions <- ransomware_y_hat_rf
ransomware$prediction <- ransomware_y_hat_rf
# Filter out all the predicted "white" addresses,
# leaving only predicted "black" addresses
black_addresses <- ransomware %>% filter(predictions=="black")
black_addresses <- ransomware %>% filter(prediction=="black")
# Split the reduced black-predictions into a training set and a test set @ 50%
test_index <- createDataPartition(y = black_addresses$predictions,
test_index <- createDataPartition(y = black_addresses$prediction,
times = 1, p = .5, list = FALSE)
train_set <- black_addresses[-test_index,]
@ -160,10 +168,10 @@ test_set <- black_addresses[test_index,]
train_num <- train_set %>%
select(length, weight, count, looped, neighbors, income)
# SOM function can only work on matrices
# SOM function can only work on matrices.
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
# Select non-temporal numerical features only
test_num <- test_set %>%
select(length, weight, count, looped, neighbors, income)
@ -172,64 +180,65 @@ test_mat <- as.matrix(scale(test_num,
center = attr(train_mat, "scaled:center"),
scale = attr(train_mat, "scaled:scale")))
# Categorical
# Categorical labels for training set
train_label <- train_set$label %>% classvec2classmat()
# Same for test set
test_label <- test_set$label %>% classvec2classmat()
# Create Data list for supervised SOM
# Create data list for supervised SOM
train_list <- list(independent = train_mat, dependent = train_label)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
# Formulaic method 1, makes a larger graph in this case
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
# Based on categorical number, method 2, smaller graph with fewer cells
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
grid_size
message("A grid size of ", grid_size, " has been chosen.")
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
topo="hexagonal", toroidal = TRUE)
## Now build the model.
## Now build the SOM model using the supervised method xyf()
som_model2 <- xyf(train_mat, train_label,
grid = train_grid,
rlen = 100,
mode="pbatch",
cores = detectCores(), # Use all cores
# cores = detectCores() - 1, # Leave one core for system
keep.data = TRUE
)
# Now test predictions of test set
# Now test predictions of test set, create data list for test set
test_list <- list(independent = test_mat, dependent = test_label)
# Generate predictions
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
table(test_set$label, ransomware_group.prediction$prediction[[2]])
# Confusion Matrix
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
test_set$label)
message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
cm_labels
#############################################################################
## K-Means Clustering to visualize the categorization of the SOM
## For a good tutorial, visit:
## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
#############################################################################
# Set number of clusters to be equal to number of known ransomware groups
n_groups <- length(unique(ransomware$label)) - 1
n_groups
# Generate k-means clustering
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
# Plot clustering results
plot(som_model2,
main = 'K-Means Clustering',
type = "property",
@ -237,6 +246,9 @@ plot(som_model2,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)
message("Overall accuracy is ", cm_labels$overall["Accuracy"])
#End timer
toc()

View File

@ -1,6 +1,6 @@
---
title: \vspace{1in}Detecting Ransomware Addresses on the Bitcoin Blockchain using Random Forests and Self Organizing Maps
subtitle: \vspace{.5in}HarvardX Final Capstone CYO Project
subtitle: \vspace{.5in}HarvardX PH125.9x Final Capstone CYO Project
\vspace{.5in}
author: "Kaylee Robert Tejeda"
date: "10/31/2021"
@ -79,31 +79,41 @@ These variables are defined rather abstractly, viewing the blockchain as a topol
This data set was discovered while exploring the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php)$^{[4]}$ as suggested in the project instructions. The author of this report, interested in Bitcoin and other cryptocurrencies since (unsuccessfully) mining them on an ASUS netbook in rural Peru in late 2010, used *cryptocurrency* as a preliminary search term. This brought up a single data set entitled ["BitcoinHeist: Ransomware Address Data Set"](https://archive.ics.uci.edu/ml/datasets/BitcoinHeistRansomwareAddressDataset#). The data set was downloaded and the exploration began.
```{r data-prep, echo=FALSE, include=FALSE}
```{r install-load-libraries-and-download-data, echo=FALSE, include=FALSE}
# Set the repository to a known working mirror just in case it has not already been set
cat("Setting Seattle repository")
# Set the repository
r = getOption("repos")
r["CRAN"] = "http://cran.fhcrc.org/"
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)
rm(r)
# Install necessary packages
# Install necessary packages if not already present
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
if(!require(randomForest)) install.packages("randomForest")
if(!require(kohonen)) install.packages("kohonen")
if(!require(parallel)) install.packages("parallel")
if(!require(matrixStats)) install.packages("matrixStats")
# Load Libraries
library(tidyverse)
library(caret)
library(randomForest)
library(kohonen)
library(parallel)
library(matrixStats)
# Download data
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
url <-
"https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if(!dir.exists("data"))dir.create("data")
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
# Unzip
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
# Unzip as CSV
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
"BitcoinHeistData.csv",
exdir="data")
# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")
@ -171,57 +181,80 @@ The original research team downloaded and parsed the entire Bitcoin transaction
It is immediately apparent that this is a rather large data set. The usual practice of partitioning out eighty to ninety percent of the data for a training set results in a data set that is too large to process given the hardware available. For reasons that no longer apply, the original data set was first split in half with 50% reserved as "validation set" and the other 50% used as the "working set". This working set was again split in half, to give a "training set" that was of a reasonable size to deal with. At this point the partitions were small enough to work with, so the sample partitions were not further refined. This is a potential area for later optimization. Careful sampling was carried out to ensure that the ransomware groups were represented in each sample.
```{r dataprep, echo=FALSE, include=FALSE}
```{r data-prep, echo=FALSE, include=FALSE}
# ?? Cluster graphs go at the end.
# Turn labels into factors, "bw" is binary factor for ransomware/non-ransomware
ransomware <- ransomware %>%
mutate(label=as.factor(label),
bw=as.factor(ifelse(label=="white", "white", "black")))
# Install matrixStats package if needed
if(!require(matrixStats)) install.packages("matrixStats")
# Load matrixStats library
library(matrixStats)
# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
# Validation set made from 50% of BitcoinHeist data, for RAM considerations
test_index <- createDataPartition(y = ransomware$bw,
times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, RAM dictated
test_index <- createDataPartition(y = workset$bw,
times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
#Sample every nth row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# Keep only numeric columns
train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
# Scale the numeric columns
train_scaled <- train_num %>% scale()
# Find proportion of full data set that is ransomware
ransomprop <- mean(ransomware$bw=="black")
# Check for NAs
no_nas <- sum(is.na(ransomware))
```
### Exploration and Visualization (do this part last....)
### Exploration and Visualization ( Chunk #2, do this part last....)
The ransomware addresses make up less than 2% of the overall data set. This presents a challenge as the target observations are sparse within the data set, especially when we consider that this is then divided into 29 subsets. In fact, some of the ransomware groups have only a single member, making categorization a dubious task. At least there are no missing values to worry about.
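A quick way to verify this sparsity (a sketch, assuming the `ransomware` table from the data-prep chunk above; the chunk label is new) is to count the addresses per family and inspect the smallest groups:
```{r sparsity-check, echo=FALSE}
# Count addresses per ransomware family; the rarest groups appear last
ransomware %>%
filter(label != "white") %>%
count(label, sort = TRUE) %>%
tail()
```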
```{r cv-calcs, echo=FALSE}
# Keep only numeric columns, ignoring temporal features
ransomware_num <- ransomware %>%
select(length, weight, count, looped, neighbors, income)
# Check for variation across numerical columns using coefficients of variation
#
# Calculate standard deviations for each column
sds <- ransomware_num %>% as.matrix() %>% colSds()
# Calculate means for each column
means <- ransomware_num %>% as.matrix() %>% colMeans()
# Calculate the coefficient of variation (sd / mean) for each column
coeff_vars <- sds / means
# Select the two features with the highest coefficients of variation
selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
#Sample every 100th row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# Keep only numeric columns with highest coefficients of variation
train_num <- train_samp %>% select(selected_features[1], selected_features[2])
# Binary labels, black = ransomware, white = non-ransomware, train set
train_bw <- train_samp$bw
# Sample every 100th row due to memory constraints, making the test sample the same size as the training sample
test_samp <- test_set[seq(1, nrow(train_set), 100), ]
# Dimension reduction again, selecting features with highest CVs
test_num <- test_samp %>% select(selected_features[1], selected_features[2])
# Binary labels for test set
test_bw <- test_samp$bw
```
```{r data-sparseness, echo=FALSE}
@ -238,7 +271,6 @@ knitr::kable(
)
```
Let's take a look at the distribution of the different features. Note how skewed the non-temporal features are, some of them being bimodal:
@ -271,17 +303,12 @@ ggp2
Now we can compare the relative spread of each feature by calculating the coefficient of variation, that is, the standard deviation divided by the mean, for each column. Larger coefficients of variation indicate larger relative spread compared to other columns.
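Concretely, this is just what the hidden `cv-calcs` chunk above computes for each numeric column (shown here for reference, not evaluated):
```{r cv-formula, eval=FALSE}
# Coefficient of variation for each numeric column: sd / mean
sds <- ransomware_num %>% as.matrix() %>% colSds()
means <- ransomware_num %>% as.matrix() %>% colMeans()
coeff_vars <- sds / means
```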
```{r cv-results, echo=FALSE}
message("The features with the highest coefficients of variation are ",
selected_features[1], " and ", selected_features[2],
", which will be used to train the binary model.")
# Summarize results in a table and a plot
knitr::kable(coeff_vars)
@ -293,22 +320,25 @@ plot(coeff_vars)
From this, it appears that *income* has the widest range of variability, followed by *neighbors*. These are also the features that are most strongly skewed to the right, meaning that a few addresses have really high values for each of these features while the bulk of the data set has very low values for these numbers.
Now do the following (after filling in methods, results, and conclusions, since those are done already:
Now do the following (after filling in methods, results, and conclusions, since those are done already):
6) Break into groups somehow. Graph variables per group? Show how the variables are distributed for each ransomware group? Percent ransomware per each day of the week, for example. Is ransomware more prevalent on a particular day of the week? Break other numerical values into bins, and graph percentage per bin. Look for trends and correlations between groups/variables, and display them here. MORE OF THIS....
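One possible starting point for the day-of-week idea is sketched below, assuming `day` encodes the day of the year so that a calendar date can be rebuilt from `year` and `day` (the chunk is hypothetical and not evaluated):
```{r ransomware-by-weekday, eval=FALSE}
# Rebuild calendar dates, then compute the share of black addresses per weekday
train_samp %>%
mutate(date = as.Date(day - 1, origin = paste0(year, "-01-01")),
weekday = weekdays(date)) %>%
group_by(weekday) %>%
summarize(pct_black = mean(bw == "black"))
```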
```{r percent per column, echo=FALSE}
# Do this here
```{r shrimp-percentage, echo=FALSE, include=FALSE}
# Count how many wallets have less than one full bitcoin
shrimp <- train_samp %>% filter(income < 10^8 )
mean(shrimp$bw == "black")
```
```{r shrimp-output, echo=FALSE}
# Print the percentage of wallets with less than one full bitcoin
mean(shrimp$bw == "black")
```
7) Principal Component Analysis can go here. See "Interlinkages of Malaysian Banking Systems" for an example of detailed PCA. Is it exploratory analysis, or is it a predictive method? I was under the assumption that it is a form of analysis, but the paper mentioned extends it to a form of predictive modeling. PCA itself is an unsupervised transformation, so on its own it is exploratory; it only becomes predictive when the resulting components are fed into a model as features. How to do this *right* (?!?!)
@ -320,6 +350,8 @@ mean(shrimp$bw == "black")
#d <- dist(x)
#image(as.matrix(d), col = rev(RColorBrewer::brewer.pal(9, "RdBu"))) # Change colors or Orange/Blue
train_scaled <- scale(train_num)
# Principal Component Analysis
pca <- prcomp(train_scaled)
pca
@ -345,427 +377,176 @@ data.frame(pca$x[,1:2], bw=train_samp$bw) %>%
### Insights Gained from Exploration
From the previous visual and statistical exploration of the data, it becomes clear what the challenge is. Ransomware addresses are very sparse in the data set, making up less than 2% of the addresses. That small percentage is also further classified into 28 groups. Perhaps the original paper was a bit too ambitious in trying to categorize all the addresses into 29 categories, including the "white" addresses. To simplify our approach, we will categorize the addresses in a binary way, either "white" or "black", where "black" signifies an association with ransomware transactions. Asking this as a "ransomware or not-ransomware" question allows for application of methods that are impractical otherwise.
## Modeling approach (chunk #3, needs rewriting of text parts only)
Akcora et al. mention that they tried to model the data using a Random Forests method, but that the complexity of the data set led to problems with that approach.[3] Switching to a binary perspective on the problem might alleviate some of that complexity, and is worth another look. The topological nature of the way the data set has been described numerically led me to search for topological machine learning methods. Searching for *topo* in the documentation for the `caret` package [6] resulted in the entry for Self Organizing Maps, supplied by the `kohonen` package. The description at CRAN [7] was intriguing enough for me to investigate further.
Describe how you started with categorical SOMs, switched to binary SOMs, then applied randomForest to the binary problem, and were surprised by the results. Decided to re-apply categorical SOMs to black-only addresses, as predicted by the binary Random Forests approach. The result is the following two-step approach, with the optional clustering visualizations at the end.
### Method Part 1: Binary Random Forests to isolate ransomware addresses first.
Using the `randomForest` library, we train a model on our training set and test against the "black/white" categorization on our test set.
```{r random-forest-prep, echo=FALSE, include=FALSE}
# Install randomForest package if needed
if(!require(randomForest)) install.packages("randomForest")
library(randomForest)
# Keep only numeric columns with highest coefficients of variation for dimension reduction
train_num <- train_samp %>% select(neighbors, income)
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_bw <- train_samp$bw
#Sample every nth row due to memory constraints
set.seed(23)
test_samp <- test_set[seq(1, nrow(train_set), 100), ]
# Dimension reduction again
test_num <- test_samp %>% select(neighbors, income)
# Same for test set
test_bw <- test_samp$bw
# Lower CV numbers
# Cross Validation, ten fold
control <- trainControl(method="cv", number = 10)
# Control grid with variation on mtry
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
# Train Random Forests model
rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
# Run Cross Validation using control and grid set above
rf_model <- train(train_num, train_bw, method="rf",
trControl = control, tuneGrid=grid)
# Fit model
# Supervised fit of model using cross validated optimization
fit_rf <- randomForest(train_num, train_bw,
mtry = rf_model$bestTune$mtry)
```
We can see that the results are quite good against the smaller test set and the larger validation set.
```{r binary_random_forests-validation, echo=FALSE}
# Check for best tuning parameters
ggplot(rf_model)
rf_model$bestTune
# Check for enough trees
plot(fit_rf)
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_samp)
cm_test <- confusionMatrix(y_hat_rf, test_bw)
message("Confusion Matrix for test set:")
# Measure accuracy of model against full ransomware set
ransomware_y_hat_rf <- predict(fit_rf, ransomware)
cm_ransomware <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
```
```{r random-forest-output, echo=FALSE}
message("Overall accuracy for the binary separation is ",
cm_test$overall["Accuracy"])
cm_test
# Measure accuracy of model against full validation set
y_hat_rf <- predict(fit_rf, validation)
cm_validation <- confusionMatrix(y_hat_rf, validation$bw)
message("Confusion Matrix for validation set:")
cm_validation
message("Overall accuracy for the full data set is ",
cm_ransomware$overall["Accuracy"])
cm_ransomware
```
### Method Part 2: Categorical SOMs to categorize predicted ransomware addresses.
### Method 2: Binary SOMs
If we ask the same question of a more sophisticated, topological approach, how good is the model? Mention how the original paper was topological in nature, and how this led to the investigation of SOMs. Repeat the binary "b/w" approach using SOMs. This accuracy is still pretty good, but not *as* good as the random forest method. Point out how SOMs are really used for classification into _many_ groups. This leads to an Insight! (see above) What if we first _isolate_ the "black" addresses using Random Forests, and then categorize the black-only subset (< 2%) using categorical SOMs? This leads to a 2-part system (a runnable sketch follows the commented-out chunks below).
Note to self: I don't even use this part in the final script. Should I leave it out of the paper too?
```{r soms-prep, echo=FALSE, include=FALSE}
##############################################################################
## Now we use the Random Forest model to exclude the "white" addresses from
## the full ransomware set, to categorize the "black" addresses into families.
##############################################################################
# Now use this prediction to reduce the original set to only "black" addresses
# First append the full set of predictions to the original set.
ransomware$prediction <- ransomware_y_hat_rf
# Filter out all the predicted "white" addresses,
# leaving only predicted "black" addresses
black_addresses <- ransomware %>% filter(prediction=="black")
# Split the reduced black-predictions into a training set and a test set @ 50%
test_index <- createDataPartition(y = black_addresses$prediction,
times = 1, p = .5, list = FALSE)
train_set <- black_addresses[-test_index,]
test_set <- black_addresses[test_index,]
# Keep only numeric columns, ignoring dates and looped.
#train_num <- train_set %>% select(length, weight, count, neighbors, income)
# Keep only numeric columns, ignoring temporal variables.
train_num <- train_set %>%
select(length, weight, count, looped, neighbors, income)
# SOM function can only work on matrices
#train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
#test_num <- test_set %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
#test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
# "scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Binary outputs, black=ransomware, white=non-ransomware, train set
#train_bw <- train_set$bw %>% classvec2classmat()
# Same for test set
#test_bw <- test_set$bw %>% classvec2classmat()
# Create Data list for supervised SOM
#
#train_list <- list(independent = train_mat, dependent = train_bw)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
#grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
#grid_size = ceiling(sqrt(length(unique(ransomware$bw))))
#grid_size
# Create SOM grid
#train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
#set.seed(23)
## Now build the model.
#som_model <- xyf(train_mat, train_bw,
# grid = train_grid,
# rlen = 100,
# mode="pbatch", # or: alpha = c(0.05,0.01),
# cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training
# keep.data = TRUE
#)
# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA
#test_list <- list(independent = test_mat, dependent = test_bw)
#ransomware.prediction <- predict(som_model, newdata = test_list)
# Now test predictions of validation set
# Switching to supervised SOMs
#valid_num <- validation %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
#valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
# "scaled:center"), scale = attr(train_mat, "scaled:scale")))
#valid_bw <- validation$bw
#valid_list <- list(independent = valid_mat, dependent = valid_bw)
# Requires up to 16GB of RAM, skip if resources are limited
#ransomware.prediction.validation <- predict(som_model, newdata = valid_list)
```
```{r binary_soms-cms, echo=FALSE}
#table(test_set$bw, ransomware.prediction$prediction[[2]]) %>% knitr::kable()
#table(validation$bw, ransomware.prediction.validation$prediction[[2]]) %>% knitr::kable()
# These are bogging down the pdf. Choose only a few?
# Visualize clusters
#plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
# cat(" \n")
# Distance map
#plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize counts
#plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize fan diagram
#plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 1
#plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 2
#plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 3
#plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 4
#plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 5
#plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Confusion Matrix
#cm_bw <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$bw)
#cm_bw$overall
# Now test predictions of validation set
# Confusion Matrix
#cm_bw.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$bw)
#cm_bw.validation$overall
```
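For reference, a minimal runnable sketch of the binary SOM fit described above, reconstructed from the commented-out code (it assumes `train_mat`, `test_mat`, and the train/test split from the binary stage earlier in the report, before the black-only filtering, and is not evaluated here):
```{r binary-som-sketch, eval=FALSE}
# Supervised SOM on the binary black/white labels
library(kohonen)
library(parallel)
# One-hot encode the binary labels for xyf()
train_bw_mat <- classvec2classmat(train_set$bw)
# Same grid-size heuristic as the categorical SOM below
bw_grid_size <- round(sqrt(5*sqrt(nrow(train_mat))))
bw_grid <- somgrid(xdim=bw_grid_size, ydim=bw_grid_size,
topo="hexagonal", toroidal = TRUE)
som_model <- xyf(train_mat, train_bw_mat,
grid = bw_grid,
rlen = 100,
mode="pbatch",
cores = detectCores(),
keep.data = TRUE)
# Predict black/white for the test set and check accuracy
test_list_bw <- list(independent = test_mat,
dependent = classvec2classmat(test_set$bw))
ransomware.prediction <- predict(som_model, newdata = test_list_bw)
confusionMatrix(ransomware.prediction$prediction[[2]], test_set$bw)
```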
### Method 3: Categorical SOMs
Describe categorical SOM work here, show results. This is where the pretty colored hex-graphs show up.
```{r categorical_soms, echo=FALSE, include=FALSE}
# Do this here
# Try categorical SOMs on black-only addresses....
#!! This is NOT right, is it?
#!! It would be even MORE impressive if I removed all the PREDICTED whites from
#!! the test set instead and started there.
blacks <- ransomware %>% filter(!label=="white")
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
set.seed(23)
test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
workset_blacks <- blacks[-test_index,]
temp <- blacks[test_index,]
# Make sure addresses in validation set are also in working set...
# validation <- temp %>%
# semi_join(workset, by = "address")
# Add rows removed from validation set back into working set...
#removed <- anti_join(temp, validation)
#workset <- rbind(workset, removed)
# ... Or not
validation_blacks <- temp
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
set.seed(23)
test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
train_set <- workset_blacks[-test_index,]
temp <- workset_blacks[test_index,]
# Make sure addresses in validation set are also in working set....
#test_set <- temp %>%
# semi_join(train_set, by = "address")
# Add rows removed from validation set back into working set....
#removed <- anti_join(temp, test_set)
#train_set <- rbind(train_set, removed)
# ....Or not
test_set <- temp
##!! Data preparation is done, now focusing on Self Organizing Maps as our method
##!! Start here after reworking the data prep steps above.
# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
train_num <- train_set %>% select(length, weight, count, neighbors, income)
# SOM function can only work on matrices
# SOM function can only work on matrices.
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(length, weight, count, neighbors, income)
# Select non-temporal numerical features only
test_num <- test_set %>%
select(length, weight, count, looped, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Testing data is scaled according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num,
center = attr(train_mat, "scaled:center"),
scale = attr(train_mat, "scaled:scale")))
# Categorical
# Categorical labels for training set
train_label <- train_set$label %>% classvec2classmat()
# Same for test set
test_label <- test_set$label %>% classvec2classmat()
# Create Data list for supervised SOM
#
# Create data list for supervised SOM
train_list <- list(independent = train_mat, dependent = train_label)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
# Formulaic method 1, makes a larger graph in this case
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
# Based on categorical number, method 2, smaller graph with fewer cells
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
grid_size
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
set.seed(23)
## Now build the model.
## Now build the SOM model using the supervised method xyf()
som_model2 <- xyf(train_mat, train_label,
grid = train_grid,
rlen = 100,
mode="pbatch",
cores = detectCores(), # Use all cores
# cores = detectCores() - 1, # Leave one core for system
keep.data = TRUE
)
# Now test predictions of test set
# https://clarkdatalabs.github.io/soms/SOM_NBA
# Now test predictions of test set, create data list for test set
test_list <- list(independent = test_mat, dependent = test_label)
# Generate predictions
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
# Confusion Matrix
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
test_set$label)
# Now test predictions of validation set
# Switching to supervised SOMs
valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
valid_label <- validation_blacks$label
valid_list <- list(independent = valid_mat, dependent = valid_label)
ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
```
```{r categorical_soms_cms, echo=FALSE}
```{r soms-output, echo=FALSE}
#table(test_set$label, ransomware_group.prediction$prediction[[2]]) %>% knitr::kable()
message("A grid size of ", grid_size, " has been chosen.")
#table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]]) %>% knitr::kable()
table(test_set$label, ransomware_group.prediction$prediction[[2]])
# These are good plots, fix their display somehow...
# Visualize clusters
#plot(som_model2, type = 'mapping', pch = 19, palette.name = topo.colors)
# cat(" \n")
cm_labels
# Distance map
#plot(som_model2, type = 'quality', pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize counts
#plot(som_model2, type = 'counts', pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize fan diagram
#plot(som_model2, type = 'codes', pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 1
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 2
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 3
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 4
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# cat(" \n")
# Visualize heatmap for variable 5
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# cat(" \n")
message("Overall accuracy is ", cm_labels$overall["Accuracy"])
```
### Clustering Visualizations: K-means clustering
```{r clustering-setup, echo=FALSE, include=FALSE}
#############################################################################
## K-Means Clustering to visualize the categorization of the SOM
## For a good tutorial, visit:
## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
#############################################################################
# Confusion Matrix
cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
cm_labels.validation$overall
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
# Set number of clusters to be equal to number of known ransomware groups
n_groups <- length(unique(ransomware$label)) - 1
n_groups
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
# Generate k-means clustering
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
```
```{r clustering-plot, echo=FALSE}
# Plot clustering results
plot(som_model2,
main = 'K-Means Clustering',
type = "property",
@ -775,19 +556,7 @@ add.cluster.boundaries(som_model2, som.cluster$cluster)
```
### Final Method: Combined Methods 1 and 3
Using the results from Random Forest, isolate the black addresses first, and then run that subset through an SOM algorithm. Compare final results to original paper. These go in a "results" section. (below)
```{r combined_methods, echo=FALSE}
# Do this here
# Still need to put it all into one script, and then reproduce the results here....
```
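In the meantime, a minimal sketch of the combined pipeline is given below, assuming `fit_rf`, `som_model2`, and the scaling attributes of `train_mat` from the chunks above are in scope (the stand-alone script in this commit does the same thing; this chunk is not evaluated):
```{r combined-methods-sketch, eval=FALSE}
# Step 1: the binary Random Forests model flags each address black or white
ransomware$prediction <- predict(fit_rf, ransomware)
blacks_only <- ransomware %>% filter(prediction == "black")
# Step 2: the categorical SOM assigns a ransomware family to each black address
blacks_num <- blacks_only %>%
select(length, weight, count, looped, neighbors, income)
blacks_mat <- as.matrix(scale(blacks_num,
center = attr(train_mat, "scaled:center"),
scale = attr(train_mat, "scaled:scale")))
blacks_list <- list(independent = blacks_mat,
dependent = classvec2classmat(blacks_only$label))
family_pred <- predict(som_model2, newdata = blacks_list)
# Compare predicted families to the known labels
table(blacks_only$label, family_pred$prediction[[2]])
```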
## Results & Performance
## Results & Performance (chunk #4, write up after chunk #3 is done)
### Results
@ -839,7 +608,7 @@ Kyle Hogan, Jason Hennessey, Andrew Miller, Arvind Narayanan, and Nicolas Christ
```{r end timer, echo=FALSE}
# End timer
message("....that's all, folks!")
toc()
```

Binary file not shown.