From e4d3e52b12fd1abae7ebbf23a0fab61921b165b9 Mon Sep 17 00:00:00 2001
From: shelldweller
Date: Thu, 30 Sep 2021 01:19:18 -0600
Subject: [PATCH] Code is in a mostly working state. Need to fix how the
 categorical data set is generated. Need to clean up and fix some of the
 images and graphs. THEN, the report can be written.
---
 Binary_SOM.R                         | 123 +++++++++++++++++++
 Categorical_SOM.R                    | 176 +++++++++++++++++++++++++++
 Cluster_Optimization.R => Clusters.R |  70 ++++++++++-
 Clusters_2.R                         |  32 +++++
 Data_Prep.R                          |  38 ++++++
 Predictions.R                        |  11 +-
 RanFor.R                             |  55 +++++++++
 Ransomware-Bitcoin-Addresses.R       |  32 ++---
 Ransomware-Bitcoin-Addresses.Rmd     |   4 +-
 SOM_test.R                           |  69 +++++++----
 Visuals.R                            |  74 +++++++++++
 11 files changed, 633 insertions(+), 51 deletions(-)
 create mode 100644 Binary_SOM.R
 create mode 100644 Categorical_SOM.R
 rename Cluster_Optimization.R => Clusters.R (57%)
 create mode 100644 Clusters_2.R
 create mode 100644 Data_Prep.R
 create mode 100644 RanFor.R
 create mode 100644 Visuals.R

diff --git a/Binary_SOM.R b/Binary_SOM.R
new file mode 100644
index 0000000..1daa43b
--- /dev/null
+++ b/Binary_SOM.R
@@ -0,0 +1,123 @@
+# Install kohonen package if needed
+if(!require(kohonen)) install.packages("kohonen")
+
+# Load kohonen library
+library(kohonen)
+
+# Install parallel package if needed
+if(!require(parallel)) install.packages("parallel")
+
+# Load parallel library
+library(parallel)
+
+# Keep only numeric columns, ignoring dates and looped.
+train_num <- train_set %>% select(length, weight, count, neighbors, income)
+
+# SOM function can only work on matrices
+train_mat <- as.matrix(scale(train_num))
+
+# Switching to supervised SOMs
+test_num <- test_set %>% select(length, weight, count, neighbors, income)
+
+# Note that when we rescale our testing data, we need to scale it according
+# to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
+  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+# Binary outputs, black=ransomware, white=non-ransomware, train set
+train_grey <- train_set$grey %>% classvec2classmat()
+
+# Same for test set
+test_grey <- test_set$grey %>% classvec2classmat()
+
+# Create data list for supervised SOM
+train_list <- list(independent = train_mat, dependent = train_grey)
+
+# Calculate ideal grid size according to:
+# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
+
+# Formulaic method 1
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+# Based on number of categories, method 2
+#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
+grid_size
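+
+# Quick sanity check of the method 1 heuristic (a sketch: 7e5 stands in for
+# the approximate training-set row count and is not computed from the data):
+round(sqrt(5 * sqrt(7e5)))  # ~65, i.e. a 65x65 grid of roughly 4200 nodes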
+
+# Create SOM grid
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
+
+# Set magic seed for reproducibility
+set.seed(5)
+
+## Now build the model.
+som_model <- xyf(train_mat, train_grey,
+                 grid = train_grid,
+                 rlen = 100,
+                 mode="pbatch", # or: alpha = c(0.05,0.01),
+                 cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training
+                 keep.data = TRUE
+)
+
+# Visualize clusters
+plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
+
+# Quality map (mean distance of mapped objects to their codebook vector)
+plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
+
+# Visualize counts
+plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
+
+# Visualize fan diagram of the codebook vectors
+plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 1
+plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 2
+plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 3
+plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 4
+plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 5
+plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
+
+# Now test predictions
+# https://clarkdatalabs.github.io/soms/SOM_NBA
+
+test_list <- list(independent = test_mat, dependent = test_grey)
+
+ransomware.prediction <- predict(som_model, newdata = test_list)
+table(test_set$grey, ransomware.prediction$prediction[[2]])
+
+# Confusion matrix
+cm_grey <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$grey)
+cm_grey$overall["Accuracy"]
+cm_grey
+
+# Now test predictions of validation set
+
+# Switching to supervised SOMs
+valid_num <- validation %>% select(length, weight, count, neighbors, income)
+
+# Again, rescale the validation data using the training-set centering and scaling.
+valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
+  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+valid_grey <- validation$grey
+
+valid_list <- list(independent = valid_mat, dependent = valid_grey)
+
+# Requires up to 16GB of RAM, skip if resources are limited
+ransomware.prediction.validation <- predict(som_model, newdata = valid_list)
+table(validation$grey, ransomware.prediction.validation$prediction[[2]])
+
+# Confusion matrix
+cm_grey.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$grey)
+cm_grey.validation$overall["Accuracy"]
+cm_grey.validation
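+
+# Cross-check (a sketch): accuracy recomputed directly from the confusion
+# table; this should agree with cm_grey.validation$overall["Accuracy"] above.
+sum(diag(cm_grey.validation$table)) / sum(cm_grey.validation$table)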
+
+# Clean up environment
+rm(dest_file, url, temp)
\ No newline at end of file
diff --git a/Categorical_SOM.R b/Categorical_SOM.R
new file mode 100644
index 0000000..af69a3b
--- /dev/null
+++ b/Categorical_SOM.R
@@ -0,0 +1,176 @@
+# Try categorical SOMs on black-only addresses....
+#!! This is NOT right, is it?
+#!! It would be even MORE impressive if I removed all the PREDICTED whites from
+#!! the test set instead and started there.
+
+blacks <- ransomware %>% filter(!label=="white")
+
+# Validation set made from 50% of BitcoinHeist data, reduce later if possible.
+# Categorical outcomes
+set.seed(5)
+test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
+
+workset_blacks <- blacks[-test_index,]
+temp <- blacks[test_index,]
+
+# Make sure addresses in validation set are also in working set...
+# validation <- temp %>%
+#  semi_join(workset, by = "address")
+
+# Add rows removed from validation set back into working set...
+#removed <- anti_join(temp, validation)
+#workset <- rbind(workset, removed)
+
+# ... Or not
+validation_blacks <- temp
+
+# Split the working set into a training set and a test set @ 50%, reduce later
+# if possible. Categorical outcomes (label)
+set.seed(5)
+test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
+
+# Alternative split on the full working set, kept for reference:
+#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
+
+train_set <- workset_blacks[-test_index,]
+temp <- workset_blacks[test_index,]
+
+# Make sure addresses in test set are also in training set....
+#test_set <- temp %>%
+#  semi_join(train_set, by = "address")
+
+# Add rows removed from test set back into training set....
+#removed <- anti_join(temp, test_set)
+#train_set <- rbind(train_set, removed)
+
+# ....Or not
+test_set <- temp
+
+##!! Data preparation is done, now focusing on Self Organizing Maps as our method
+##!! Start here after reworking the data prep steps above.
+
+# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
+train_num <- train_set %>% select(length, weight, count, neighbors, income)
+
+# SOM function can only work on matrices
+train_mat <- as.matrix(scale(train_num))
+
+# Switching to supervised SOMs
+test_num <- test_set %>% select(length, weight, count, neighbors, income)
+
+# Note that when we rescale our testing data, we need to scale it according
+# to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
+  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+# Categorical outputs, one column per ransomware family
+train_label <- train_set$label %>% classvec2classmat()
+
+# Same for test set
+test_label <- test_set$label %>% classvec2classmat()
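+
+# For reference, classvec2classmat() one-hot encodes a factor into the matrix
+# form xyf() expects. A sketch on a toy factor (not project data):
+classvec2classmat(factor(c("a", "b", "a")))  # rows: 1 0 / 0 1 / 1 0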
+
+# Create data list for supervised SOM
+train_list <- list(independent = train_mat, dependent = train_label)
+
+# Calculate ideal grid size according to:
+# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
+
+# Formulaic method 1
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+# Based on number of categories, method 2
+#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
+grid_size
+
+# Create SOM grid
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
+
+# Set magic seed for reproducibility
+set.seed(5)
+
+## Now build the model.
+som_model2 <- xyf(train_mat, train_label,
+                  grid = train_grid,
+                  rlen = 100,
+                  mode="pbatch", # or: alpha = c(0.05,0.01),
+                  cores = detectCores(), # detectCores() - 1 if system locks during calculation
+                  keep.data = TRUE
+)
+
+# Visualize clusters
+plot(som_model2, type = 'mapping', pch = 19, palette.name = topo.colors)
+
+# Quality map (mean distance of mapped objects to their codebook vector)
+plot(som_model2, type = 'quality', pch = 19, palette.name = topo.colors)
+
+# Visualize counts
+plot(som_model2, type = 'counts', pch = 19, palette.name = topo.colors)
+
+# Visualize fan diagram of the codebook vectors
+plot(som_model2, type = 'codes', pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 1
+plot(som_model2, type = 'property', property = som_model2$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 2
+plot(som_model2, type = 'property', property = som_model2$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 3
+plot(som_model2, type = 'property', property = som_model2$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 4
+plot(som_model2, type = 'property', property = som_model2$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
+
+# Visualize heatmap for variable 5
+plot(som_model2, type = 'property', property = som_model2$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
+
+# Now test predictions of test set
+# https://clarkdatalabs.github.io/soms/SOM_NBA
+
+test_list <- list(independent = test_mat, dependent = test_label)
+
+ransomware_group.prediction <- predict(som_model2, newdata = test_list)
+table(test_set$label, ransomware_group.prediction$prediction[[2]])
+
+# Confusion matrix
+cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
+cm_labels$overall["Accuracy"]
+cm_labels
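+
+# Per-family breakdown (a sketch): class-wise sensitivity from the caret
+# confusion matrix above; with imbalanced families this is more informative
+# than overall accuracy.
+cm_labels$byClass[, "Sensitivity"]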
+
+# Now test predictions of validation set
+
+# Switching to supervised SOMs
+valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
+
+# Again, rescale the validation data using the training-set centering and scaling.
+valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
+  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+valid_label <- validation_blacks$label
+
+valid_list <- list(independent = valid_mat, dependent = valid_label)
+
+ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
+table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
+
+# Confusion matrix
+cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
+cm_labels.validation$overall["Accuracy"]
+cm_labels.validation
+
+# Set number of clusters equal to the number of known ransomware groups (ignoring the whites)
+n_groups <- length(unique(ransomware$label)) - 1
+n_groups
+
+# K-Means Clustering
+# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
+
+som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
+
+plot(som_model2,
+     main = 'K-Means Clustering',
+     type = "property",
+     property = som.cluster$cluster,
+     palette.name = topo.colors)
+add.cluster.boundaries(som_model2, som.cluster$cluster)
+
+# Clean up environment
+rm(grid_size, blacks, test_list, valid_list, temp, som.cluster)
diff --git a/Cluster_Optimization.R b/Clusters.R
similarity index 57%
rename from Cluster_Optimization.R
rename to Clusters.R
index 769f881..cf24c11 100644
--- a/Cluster_Optimization.R
+++ b/Clusters.R
@@ -3,9 +3,9 @@
 #To solve it we use the within-clusters sum of squares method:
 # https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
 
-wss <- (nrow(som.model$codes)-1)*sum(apply(data.frame(som.model$codes),2,var))
+wss <- (nrow(som_model$codes[[1]])-1)*sum(apply(data.frame(som_model$codes[[1]]),2,var))
 for (i in 2:12) {
-  wss[i] <- sum(kmeans(data.frame(som.model$codes), centers=i)$withinss)
+  wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers=i)$withinss)
 }
 
 plot(wss,
@@ -25,17 +25,17 @@ library(factoextra)
 library(NbClust)
 
 # Elbow method
-fviz_nbclust(data.frame(som.model$codes), kmeans, method = "wss") +
+fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") +
   geom_vline(xintercept = 4, linetype = 2) + # add line for better visualisation
   labs(subtitle = "Elbow method") # add subtitle
 
 # Silhouette method
-fviz_nbclust(data.frame(som.model$codes), kmeans, method = "silhouette") +
+fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") +
   labs(subtitle = "Silhouette method")
 
 # Gap statistic
 set.seed(5)
-fviz_nbclust(data.frame(som.model$codes), kmeans,
+fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans,
   nstart = 25,
   method = "gap_stat",
   nboot = 500 # reduce it for lower computation time (but less precise results)
@@ -43,7 +43,7 @@ fviz_nbclust(data.frame(som.model$codes), kmeans,
   labs(subtitle = "Gap statistic method")
 
 nbclust_out <- NbClust(
-  data = data.frame(som.model$codes),
+  data = data.frame(som_model$codes[[1]]),
   distance = "euclidean",
   min.nc = 2, # minimum number of clusters
   max.nc = 12, # maximum number of clusters
@@ -80,3 +80,61 @@ ggplot(nbclust_plot) +
 # k <- nbclust_plot$clusters[which.max(nbclust_plot$???)]
 k <- as.numeric(names(table(nbclust_plot))[which.max(table(nbclust_plot))])
 k
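+
+# What the line above does, on a toy vector (a sketch; votes_demo is made up
+# and is not NbClust output): table() tallies the votes for each candidate k
+# and which.max picks the most frequent one.
+votes_demo <- c(3, 4, 4, 2, 4, 3)
+as.numeric(names(table(votes_demo))[which.max(table(votes_demo))])  # 4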
+
+# K-Means Clustering
+# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
+
+som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers=k)
+
+plot(som_model,
+     main = '',
+     type = "property",
+     property = som.cluster$cluster,
+     palette.name = topo.colors)
+add.cluster.boundaries(som_model, som.cluster$cluster)
+
+# Hierarchical Clustering
+# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
+
+som.cluster <- hclust(dist(data.frame(som_model$codes[[1]])), method="single")
+
+# Note: $order is the dendrogram leaf ordering, not a cluster assignment;
+# cutree(som.cluster, k) would give actual memberships.
+plot(som_model,
+     main = '',
+     type = "property",
+     property = som.cluster$order,
+     palette.name = topo.colors)
+
+add.cluster.boundaries(som_model, som.cluster$order)
+
+# Hierarchical K-Means Clustering: Optimize Clusters
+# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/
+
+# Install factoextra package if needed
+if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
+
+# Load factoextra library
+library(factoextra)
+
+res.hk <- hkmeans(data.frame(som_model$codes[[1]]), k)
+
+# Elements returned by hkmeans()
+names(res.hk)
+
+# Print the results
+str(res.hk)
+
+# Visualize the tree
+fviz_dend(res.hk, cex = 0.6, palette = "jco",
+          rect = TRUE, rect_border = "jco", rect_fill = TRUE)
+
+# Visualize the hkmeans final clusters
+fviz_cluster(res.hk, palette = "jco", repel = TRUE,
+             ggtheme = theme_classic())
+
+plot(som_model,
+     main = '',
+     type = "property",
+     property = res.hk$hclust$order,
+     palette.name = topo.colors)
+
+add.cluster.boundaries(som_model, res.hk$hclust$order)
diff --git a/Clusters_2.R b/Clusters_2.R
new file mode 100644
index 0000000..9029f5d
--- /dev/null
+++ b/Clusters_2.R
@@ -0,0 +1,32 @@
+# Set number of clusters equal to the number of known ransomware groups (ignoring the whites)
+n_groups <- length(unique(ransomware$label)) - 1
+n_groups
+
+# K-Means Clustering
+# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
+
+som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
+
+plot(som_model2,
+     main = '',
+     type = "property",
+     property = som.cluster$cluster,
+     palette.name = topo.colors)
+add.cluster.boundaries(som_model2, som.cluster$cluster)
+
+# Hierarchical Clustering (WPGMA/"mcquitty" linkage)
+# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
+
+som.cluster <- hclust(dist(data.frame(som_model2$codes[[1]])), method="mcquitty")
+
+plot(som_model2,
+     main = '',
+     type = "property",
+     property = som.cluster$order,
+     palette.name = topo.colors)
+
+add.cluster.boundaries(som_model2, som.cluster$order)
+
+# Clean up environment
+#rm(som.cluster)
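+
+# Comparing linkage criteria on the same codebook vectors (a sketch; a
+# cophenetic correlation closer to 1 means that dendrogram preserves the
+# pairwise distances better):
+d_codes <- dist(data.frame(som_model2$codes[[1]]))
+sapply(c("single", "average", "mcquitty"),
+       function(m) cor(d_codes, cophenetic(hclust(d_codes, method = m))))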
diff --git a/Data_Prep.R b/Data_Prep.R
new file mode 100644
index 0000000..1f8fe8c
--- /dev/null
+++ b/Data_Prep.R
@@ -0,0 +1,38 @@
+# Install necessary packages
+if(!require(tidyverse)) install.packages("tidyverse")
+if(!require(caret)) install.packages("caret")
+
+# Load Libraries
+library(tidyverse)
+library(caret)
+
+# Download data
+url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
+dest_file <- "data/data.zip"
+if(!dir.exists("data")) dir.create("data")
+if(!file.exists(dest_file)) download.file(url, destfile = dest_file)
+
+# Unzip
+if(!file.exists("data/BitcoinHeistData.csv")) unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
+
+# Import data from CSV
+ransomware <- read_csv("data/BitcoinHeistData.csv")
+
+# Turn labels into factors; grey is a binary factor for ransomware/non-ransomware
+ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
+
+# Validation set made from 50% of BitcoinHeist data, reduce later if possible.
+# Binary outcomes (grey)
+set.seed(5)  # Set seed for reproducibility, matching the other scripts
+test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
+
+workset <- ransomware[-test_index,]
+validation <- ransomware[test_index,]
+
+# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
+test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
+
+train_set <- workset[-test_index,]
+test_set <- workset[test_index,]
+
+# Clean up environment
+rm(dest_file, url)
diff --git a/Predictions.R b/Predictions.R
index 026c3c0..14442c3 100644
--- a/Predictions.R
+++ b/Predictions.R
@@ -1,8 +1,7 @@
-## Calculate predictions and accuracy, etc.
-## Treat as binary first, then maybe switch to categorical?
+# Now test predictions
+# https://clarkdatalabs.github.io/soms/SOM_NBA
 
-# Binary outputs, black=ransomware, white=non-ransomware
-train_grey <- train_set$grey
+test_list <- list(independent = test_mat, dependent = test_grey)
 
-# Categorical outputs, for each class of ransomware
-train_label <- train_set$label
\ No newline at end of file
+ransomware.prediction <- predict(som_model, newdata = test_list)
+table(test_set$grey, ransomware.prediction$prediction[[2]])
\ No newline at end of file
diff --git a/RanFor.R b/RanFor.R
new file mode 100644
index 0000000..7d1bd31
--- /dev/null
+++ b/RanFor.R
@@ -0,0 +1,55 @@
+# Try Random Forest as Method #1, as suggested by the original paper.
+
+# Install randomForest package if needed
+if(!require(randomForest)) install.packages("randomForest")
+library(randomForest)
+
+# Keep only the numeric columns with the highest coefficients of variation for
+# dimension reduction (train_samp is the thinned training sample from Visuals.R)
+train_num <- train_samp %>% select(neighbors, income)
+
+# Binary outputs, black=ransomware, white=non-ransomware, train set
+train_grey <- train_samp$grey
+
+# Sample every nth row due to memory constraints
+set.seed(5)
+test_samp <- test_set[seq(1, nrow(test_set), 100), ]
+
+# Dimension reduction again
+test_num <- test_samp %>% select(neighbors, income)
+
+# Same for test set
+test_grey <- test_samp$grey
+
+# 10-fold cross-validation; mtry cannot exceed the number of predictors (two here)
+control <- trainControl(method="cv", number = 10)
+grid <- data.frame(mtry = c(1, 2))
+
+# Train Random Forests model
+rf_model <- train(train_num, train_grey, method="rf", trControl = control, tuneGrid=grid)
+
+# Check for best tuning parameters
+ggplot(rf_model)
+rf_model$bestTune
+
+# Fit model with the tuned mtry, using the numeric predictors only
+fit_rf <- randomForest(train_num, train_grey,
+                       mtry = rf_model$bestTune$mtry)
+
+# Check for enough trees
+plot(fit_rf)
+
+# Measure accuracy of model against test sample
+y_hat_rf <- predict(fit_rf, test_samp)
+cm <- confusionMatrix(y_hat_rf, test_grey)
+cm$overall["Accuracy"]
+cm
+
+# Measure accuracy of model against full validation set
+y_hat_rf <- predict(fit_rf, validation)
+cm <- confusionMatrix(y_hat_rf, validation$grey)
+cm$overall["Accuracy"]
+cm
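+
+# With white addresses vastly outnumbering black ones, overall accuracy is an
+# optimistic summary; balanced accuracy from the same confusion matrix is
+# fairer (a sketch, reusing the cm object computed above):
+cm$byClass["Balanced Accuracy"]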
+
+# Clean up environment
+rm(cm, control, fit_rf, grid, y_hat_rf)
diff --git a/Ransomware-Bitcoin-Addresses.R b/Ransomware-Bitcoin-Addresses.R
index c657272..da3dd5e 100644
--- a/Ransomware-Bitcoin-Addresses.R
+++ b/Ransomware-Bitcoin-Addresses.R
@@ -1,6 +1,6 @@
 # Install necessary packages
-if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
-if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")
+if(!require(tidyverse)) install.packages("tidyverse")
+if(!require(caret)) install.packages("caret")
 
 # Load Libraries
 library(tidyverse)
@@ -19,16 +19,18 @@ if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.
 ransomware <- read_csv("data/BitcoinHeistData.csv")
 
 # Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
-ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
+ransomware <- ransomware %>% mutate(label=as.factor(label),
+                                    grey=as.factor(ifelse(label=="white", "white", "black")),
+                                    address=as.factor(address))
 
 # Sample every other row (keeping it simple for now)
 #ransomware <- ransomware[seq(1, nrow(ransomware), 2), ]
 
 # Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
-#test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
+test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
 
 # Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
-test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE)
+#test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE)
 
 workset <- ransomware[-test_index,]
 temp <- ransomware[test_index,]
@@ -37,18 +39,18 @@ temp <- ransomware[test_index,]
 # Make sure addresses in validation set are also in working set...
 # validation <- temp %>%
 #  semi_join(workset, by = "address")
 
-# ... Or not
-validation <- temp
-
 # Add rows removed from validation set back into working set...
 #removed <- anti_join(temp, validation)
 #workset <- rbind(workset, removed)
 
+# ... Or not
+validation <- temp
+
 # Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
-#test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
+test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
 
 # Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
-test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
+#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
 
 train_set <- workset[-test_index,]
 temp <- workset[test_index,]
@@ -65,13 +67,13 @@ temp <- workset[test_index,]
 test_set <- temp
 
 # Clean up environment
-rm(dest_file, url, temp, ransomware)
+rm(dest_file, url, temp)
 
 # Inspect data frames
-test_set %>% str()
-test_set %>% head()
-train_set %>% str()
-train_set %>% head()
+#test_set %>% str()
+#test_set %>% head()
+#train_set %>% str()
+#train_set %>% head()
 
 ## Data preparation is done, now focusing on Self Organizing Maps as our method
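+
+# Quick check (a sketch) that the stratified splits preserved the black/white
+# prevalence across partitions:
+mean(train_set$grey == "black")
+mean(test_set$grey == "black")
+mean(validation$grey == "black")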
diff --git a/Ransomware-Bitcoin-Addresses.Rmd b/Ransomware-Bitcoin-Addresses.Rmd
index af317f2..5ffb9c8 100644
--- a/Ransomware-Bitcoin-Addresses.Rmd
+++ b/Ransomware-Bitcoin-Addresses.Rmd
@@ -153,7 +153,7 @@ ransomware <- read_csv("data/BitcoinHeistData.csv")
 
 ## Motivation
 
-There is much interest in tracing tranactions on the Bitcoin blockchain. One reason is to identify possible ransonware wallet addresses before they are used. See references for further motivation.
+There is much interest in tracing transactions on the Bitcoin blockchain. One reason is to identify possible ransomware wallet addresses before they are used. See references for further motivation.
 
 ## Data Set & Variables
 
@@ -167,7 +167,7 @@ Refer to topological descriptions of variables using DAG and reference paper.
 
 # Data Exploration & Visualization
 
-Self-organizing maps? kohonen package
+Self-organizing maps, using the kohonen package.
 
 # Insights
diff --git a/SOM_test.R b/SOM_test.R
index 291b788..b005aaa 100644
--- a/SOM_test.R
+++ b/SOM_test.R
@@ -1,73 +1,98 @@
 # Install kohonen package if needed
-if(!require(kohonen)) install.packages("kohonen", repos = "http://cran.us.r-project.org")
+if(!require(kohonen)) install.packages("kohonen")
 
 # Load kohonen library
 library(kohonen)
 
-# Keep only numeric columns, ignoring dates for now
-train_num <- train_set %>% select(length, weight, count, looped, neighbors, income)
+# Install parallel package if needed
+if(!require(parallel)) install.packages("parallel")
+
+# Load parallel library
+library(parallel)
+
+# Keep all numeric columns this time, including the dates and looped (insert factor analysis impVar here?)
+train_num <- train_set %>% select(year, day, length, weight, count, looped, neighbors, income)
 
 # SOM function can only work on matrices
-train_SOM <- as.matrix(scale(train_num))
+train_mat <- as.matrix(scale(train_num))
+
+# Switching to supervised SOMs
+test_num <- test_set %>% select(year, day, length, weight, count, looped, neighbors, income)
+
+# Note that when we rescale our testing data, we need to scale it according
+# to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
+  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+## Treat as binary first, then maybe switch to categorical?
+
+# Binary outputs, black=ransomware, white=non-ransomware, train set
+train_grey <- train_set$grey %>% classvec2classmat()
+
+# Same for test set
+test_grey <- test_set$grey %>% classvec2classmat()
+
+# Create data list for supervised SOM
+train_list <- list(independent = train_mat, dependent = train_grey)
 
 # Calculate ideal grid size according to:
 # https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
 
 # Formulaic method 1
-#grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
 # Based on categorical number, method 2
-grid_size = length(unique(train_set$label))
+#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
 grid_size
 
 # Create SOM grid
-train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = FALSE)
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
 
 # Set magic seed for reproducibility
 set.seed(5)
 
-## Now build the ESOM!
-som.model <- som(train_SOM,
+## Now build the model.
+som_model <- xyf(train_mat, train_grey,
                  grid = train_grid,
                  rlen = 100,
                  mode="pbatch", # or: alpha = c(0.05,0.01),
-                 cores = 4, # how to dynamically set number of cores?
+                 cores = detectCores(), # detectCores() - 1 if system locks during calculation
                  keep.data = TRUE
 )
 
 # Visualize clusters
-plot(som.model, type = 'mapping', pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
 
 # Distance map
-plot(som.model, type = 'quality', pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
 
 # Visualize counts
-plot(som.model, type = 'counts', pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
 
 # Visualize fan diagram
-plot(som.model, type = 'codes', pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 1
-plot(som.model, type = 'property', property = getCodes(som.model)[,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 2
-plot(som.model, type = 'property', property = getCodes(som.model)[,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 3
-plot(som.model, type = 'property', property = getCodes(som.model)[,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 4
-plot(som.model, type = 'property', property = getCodes(som.model)[,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 5
-plot(som.model, type = 'property', property = getCodes(som.model)[,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 6
-plot(som.model, type = 'property', property = getCodes(som.model)[,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 7
-#plot(som.model, type = 'property', property = getCodes(som.model)[,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors)
 
 # Visualize heatmap for variable 8
-#plot(som.model, type = 'property', property = getCodes(som.model)[,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors)
+plot(som_model, type = 'property', property = som_model$codes[[1]][,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors)
 
 ## Different cluster methods branch off here...
diff --git a/Visuals.R b/Visuals.R
new file mode 100644
index 0000000..28f7d6e
--- /dev/null
+++ b/Visuals.R
@@ -0,0 +1,74 @@
+# Do some graphical exploration before applying any models.
+# Look at the example work for some ideas.
+# Add any compelling visuals as needed here.
+
+# ?? Cluster graphs go at the end.
+
+# Install matrixStats package if needed
+if(!require(matrixStats)) install.packages("matrixStats")
+
+# Load matrixStats library
+library(matrixStats)
+
+## Principal Component Analysis
+
+names(ransomware)
+str(ransomware)
+
+# Sample every nth row due to memory constraints
+train_samp <- train_set[seq(1, nrow(train_set), 100), ]
+
+# What percentage of the sample is ransomware?
+mean(train_samp$grey=="black")
+
+# Keep only numeric columns
+train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
+
+# Scale the numeric columns
+train_scaled <- train_num %>% scale()
+
+# Histograms of each of the columns to show skewness
+train_num$year %>% hist(main = paste("Histogram of","year"))
+train_num$day %>% hist(main = paste("Histogram of","day"))
+train_num$length %>% hist(main = paste("Histogram of","length"))
+train_num$weight %>% hist(main = paste("Histogram of","weight"))
+train_num$count %>% hist(main = paste("Histogram of","count"))
+train_num$looped %>% hist(main = paste("Histogram of","looped"))
+train_num$neighbors %>% hist(main = paste("Histogram of","neighbors"))
+train_num$income %>% hist(main = paste("Histogram of","income"))
+
+# Check for variability across numerical columns using coefficients of variation
+sds <- train_num %>% as.matrix() %>% colSds()
+means <- train_num %>% as.matrix() %>% colMeans()
+coeff_vars <- sds / means  # coefficient of variation = sd / mean
+plot(coeff_vars)
+coeff_vars
+
+# View distances between points of a sample to look for patterns
+x <- train_scaled %>% as.matrix()
+d <- dist(x)
+image(as.matrix(d), col = rev(RColorBrewer::brewer.pal(9, "RdBu"))) # reversed Red/Blue palette
+
+# Principal Component Analysis
+pca <- prcomp(train_scaled)
+pca
+summary(pca)
+
+pc <- 1:ncol(train_scaled)
+qplot(pc, pca$sdev)
+
+# Plot the first two PCs with color representing black/white
+data.frame(pca$x[,1:2], Grey=train_samp$grey) %>%
+  sample_n(200) %>%
+  ggplot(aes(PC1, PC2, fill = Grey)) +
+  geom_point(cex=3, pch=21) +
+  coord_fixed(ratio = 1)
+
+# First two dimensions do NOT preserve distance very well
+#d_approx <- dist(pca$x[, 1:2])
+#qplot(d, d_approx) + geom_abline(color="red")
+
+# Clean up environment
+rm(pca, x, coeff_vars, d, means, pc, sds)
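+
+# Variance-explained cross-check (a sketch on built-in toy data, since pca is
+# removed by the cleanup above): each component's share of total variance
+# comes straight from prcomp's singular values.
+pca_demo <- prcomp(scale(mtcars[, c("mpg", "hp", "wt")]))
+pca_demo$sdev^2 / sum(pca_demo$sdev^2)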