diff --git a/Binary_SOM.R b/Binary_SOM.R
new file mode 100644
index 0000000..1daa43b
--- /dev/null
+++ b/Binary_SOM.R
@@ -0,0 +1,123 @@
+# Install kohonen package if needed
+if(!require(kohonen)) install.packages("kohonen")
+
+# Load kohonen library
+library(kohonen)
+
+# Install parallel package if needed
+if(!require(parallel)) install.packages("parallel")
+
+# Load parallel library
+library(parallel)
+
+# Keep only numeric columns, ignoring dates and looped.
+train_num <- train_set %>% select(length, weight, count, neighbors, income)
+
+# SOM function can only work on matrices
+train_mat <- as.matrix(scale(train_num))
+
+# Select the same numeric columns from the test set (needed for the supervised SOM)
+test_num <- test_set %>% select(length, weight, count, neighbors, income)
+
+# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
+                "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+# Binary outputs, black=ransomware, white=non-ransomware, train set
+train_grey <- train_set$grey %>% classvec2classmat()
+
+# Same for test set
+test_grey <- test_set$grey %>% classvec2classmat()
+
+# Create data list for supervised SOM
+train_list <- list(independent = train_mat, dependent = train_grey)
+
+# Calculate ideal grid size according to:
+# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
+
+# Formulaic method 1
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+# Based on categorical number, method 2
+#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
+grid_size
+
+# Create SOM grid
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
+
+# Set magic seed for reproducibility
+set.seed(5)
+
+## Now build the model.
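+# xyf() below fits a supervised SOM: train_mat is the independent (numeric) layer and
+# train_grey, built with classvec2classmat(), is the dependent (class) layer.
+# Optionally, training progress can be inspected after fitting with, for example:
+# plot(som_model, type = 'changes')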
+som_model <- xyf(train_mat, train_grey, + grid = train_grid, + rlen = 100, + mode="pbatch", # or: alpha = c(0.05,0.01), + cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training + keep.data = TRUE +) + +# Visualize clusters +plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors) + +# Distance map +plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors) + +# Visualize counts +plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors) + +# Visualize fan diagram +plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 1 +plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 2 +plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 3 +plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 4 +plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 5 +plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors) + +# Now test predictions +# https://clarkdatalabs.github.io/soms/SOM_NBA + +test_list <- list(independent = test_mat, dependent = test_grey) + +ransomware.prediction <- predict(som_model, newdata = test_list) +table(test_set$grey, ransomware.prediction$prediction[[2]]) + +# Confusion Matrix +cm_grey <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$grey) +cm_grey$overall["Accuracy"] +cm_grey + +# Now test predictions of validation set + +# Switching to supervised SOMs +valid_num <- validation %>% select(length, weight, count, neighbors, income) + +# Note that when we rescale our testing data we need to scale it according to how we scaled our training data. +valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat, + "scaled:center"), scale = attr(train_mat, "scaled:scale"))) + +valid_grey <- validation$grey + +valid_list <- list(independent = valid_mat, dependent = valid_grey) + +# Requires up to 16GB of RAM, skip if resources are limited +ransomware.prediction.validation <- predict(som_model, newdata = valid_list) +table(validation$grey, ransomware.prediction.validation$prediction[[2]]) + +# Confusion Matrix +cm_grey.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$grey) +cm_grey.validation$overall["Accuracy"] +cm_grey.validation + +# Clean up environment +rm(dest_file, url, temp) \ No newline at end of file diff --git a/Categorical_SOM.R b/Categorical_SOM.R new file mode 100644 index 0000000..af69a3b --- /dev/null +++ b/Categorical_SOM.R @@ -0,0 +1,176 @@ +# Try categorical SOMs on black-only addresses.... +#!! This is NOT right, is it? +#!! It would be even MORE impressive if I removed all the PREDICTED whites from +#!! the test set instead and started there. + +blacks <- ransomware %>% filter(!label=="white") + +# Validation set made from 50% of BitcoinHeist data, reduce later if possible. 
Categorical outcomes
+set.seed(5)
+test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
+
+workset_blacks <- blacks[-test_index,]
+temp <- blacks[test_index,]
+
+# Make sure addresses in validation set are also in working set...
+# validation <- temp %>%
+#   semi_join(workset, by = "address")
+
+# Add rows removed from validation set back into working set...
+#removed <- anti_join(temp, validation)
+#workset <- rbind(workset, removed)
+
+# ... Or not
+validation_blacks <- temp
+
+# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes (label)
+set.seed(5)
+test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
+
+# Alternative split of the full working set, categorical outcomes
+#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
+
+train_set <- workset_blacks[-test_index,]
+temp <- workset_blacks[test_index,]
+
+# Make sure addresses in test set are also in training set....
+#test_set <- temp %>%
+#   semi_join(train_set, by = "address")
+
+# Add rows removed from test set back into training set....
+#removed <- anti_join(temp, test_set)
+#train_set <- rbind(train_set, removed)
+
+# ....Or not
+test_set <- temp
+
+##!! Data preparation is done, now focusing on Self Organizing Maps as our method
+##!! Start here after reworking the data prep steps above.
+
+# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
+train_num <- train_set %>% select(length, weight, count, neighbors, income)
+
+# SOM function can only work on matrices
+train_mat <- as.matrix(scale(train_num))
+
+# Select the same numeric columns from the test set (needed for the supervised SOM)
+test_num <- test_set %>% select(length, weight, count, neighbors, income)
+
+# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
+                "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+# Categorical outputs, one column per ransomware family, train set
+train_label <- train_set$label %>% classvec2classmat()
+
+# Same for test set
+test_label <- test_set$label %>% classvec2classmat()
+
+# Create data list for supervised SOM
+train_list <- list(independent = train_mat, dependent = train_label)
+
+# Calculate ideal grid size according to:
+# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
+
+# Formulaic method 1
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+# Based on categorical number, method 2
+#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
+grid_size
+
+# Create SOM grid
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
+
+# Set magic seed for reproducibility
+set.seed(5)
+
+## Now build the model.
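+# The dependent layer here is the multi-column label matrix from classvec2classmat(), so
+# predictions assign each address to a ransomware family rather than black/white.
+# Optionally, check the class balance before training with, for example:
+# table(train_set$label)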
+som_model2 <- xyf(train_mat, train_label, + grid = train_grid, + rlen = 100, + mode="pbatch", # or: alpha = c(0.05,0.01), + cores = detectCores(), # detectCores() - 1 if system locks during calculation + keep.data = TRUE +) + +# Visualize clusters +plot(som_model2, type = 'mapping', pch = 19, palette.name = topo.colors) + +# Distance map +plot(som_model2, type = 'quality', pch = 19, palette.name = topo.colors) + +# Visualize counts +plot(som_model2, type = 'counts', pch = 19, palette.name = topo.colors) + +# Visualize fan diagram +plot(som_model2, type = 'codes', pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 1 +plot(som_model2, type = 'property', property = som_model2$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 2 +plot(som_model2, type = 'property', property = som_model2$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 3 +plot(som_model2, type = 'property', property = som_model2$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 4 +plot(som_model2, type = 'property', property = som_model2$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors) + +# Visualize heatmap for variable 5 +plot(som_model2, type = 'property', property = som_model2$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors) + +# Now test predictions of test set +# https://clarkdatalabs.github.io/soms/SOM_NBA + +test_list <- list(independent = test_mat, dependent = test_label) + +ransomware_group.prediction <- predict(som_model2, newdata = test_list) +table(test_set$label, ransomware_group.prediction$prediction[[2]]) + +# Confusion Matrix +cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label) +cm_labels$overall["Accuracy"] +cm_labels + +# Now test predictions of validation set + +# Switching to supervised SOMs +valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income) + +# Note that when we rescale our testing data we need to scale it according to how we scaled our training data. 
+valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat, + "scaled:center"), scale = attr(train_mat, "scaled:scale"))) + + +valid_label <- validation_blacks$label + +valid_list <- list(independent = valid_mat, dependent = valid_label) + +ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list) +table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]]) + +# Confusion Matrix +cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label) +cm_labels.validation$overall["Accuracy"] +cm_labels.validation + +# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites) +n_groups <- length(unique(ransomware$label)) - 1 +n_groups + +# K-Means Clustering +# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/ + +som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups) + +plot(som_model2, + main = 'K-Means Clustering', + type = "property", + property = som.cluster$cluster, + palette.name = topo.colors) +add.cluster.boundaries(som_model2, som.cluster$cluster) + +# Clean up environment +rm(grid_size, blacks, test_list, valid_list, temp, som.cluster) diff --git a/Cluster_Optimization.R b/Clusters.R similarity index 57% rename from Cluster_Optimization.R rename to Clusters.R index 769f881..cf24c11 100644 --- a/Cluster_Optimization.R +++ b/Clusters.R @@ -3,9 +3,9 @@ #To solve it we use the within-clusters sum of squares method: # https://www.polarmicrobes.org/microbial-community-segmentation-with-r/ -wss <- (nrow(som.model$codes)-1)*sum(apply(data.frame(som.model$codes),2,var)) +wss <- (nrow(som_model$codes[[1]])-1)*sum(apply(data.frame(som_model$codes[[1]]),2,var)) for (i in 2:12) { - wss[i] <- sum(kmeans(data.frame(som.model$codes), centers=i)$withinss) + wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers=i)$withinss) } plot(wss, @@ -25,17 +25,17 @@ library(factoextra) library(NbClust) # Elbow method -fviz_nbclust(data.frame(som.model$codes), kmeans, method = "wss") + +fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") + geom_vline(xintercept = 4, linetype = 2) + # add line for better visualisation labs(subtitle = "Elbow method") # add subtitle # Silhouette method -fviz_nbclust(data.frame(som.model$codes), kmeans, method = "silhouette") + +fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") + labs(subtitle = "Silhouette method") # Gap statistic set.seed(5) -fviz_nbclust(data.frame(som.model$codes), kmeans, +fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, nstart = 25, method = "gap_stat", nboot = 500 # reduce it for lower computation time (but less precise results) @@ -43,7 +43,7 @@ fviz_nbclust(data.frame(som.model$codes), kmeans, labs(subtitle = "Gap statistic method") nbclust_out <- NbClust( - data = data.frame(som.model$codes), + data = data.frame(som_model$codes[[1]]), distance = "euclidean", min.nc = 2, # minimum number of clusters max.nc = 12, # maximum number of clusters @@ -80,3 +80,61 @@ ggplot(nbclust_plot) + # k <- nbclust_plot$clusters[which.max(nbclust_plot$???)] k <- as.numeric(names(table(nbclust_plot))[which.max(table(nbclust_plot))]) k + +# K-Means Clustering +# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/ + +som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers=k) + +plot(som_model, + main = '', + type = "property", + property = som.cluster$cluster, + palette.name = topo.colors) 
+add.cluster.boundaries(som_model, som.cluster$cluster) + +# Hierarchical Clustering +# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R + +som.cluster <- hclust(dist(data.frame(som_model$codes[[1]])), method="single") + +plot(som_model, + main = '', + type = "property", + property = som.cluster$order, + palette.name = topo.colors) + +add.cluster.boundaries(som_model, som.cluster$order) + +# Hierarchical K-Means Clustering: Optimize Clusters +# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/ + +# Install factoextra package if needed +if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org") + +# Load factoextra library +library(factoextra) + +res.hk <-hkmeans(data.frame(som_model$codes[[1]]), k) + +# Elements returned by hkmeans() +names(res.hk) + +# Print the results +str(res.hk) + +# Visualize the tree +fviz_dend(res.hk, cex = 0.6, palette = "jco", + rect = TRUE, rect_border = "jco", rect_fill = TRUE) + +# Visualize the hkmeans final clusters +fviz_cluster(res.hk, palette = "jco", repel = TRUE, + ggtheme = theme_classic()) + +plot(som_model, + main = '', + type = "property", + property = res.hk$hclust$order, + palette.name = topo.colors) + +add.cluster.boundaries(som_model, res.hk$hclust$order) diff --git a/Clusters_2.R b/Clusters_2.R new file mode 100644 index 0000000..9029f5d --- /dev/null +++ b/Clusters_2.R @@ -0,0 +1,32 @@ +# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites) +n_groups <- length(unique(ransomware$label)) - 1 +n_groups + +# K-Means Clustering +# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/ + +som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups) + +plot(som_model2, + main = '', + type = "property", + property = som.cluster$cluster, + palette.name = topo.colors) +add.cluster.boundaries(som_model2, som.cluster$cluster) + +# Hierarchical Clustering +# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R + +som.cluster <- hclust(dist(data.frame(som_model2$codes[[1]])), method="mcquitty") + +plot(som_model2, + main = '', + type = "property", + property = som.cluster$order, + palette.name = topo.colors) + +add.cluster.boundaries(som_model2, som.cluster$order) + + +# Clean up environment +#rm(som.cluster) diff --git a/Data_Prep.R b/Data_Prep.R new file mode 100644 index 0000000..1f8fe8c --- /dev/null +++ b/Data_Prep.R @@ -0,0 +1,38 @@ +# Install necessary packages +if(!require(tidyverse)) install.packages("tidyverse") +if(!require(caret)) install.packages("caret") + +# Load Libraries +library(tidyverse) +library(caret) + +# Download data +url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip" +dest_file <- "data/data.zip" +if(!dir.exists("data"))dir.create("data") +if(!file.exists(dest_file))download.file(url, destfile = dest_file) + +# Unzip +if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data") + +# Import data from CSV +ransomware <- read_csv("data/BitcoinHeistData.csv") + +# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware +ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black"))) + +# Validation set made from 50% of BitcoinHeist data, reduce later if possible. 
Binary outcomes (grey)
+test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
+
+workset <- ransomware[-test_index,]
+validation <- ransomware[test_index,]
+
+# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
+test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
+
+train_set <- workset[-test_index,]
+test_set <- workset[test_index,]
+
+# Clean up environment
+rm(dest_file, url)
+
diff --git a/Predictions.R b/Predictions.R
index 026c3c0..14442c3 100644
--- a/Predictions.R
+++ b/Predictions.R
@@ -1,8 +1,7 @@
-## Calculate predictions and accuracy, etc.
-## Treat as binary first, then maybe switch to categorical?
+# Now test predictions
+# https://clarkdatalabs.github.io/soms/SOM_NBA
 
-# Binary outputs, black=ransomware, white=non-ransomware
-train_grey <- train_set$grey
+test_list <- list(independent = test_mat, dependent = test_grey)
 
-# Categorical outputs, for each class of ransomware
-train_label <- train_set$label
\ No newline at end of file
+ransomware.prediction <- predict(som_model, newdata = test_list)
+table(test_set$grey, ransomware.prediction$prediction[[2]])
\ No newline at end of file
diff --git a/RanFor.R b/RanFor.R
new file mode 100644
index 0000000..7d1bd31
--- /dev/null
+++ b/RanFor.R
@@ -0,0 +1,55 @@
+# Try Random Forest as Method #1, as suggested by the original paper.
+
+# Install randomForest package if needed
+if(!require(randomForest)) install.packages("randomForest")
+library(randomForest)
+
+# Keep only the numeric columns with the highest coefficients of variation for dimension reduction
+train_num <- train_samp %>% select(neighbors, income)
+
+# Binary outputs, black=ransomware, white=non-ransomware, train set
+train_grey <- train_samp$grey
+
+# Sample every 100th row of the test set due to memory constraints
+set.seed(5)
+test_samp <- test_set[seq(1, nrow(test_set), 100), ]
+
+# Dimension reduction again
+test_num <- test_samp %>% select(neighbors, income)
+
+# Same for test set
+test_grey <- test_samp$grey
+
+# Cross-validation settings and mtry tuning grid
+control <- trainControl(method="cv", number = 10)
+grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
+
+# Train Random Forest model
+rf_model <- train(train_num, train_grey, method="rf", trControl = control, tuneGrid=grid)
+
+# Check for best tuning parameters
+ggplot(rf_model)
+rf_model$bestTune
+
+# Fit final model with the tuned mtry
+fit_rf <- randomForest(train_num, train_grey,
+                       mtry = rf_model$bestTune$mtry)
+
+# Check for enough trees
+plot(fit_rf)
+
+# Measure accuracy of model against test sample
+y_hat_rf <- predict(fit_rf, test_samp)
+cm <- confusionMatrix(y_hat_rf, test_grey)
+cm$overall["Accuracy"]
+cm
+
+# Measure accuracy of model against full validation set
+
+y_hat_rf <- predict(fit_rf, validation)
+cm <- confusionMatrix(y_hat_rf, validation$grey)
+cm$overall["Accuracy"]
+cm
+
+# Clean up environment
+rm(cm, control, fit_rf, grid, y_hat_rf)
diff --git a/Ransomware-Bitcoin-Addresses.R b/Ransomware-Bitcoin-Addresses.R
index c657272..da3dd5e 100644
--- a/Ransomware-Bitcoin-Addresses.R
+++ b/Ransomware-Bitcoin-Addresses.R
@@ -1,6 +1,6 @@
 # Install necessary packages
-if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
-if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")
+if(!require(tidyverse)) install.packages("tidyverse")
+if(!require(caret)) install.packages("caret")
 
 # Load Libraries
 library(tidyverse)
 library(caret)
@@ -19,16 +19,18 @@
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData. ransomware <- read_csv("data/BitcoinHeistData.csv") # Turn labels into factors, grey is a binary factor for ransomware/non-ransomware -ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black"))) +ransomware <- ransomware %>% mutate(label=as.factor(label), + grey=as.factor(ifelse(label=="white", "white", "black")), + address=as.factor(address)) # Sample every other row (keeping it simple for now) #ransomware <- ransomware[seq(1, nrow(ransomware), 2), ] # Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey) -#test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE) +test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE) # Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes -test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE) +#test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE) workset <- ransomware[-test_index,] temp <- ransomware[test_index,] @@ -37,18 +39,18 @@ temp <- ransomware[test_index,] # validation <- temp %>% # semi_join(workset, by = "address") -# ... Or not -validation <- temp - # Add rows removed from validation set back into working set... #removed <- anti_join(temp, validation) #workset <- rbind(workset, removed) +# ... Or not +validation <- temp + # Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey) -#test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE) +test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE) # Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes -test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE) +#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE) train_set <- workset[-test_index,] temp <- workset[test_index,] @@ -65,13 +67,13 @@ temp <- workset[test_index,] test_set <- temp # Clean up environment -rm(dest_file, url, temp, ransomware) +rm(dest_file, url, temp) # Inspect data frames -test_set %>% str() -test_set %>% head() -train_set %>% str() -train_set %>% head() +#test_set %>% str() +#test_set %>% head() +#train_set %>% str() +#train_set %>% head() ## Data preparation is done, now focusing on Self Organizing Maps as our method diff --git a/Ransomware-Bitcoin-Addresses.Rmd b/Ransomware-Bitcoin-Addresses.Rmd index af317f2..5ffb9c8 100644 --- a/Ransomware-Bitcoin-Addresses.Rmd +++ b/Ransomware-Bitcoin-Addresses.Rmd @@ -153,7 +153,7 @@ ransomware <- read_csv("data/BitcoinHeistData.csv") ## Motivation -There is much interest in tracing tranactions on the Bitcoin blockchain. One reason is to identify possible ransonware wallet addresses before they are used. See references for further motivation. +There is much interest in tracing transactions on the Bitcoin blockchain. One reason is to identify possible ransomware wallet addresses before they are used. See references for further motivation. ## Data Set & Variables @@ -167,7 +167,7 @@ Refer to topological descriptions of variables using DAG and reference paper. # Data Exploration & Visualization -Self-organizing maps? kohonen package +Self-organizing maps. 
Implemented with the kohonen package.
 
 # Insights
diff --git a/SOM_test.R b/SOM_test.R
index 291b788..b005aaa 100644
--- a/SOM_test.R
+++ b/SOM_test.R
@@ -1,73 +1,98 @@
 # Install kohonen package if needed
-if(!require(kohonen)) install.packages("kohonen", repos = "http://cran.us.r-project.org")
+if(!require(kohonen)) install.packages("kohonen")
 
 # Load kohonen library
 library(kohonen)
 
-# Keep only numeric columns, ignoring dates for now
-train_num <- train_set %>% select(length, weight, count, looped, neighbors, income)
+# Install parallel package if needed
+if(!require(parallel)) install.packages("parallel")
+
+# Load parallel library
+library(parallel)
+
+# Keep only numeric columns, including dates and looped for now (insert factor analysis impVar here?)
+train_num <- train_set %>% select(year, day, length, weight, count, looped, neighbors, income)
 
 # SOM function can only work on matrices
-train_SOM <- as.matrix(scale(train_num))
+train_mat <- as.matrix(scale(train_num))
+
+# Select the same numeric columns from the test set (needed for the supervised SOM)
+test_num <- test_set %>% select(year, day, length, weight, count, looped, neighbors, income)
+
+# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
+                "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+
+## Treat as binary first, then maybe switch to categorical?
+
+# Binary outputs, black=ransomware, white=non-ransomware, train set
+train_grey <- train_set$grey %>% classvec2classmat()
+
+# Same for test set
+test_grey <- test_set$grey %>% classvec2classmat()
+
+# Create Data list for supervised SOM
+#
+train_list <- list(independent = train_mat, dependent = train_grey)
 
 # Calculate ideal grid size according to:
 # https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
 
 # Formulaic method 1
-#grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
 # Based on categorical number, method 2
-grid_size = length(unique(train_set$label))
+#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
 grid_size
 
 # Create SOM grid
-train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = FALSE)
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
 
 # Set magic seed for reproducibility
 set.seed(5)
 
-## Now build the ESOM!
-som.model <- som(train_SOM,
+## Now build the model.
+som_model <- xyf(train_mat, train_grey,
                  grid = train_grid,
                  rlen = 100,
                  mode="pbatch", # or: alpha = c(0.05,0.01),
-                 cores = 4, # how to dynamically set number of cores?
+ cores = detectCores(), # detectCores() - 1 if system locks during calculation keep.data = TRUE ) # Visualize clusters -plot(som.model, type = 'mapping', pch = 19, palette.name = topo.colors) +plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors) # Distance map -plot(som.model, type = 'quality', pch = 19, palette.name = topo.colors) +plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors) # Visualize counts -plot(som.model, type = 'counts', pch = 19, palette.name = topo.colors) +plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors) # Visualize fan diagram -plot(som.model, type = 'codes', pch = 19, palette.name = topo.colors) +plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 1 -plot(som.model, type = 'property', property = getCodes(som.model)[,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 2 -plot(som.model, type = 'property', property = getCodes(som.model)[,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 3 -plot(som.model, type = 'property', property = getCodes(som.model)[,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 4 -plot(som.model, type = 'property', property = getCodes(som.model)[,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 5 -plot(som.model, type = 'property', property = getCodes(som.model)[,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 6 -plot(som.model, type = 'property', property = getCodes(som.model)[,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 7 -#plot(som.model, type = 'property', property = getCodes(som.model)[,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors) # Visualize heatmap for variable 8 -#plot(som.model, type = 'property', property = getCodes(som.model)[,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors) +plot(som_model, type = 'property', property = som_model$codes[[1]][,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors) ##Different cluster methods branch off here... diff --git a/Visuals.R b/Visuals.R new file mode 100644 index 0000000..28f7d6e --- /dev/null +++ b/Visuals.R @@ -0,0 +1,74 @@ +# Do some graphical exploration before applying any models. +# Look at the example work for some ideas. 
+# Add any compelling visuals as needed here.
+
+# ?? Cluster graphs go at the end.
+
+# Install matrixStats package if needed
+if(!require(matrixStats)) install.packages("matrixStats")
+
+# Load matrixStats library
+library(matrixStats)
+
+## Principal Component Analysis
+
+names(ransomware)
+str(ransomware)
+
+# Sample every 100th row due to memory constraints
+train_samp <- train_set[seq(1, nrow(train_set), 100), ]
+
+# What percentage of sample is ransomware?
+mean(train_samp$grey=="black")
+
+# Keep only numeric columns
+train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
+
+# Scale the numeric columns
+train_scaled <- train_num %>% scale()
+
+# Histograms of each of the columns to show skewness
+train_num$year %>% hist(main = paste("Histogram of","year"))
+train_num$day %>% hist(main = paste("Histogram of","day"))
+train_num$length %>% hist(main = paste("Histogram of","length"))
+train_num$weight %>% hist(main = paste("Histogram of","weight"))
+train_num$count %>% hist(main = paste("Histogram of","count"))
+train_num$looped %>% hist(main = paste("Histogram of","looped"))
+train_num$neighbors %>% hist(main = paste("Histogram of","neighbors"))
+train_num$income %>% hist(main = paste("Histogram of","income"))
+
+# Check for variability across numerical columns using coefficients of variation
+sds <- train_num %>% as.matrix() %>% colSds()
+means <- train_num %>% as.matrix() %>% colMeans()
+coeff_vars <- sds / means
+plot(coeff_vars)
+coeff_vars
+
+# View distances between points of a sample to look for patterns
+x <- train_scaled %>% as.matrix()
+d <- dist(x)
+image(as.matrix(d), col = rev(RColorBrewer::brewer.pal(9, "RdBu"))) # Change colors or Orange/Blue
+
+# Principal Component Analysis
+pca <- prcomp(train_scaled)
+pca
+summary(pca)
+
+pc <- 1:ncol(train_scaled)
+qplot(pc, pca$sdev)
+
+# Plot the first two PCs with color representing black/white
+data.frame(pca$x[,1:2], Grey=train_samp$grey) %>%
+  sample_n(200) %>%
+  ggplot(aes(PC1,PC2, fill = Grey))+
+  geom_point(cex=3, pch=21) +
+  coord_fixed(ratio = 1)
+
+# First two dimensions do NOT preserve distance very well
+#d_approx <- dist(pca$x[, 1:2])
+#qplot(d, d_approx) + geom_abline(color="red")
+
+# Clean up environment
+rm(pca, x, coeff_vars, d, means, pc, sds)