Code is mostly in a working state. Need to fix how the categorical data set is generated, and clean up and fix some of the images and graphs. Then the report can be written.

This commit is contained in:
shelldweller 2021-09-30 01:19:18 -06:00
parent 88d833e231
commit e4d3e52b12
11 changed files with 633 additions and 51 deletions

Binary_SOM.R Normal file

@@ -0,0 +1,123 @@
# Install kohonen package if needed
if(!require(kohonen)) install.packages("kohonen")
# Load kohonen library
library(kohonen)
# Install parallel package if needed
if(!require(parallel)) install.packages("parallel")
# Load parallel library
library(parallel)
# Keep only numeric columns, ignoring dates and looped.
train_num <- train_set %>% select(length, weight, count, neighbors, income)
# SOM function can only work on matrices
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_set$grey %>% classvec2classmat()
# Same for test set
test_grey <- test_set$grey %>% classvec2classmat()
# Create Data list for supervised SOM
#
train_list <- list(independent = train_mat, dependent = train_grey)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
grid_size
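# For intuition on method 1 (hypothetical row count, not taken from the data):
# with about 100,000 training rows the heuristic suggests a 40x40 map.
round(sqrt(5*sqrt(1e5)))   # = 40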
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
set.seed(5)
## Now build the model.
som_model <- xyf(train_mat, train_grey,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training
keep.data = TRUE
)
# Visualize clusters
plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
# Distance map
plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
# Visualize counts
plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
# Visualize fan diagram
plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 1
plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 2
plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 3
plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 4
plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 5
plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA
test_list <- list(independent = test_mat, dependent = test_grey)
ransomware.prediction <- predict(som_model, newdata = test_list)
table(test_set$grey, ransomware.prediction$prediction[[2]])
# Confusion Matrix
cm_grey <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$grey)
cm_grey$overall["Accuracy"]
cm_grey
# Now test predictions of validation set
# Switching to supervised SOMs
valid_num <- validation %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
valid_grey <- validation$grey
valid_list <- list(independent = valid_mat, dependent = valid_grey)
# Requires up to 16GB of RAM, skip if resources are limited
ransomware.prediction.validation <- predict(som_model, newdata = valid_list)
table(validation$grey, ransomware.prediction.validation$prediction[[2]])
# Confusion Matrix
cm_grey.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$grey)
cm_grey.validation$overall["Accuracy"]
cm_grey.validation
# Clean up environment
rm(dest_file, url, temp)

Categorical_SOM.R Normal file

@@ -0,0 +1,176 @@
# Try categorical SOMs on black-only addresses....
#!! This is NOT right, is it?
#!! It would be even MORE impressive if I removed all the PREDICTED whites from
#!! the test set instead and started there.
blacks <- ransomware %>% filter(!label=="white")
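# Sketch of the alternative noted above (hypothetical; assumes Binary_SOM.R has
# been run so that ransomware.prediction exists): keep only the addresses that
# the binary SOM predicted as black, rather than the truly labeled blacks.
#predicted_blacks <- test_set[ransomware.prediction$prediction[[2]] == "black", ]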
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
set.seed(5)
test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
workset_blacks <- blacks[-test_index,]
temp <- blacks[test_index,]
# Make sure addresses in validation set are also in working set...
# validation <- temp %>%
# semi_join(workset, by = "address")
# Add rows removed from validation set back into working set...
#removed <- anti_join(temp, validation)
#workset <- rbind(workset, removed)
# ... Or not
validation_blacks <- temp
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes (label)
set.seed(5)
test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
train_set <- workset_blacks[-test_index,]
temp <- workset_blacks[test_index,]
# Make sure addresses in validation set are also in working set....
#test_set <- temp %>%
# semi_join(train_set, by = "address")
# Add rows removed from validation set back into working set....
#removed <- anti_join(temp, test_set)
#train_set <- rbind(train_set, removed)
# ....Or not
test_set <- temp
##!! Data preparation is done, now focusing on Self Organizing Maps as our method
##!! Start here after reworking the data prep steps above.
# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
train_num <- train_set %>% select(length, weight, count, neighbors, income)
# SOM function can only work on matrices
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Categorical
train_label <- train_set$label %>% classvec2classmat()
# Same for test set
test_label <- test_set$label %>% classvec2classmat()
# Create Data list for supervised SOM
#
train_list <- list(independent = train_mat, dependent = train_label)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
grid_size
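# For intuition on method 2 (hypothetical class count, not taken from the data):
# with, say, 28 distinct ransomware families the map would be 6x6.
ceiling(sqrt(28))   # = 6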
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
set.seed(5)
## Now build the model.
som_model2 <- xyf(train_mat, train_label,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
cores = detectCores(), # detectCores() - 1 if system locks during calculation
keep.data = TRUE
)
# Visualize clusters
plot(som_model2, type = 'mapping', pch = 19, palette.name = topo.colors)
# Distance map
plot(som_model2, type = 'quality', pch = 19, palette.name = topo.colors)
# Visualize counts
plot(som_model2, type = 'counts', pch = 19, palette.name = topo.colors)
# Visualize fan diagram
plot(som_model2, type = 'codes', pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 1
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 2
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 3
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 4
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 5
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# Now test predictions of test set
# https://clarkdatalabs.github.io/soms/SOM_NBA
test_list <- list(independent = test_mat, dependent = test_label)
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
table(test_set$label, ransomware_group.prediction$prediction[[2]])
# Confusion Matrix
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
cm_labels$overall["Accuracy"]
cm_labels
# Now test predictions of validation set
# Switching to supervised SOMs
valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
valid_label <- validation_blacks$label
valid_list <- list(independent = valid_mat, dependent = valid_label)
ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
# Confusion Matrix
cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
cm_labels.validation$overall["Accuracy"]
cm_labels.validation
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
n_groups <- length(unique(ransomware$label)) - 1
n_groups
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
plot(som_model2,
main = 'K-Means Clustering',
type = "property",
property = som.cluster$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)
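# Sketch: cross-tabulate each training address's k-means cluster (looked up via
# its best-matching unit) against its known family; relies on keep.data = TRUE.
table(train_set$label, som.cluster$cluster[som_model2$unit.classif])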
# Clean up environment
rm(grid_size, blacks, test_list, valid_list, temp, som.cluster)


@@ -3,9 +3,9 @@
#To solve it we use the within-clusters sum of squares method:
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
wss <- (nrow(som.model$codes)-1)*sum(apply(data.frame(som.model$codes),2,var))
wss <- (nrow(som_model$codes[[1]])-1)*sum(apply(data.frame(som_model$codes[[1]]),2,var))
for (i in 2:12) {
wss[i] <- sum(kmeans(data.frame(som.model$codes), centers=i)$withinss)
wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers=i)$withinss)
}
plot(wss,
@@ -25,17 +25,17 @@ library(factoextra)
library(NbClust)
# Elbow method
fviz_nbclust(data.frame(som.model$codes), kmeans, method = "wss") +
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) + # add line for better visualisation
labs(subtitle = "Elbow method") # add subtitle
# Silhouette method
fviz_nbclust(data.frame(som.model$codes), kmeans, method = "silhouette") +
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")
# Gap statistic
set.seed(5)
fviz_nbclust(data.frame(som.model$codes), kmeans,
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans,
nstart = 25,
method = "gap_stat",
nboot = 500 # reduce it for lower computation time (but less precise results)
@@ -43,7 +43,7 @@ fviz_nbclust(data.frame(som.model$codes), kmeans,
labs(subtitle = "Gap statistic method")
nbclust_out <- NbClust(
data = data.frame(som.model$codes),
data = data.frame(som_model$codes[[1]]),
distance = "euclidean",
min.nc = 2, # minimum number of clusters
max.nc = 12, # maximum number of clusters
@@ -80,3 +80,61 @@ ggplot(nbclust_plot) +
# k <- nbclust_plot$clusters[which.max(nbclust_plot$???)]
k <- as.numeric(names(table(nbclust_plot))[which.max(table(nbclust_plot))])
k
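# Illustration of the mode-picking line above with toy values (not actual
# NbClust output): recommendations of c(3, 3, 4, 2, 3) would give k = 3.
as.numeric(names(which.max(table(c(3, 3, 4, 2, 3)))))   # = 3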
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers=k)
plot(som_model,
main = '',
type = "property",
property = som.cluster$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster$cluster)
# Hierarchical Clustering
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
som.cluster <- hclust(dist(data.frame(som_model$codes[[1]])), method="single")
# Cut the dendrogram into k clusters so each SOM unit gets a cluster assignment
som.cluster.k <- cutree(som.cluster, k)
plot(som_model,
main = '',
type = "property",
property = som.cluster.k,
palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster.k)
# Hierarchical K-Means Clustering: Optimize Clusters
# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/
# Install factoextra package if needed
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
# Load factoextra library
library(factoextra)
res.hk <-hkmeans(data.frame(som_model$codes[[1]]), k)
# Elements returned by hkmeans()
names(res.hk)
# Print the results
str(res.hk)
# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)
# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE,
ggtheme = theme_classic())
# Use the hkmeans cluster assignments (one per SOM unit) as the plotted property
plot(som_model,
main = '',
type = "property",
property = res.hk$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model, res.hk$cluster)

Clusters_2.R Normal file

@@ -0,0 +1,32 @@
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
n_groups <- length(unique(ransomware$label)) - 1
n_groups
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
plot(som_model2,
main = '',
type = "property",
property = som.cluster$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)
# Hierarchical Clustering
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
som.cluster <- hclust(dist(data.frame(som_model2$codes[[1]])), method="mcquitty")
# Cut the dendrogram into n_groups clusters so each SOM unit gets an assignment
som.cluster.k <- cutree(som.cluster, n_groups)
plot(som_model2,
main = '',
type = "property",
property = som.cluster.k,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster.k)
# Clean up environment
#rm(som.cluster)

Data_Prep.R Normal file

@@ -0,0 +1,38 @@
# Install necessary packages
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
# Load Libraries
library(tidyverse)
library(caret)
# Download data
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if(!dir.exists("data"))dir.create("data")
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
# Unzip
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")
# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
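# Quick look at the new binary factor (actual counts depend on the downloaded data)
table(ransomware$grey)
prop.table(table(ransomware$grey))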
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
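# Net effect of the two 50/50 splits (approximate, since createDataPartition
# balances classes): validation ~50% of rows, train_set and test_set ~25% each.
nrow(train_set) / nrow(ransomware)
nrow(test_set) / nrow(ransomware)
nrow(validation) / nrow(ransomware)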
# Clean up environment
rm(dest_file, url)


@@ -1,8 +1,7 @@
## Calculate predictions and accuracy, etc.
## Treat as binary first, then maybe switch to categorical?
# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA
# Binary outputs, black=ransomware, white=non-ransomware
train_grey <- train_set$grey
test_list <- list(independent = test_mat, dependent = test_grey)
# Categorical outputs, for each class of ransomware
train_label <- train_set$label
ransomware.prediction <- predict(som_model, newdata = test_list)
table(test_set$grey, ransomware.prediction$prediction[[2]])

RanFor.R Normal file

@@ -0,0 +1,55 @@
# Try Random Forest as Method #1, as suggested by original paper.
# Install randomForest package if needed
if(!require(randomForest)) install.packages("randomForest")
library(randomForest)
# Keep only numeric columns with highest coefficients of variation for dimension reduction
train_num <- train_samp %>% select(neighbors, income)
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_samp$grey
#Sample every nth row due to memory constraints
set.seed(5)
test_samp <- test_set[seq(1, nrow(test_set), 100), ]
# Dimension reduction again
test_num <- test_samp %>% select(neighbors, income)
# Same for test set
test_grey <- test_samp$grey
# 10-fold cross-validation; lower the number of folds if runtime is an issue
control <- trainControl(method="cv", number = 10)
# mtry cannot exceed the number of predictors (only two columns are kept above)
grid <- data.frame(mtry = seq(1, ncol(train_num)))
# Train Random Forests model
rf_model <- train(train_num, train_grey, method="rf", trControl = control, tuneGrid=grid)
# Check for best tuning parameters
ggplot(rf_model)
rf_model$bestTune
# Fit model
# Fit on the numeric predictors used for tuning (randomForest cannot handle the high-cardinality address factor)
fit_rf <- randomForest(train_num, train_grey,
                       mtry = rf_model$bestTune$mtry)
# Check for enough trees
plot(fit_rf)
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_samp)
cm <- confusionMatrix(y_hat_rf, test_grey)
cm$overall["Accuracy"]
cm
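# Optional sketch: check which of the two predictors the forest leans on
# (importance values depend on the fitted model).
importance(fit_rf)
varImpPlot(fit_rf)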
# Measure accuracy of model against full validation set
y_hat_rf <- predict(fit_rf, validation)
cm <- confusionMatrix(y_hat_rf, validation$grey)
cm$overall["Accuracy"]
cm
# Clean up environment
rm(cm, control, fit_rf, grid, y_hat_rf)


@@ -1,6 +1,6 @@
# Install necessary packages
if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
# Load Libraries
library(tidyverse)
@@ -19,16 +19,18 @@ if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.
ransomware <- read_csv("data/BitcoinHeistData.csv")
# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
ransomware <- ransomware %>% mutate(label=as.factor(label),
grey=as.factor(ifelse(label=="white", "white", "black")),
address=as.factor(address))
# Sample every other row (keeping it simple for now)
#ransomware <- ransomware[seq(1, nrow(ransomware), 2), ]
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
#test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE)
#test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
temp <- ransomware[test_index,]
@@ -37,18 +39,18 @@ temp <- ransomware[test_index,]
# validation <- temp %>%
# semi_join(workset, by = "address")
# ... Or not
validation <- temp
# Add rows removed from validation set back into working set...
#removed <- anti_join(temp, validation)
#workset <- rbind(workset, removed)
# ... Or not
validation <- temp
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
#test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
temp <- workset[test_index,]
@@ -65,13 +67,13 @@ temp <- workset[test_index,]
test_set <- temp
# Clean up environment
rm(dest_file, url, temp, ransomware)
rm(dest_file, url, temp)
# Inspect data frames
test_set %>% str()
test_set %>% head()
train_set %>% str()
train_set %>% head()
#test_set %>% str()
#test_set %>% head()
#train_set %>% str()
#train_set %>% head()
## Data preparation is done, now focusing on Self Organizing Maps as our method


@@ -153,7 +153,7 @@ ransomware <- read_csv("data/BitcoinHeistData.csv")
## Motivation
There is much interest in tracing tranactions on the Bitcoin blockchain. One reason is to identify possible ransonware wallet addresses before they are used. See references for further motivation.
There is much interest in tracing transactions on the Bitcoin blockchain. One reason is to identify possible ransomware wallet addresses before they are used. See references for further motivation.
## Data Set & Variables
@@ -167,7 +167,7 @@ Refer to topological descriptions of variables using DAG and reference paper.
# Data Exploration & Visualization
Self-organizing maps? kohonen package
Self-organizing maps. kohonen package
# Insights


@@ -1,73 +1,98 @@
# Install kohonen package if needed
if(!require(kohonen)) install.packages("kohonen", repos = "http://cran.us.r-project.org")
if(!require(kohonen)) install.packages("kohonen")
# Load kohonen library
library(kohonen)
# Keep only numeric columns, ignoring dates for now
train_num <- train_set %>% select(length, weight, count, looped, neighbors, income)
# Install parallel package if needed
if(!require(parallel)) install.packages("parallel")
# Load parallel library
library(parallel)
# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
train_num <- train_set %>% select(year, day, length, weight, count, looped, neighbors, income)
# SOM function can only work on matrices
train_SOM <- as.matrix(scale(train_num))
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(year, day, length, weight, count, looped, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
## Treat as binary first, then maybe switch to categorical?
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_set$grey %>% classvec2classmat()
# Same for test set
test_grey <- test_set$grey %>% classvec2classmat()
# Create Data list for supervised SOM
#
train_list <- list(independent = train_mat, dependent = train_grey)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
#grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
grid_size = length(unique(train_set$label))
#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
grid_size
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = FALSE)
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
set.seed(5)
## Now build the ESOM!
som.model <- som(train_SOM,
## Now build the model.
som_model <- xyf(train_mat, train_grey,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
cores = 4, # how to dynamically set number of cores?
cores = detectCores(), # detectCores() - 1 if system locks during calculation
keep.data = TRUE
)
# Visualize clusters
plot(som.model, type = 'mapping', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
# Distance map
plot(som.model, type = 'quality', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
# Visualize counts
plot(som.model, type = 'counts', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
# Visualize fan diagram
plot(som.model, type = 'codes', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 1
plot(som.model, type = 'property', property = getCodes(som.model)[,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 2
plot(som.model, type = 'property', property = getCodes(som.model)[,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 3
plot(som.model, type = 'property', property = getCodes(som.model)[,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 4
plot(som.model, type = 'property', property = getCodes(som.model)[,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 5
plot(som.model, type = 'property', property = getCodes(som.model)[,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 6
plot(som.model, type = 'property', property = getCodes(som.model)[,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 7
#plot(som.model, type = 'property', property = getCodes(som.model)[,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 8
#plot(som.model, type = 'property', property = getCodes(som.model)[,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors)
##Different cluster methods branch off here...

Visuals.R Normal file

@@ -0,0 +1,74 @@
# Do some graphical exploration before applying any models.
# Look at the example work for some ideas.
# Add any compelling visuals as needed here.
# ?? Cluster graphs go at the end.
# Install matrixStats package if needed
if(!require(matrixStats)) install.packages("matrixStats")
# Load matrixStats library
library(matrixStats)
## Principal Component Analysis
names(ransomware)
str(ransomware)
#Sample every nth row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# What percentage of sample is ransomware?
mean(train_samp$grey=="black")
# Keep only numeric columns
train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
# Scale the numeric columns
train_scaled <- train_num %>% scale()
# Histograms of each of the columns to show skewness
train_num$year %>% hist(main = paste("Histogram of","year"))
train_num$day %>% hist(main = paste("Histogram of","day"))
train_num$length %>% hist(main = paste("Histogram of","length"))
train_num$weight %>% hist(main = paste("Histogram of","weight"))
train_num$count %>% hist(main = paste("Histogram of","count"))
train_num$looped %>% hist(main = paste("Histogram of","looped"))
train_num$neighbors %>% hist(main = paste("Histogram of","neighbors"))
train_num$income %>% hist(main = paste("Histogram of","income"))
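# Compact alternative to the eight calls above (same histograms, written as a loop)
for (v in names(train_num)) hist(train_num[[v]], main = paste("Histogram of", v))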
# Check for variability across numerical columns using coefficients of variation
sds <- train_num %>% as.matrix() %>% colSds()
means <- train_num %>% as.matrix() %>% colMeans()
coeff_vars <- sds / means
plot(coeff_vars)
coeff_vars
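# Label and rank the coefficients of variation (ordering depends on the data);
# the highest-variability columns are the ones kept for the random forest in RanFor.R
names(coeff_vars) <- colnames(train_num)
sort(coeff_vars, decreasing = TRUE)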
# View distances between points of a sample to look for patterns
x <- train_scaled %>% as.matrix()
d <- dist(x)
image(as.matrix(d), col = rev(RColorBrewer::brewer.pal(9, "RdBu"))) # Consider a different palette, e.g. orange/blue
# Principal Component Analysis
pca <- prcomp(train_scaled)
pca
summary(pca)
pc <- 1:ncol(train_scaled)
qplot(pc, pca$sdev)
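# Cumulative share of variance explained by the first components (values depend
# on the sample; this mirrors the sdev plot above)
cumsum(pca$sdev^2) / sum(pca$sdev^2)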
# Plot the first two PCs with color representing black/white
data.frame(pca$x[,1:2], Grey=train_samp$grey) %>%
sample_n(200) %>%
ggplot(aes(PC1,PC2, fill = Grey))+
geom_point(cex=3, pch=21) +
coord_fixed(ratio = 1)
# First two dimensions do NOT preserve distance very well
#d_approx <- dist(pca$x[, 1:2])
#qplot(d, d_approx) + geom_abline(color="red")
# Clean up environment
rm(pca, x, coeff_vars, d, means, pc, sds)