# Try categorical SOMs on black-only addresses (every label except "white").
#!! This is NOT right, is it? 
#!! It would be even MORE impressive if I removed all the PREDICTED whites from 
#!! the test set instead and started there.

library(tidyverse)
library(caret)
library(randomForest)
library(kohonen)
library(parallel)
library(matrixStats)

# Restrict the data to "black" addresses: every row whose label is not "white".
# NOTE(review): if `label` is a factor, the unused "white" level presumably
# survives this filter and flows into the partitions below - confirm whether
# droplevels() is wanted here.
blacks <- dplyr::filter(ransomware, label != "white")

# Hold out 50% of the black addresses as the validation set, partitioning on
# the categorical label (reduce the share later if possible).
set.seed(5)
test_index <- createDataPartition(
  y = blacks$label,
  times = 1,
  p = 0.5,
  list = FALSE
)

# Rows outside the partition form the working set; rows inside it are held out.
workset_blacks <- blacks[-test_index, ]
temp <- blacks[test_index, ]

# Disabled alternative: keep only the held-out rows whose address also occurs
# in the working set, and hand the dropped rows back to the working set.
# validation <- temp %>%
#   semi_join(workset, by = "address")
# removed <- anti_join(temp, validation)
# workset <- rbind(workset, removed)

# As it stands, the whole held-out half is used as the validation set.
validation_blacks <- temp

# Halve the working set into a training set and a test set (reduce the test
# share later if possible). The partition is made on the ransomware label of
# the black-only working set, so the outcome here is categorical, not binary.
set.seed(5)
test_index <- createDataPartition(
  y = workset_blacks$label,
  times = 1,
  p = 0.5,
  list = FALSE
)

# Disabled variant that partitioned an object named `workset` instead:
# test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)

# Rows outside the partition train the model; rows inside it are held back.
train_set <- workset_blacks[-test_index, ]
temp <- workset_blacks[test_index, ]

# Disabled alternative: keep only the held-back rows whose address also occurs
# in the training set, and hand the dropped rows back to the training set.
# test_set <- temp %>%
#   semi_join(train_set, by = "address")
# removed <- anti_join(temp, test_set)
# train_set <- rbind(train_set, removed)

# As it stands, the whole held-back half is used as the test set.
test_set <- temp

##!! Data preparation is done, now focusing on Self Organizing Maps as our method
##!! Start here after reworking the data prep steps above.

# Feature selection: keep five numeric columns only. Dates and `looped` are
# ignored for now (insert factor analysis impVar here?).
train_num <- dplyr::select(train_set, length, weight, count, neighbors, income)

# The SOM functions work on matrices, so centre/scale the training features
# and store them as a matrix. scale() records the centring and scaling values
# as attributes, which are reused for the test set below.
train_mat <- train_num %>%
  scale() %>%
  as.matrix()

# Same five features for the test set (supervised SOM).
test_num <- dplyr::select(test_set, length, weight, count, neighbors, income)

# The test features must be rescaled with the training set's centre and scale,
# not with their own.
test_mat <- as.matrix(
  scale(
    test_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# Categorical outcome: turn the label vectors into class-membership matrices.
train_label <- classvec2classmat(train_set$label)
test_label <- classvec2classmat(test_set$label)

# Training data bundled as a two-layer list for the supervised SOM.
train_list <- list(independent = train_mat, dependent = train_label)

# Choose the ideal grid size, following:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps

# Method 1 (formula): about 5 * sqrt(n) nodes in total for n training rows,
# so the side of a square grid is the square root of that, rounded.
grid_size <- round(sqrt(5 * sqrt(nrow(train_set))))

# Method 2 (disabled): derive the side from the number of distinct labels.
# grid_size <- ceiling(sqrt(length(unique(ransomware$label))))

# Show the chosen side length.
grid_size

# Square, hexagonal, toroidal grid of grid_size x grid_size units.
train_grid <- somgrid(
  xdim = grid_size,
  ydim = grid_size,
  topo = "hexagonal",
  toroidal = TRUE
)

# Fix the RNG seed so the model fit is reproducible.
set.seed(5)

# Fit the supervised SOM with xyf(): the scaled features are the first layer
# and the class matrix of labels is the second.
som_model2 <- xyf(
  train_mat,
  train_label,
  grid = train_grid,
  rlen = 100,
  mode = "pbatch",        # or: alpha = c(0.05,0.01),
  cores = detectCores(),  # detectCores() - 1 if system locks during calculation
  keep.data = TRUE
)

# Draw the four standard diagnostic plots of the fitted SOM, in this order:
#   mapping - where the training observations land on the grid (clusters)
#   quality - distance map
#   counts  - number of observations per unit
#   codes   - fan diagram of the codebook vectors
walk(
  c("mapping", "quality", "counts", "codes"),
  function(plot_type) {
    plot(som_model2, type = plot_type, pch = 19, palette.name = topo.colors)
  }
)

# One heatmap per input variable (variables 1 to 5): colour each unit by its
# codebook value for that variable in the first layer, titled with the
# matching column name of the training features.
walk(
  seq_len(ncol(train_num)),
  function(var_idx) {
    plot(
      som_model2,
      type = "property",
      property = som_model2$codes[[1]][, var_idx],
      main = colnames(train_num)[var_idx],
      pch = 19,
      palette.name = topo.colors
    )
  }
)

# Score the test set with the trained SOM.
# Reference: https://clarkdatalabs.github.io/soms/SOM_NBA

# Test data bundled in the same two-layer shape as the training list.
# NOTE(review): newdata carries the true labels as the `dependent` layer.
# Confirm that predict() does not use that layer when mapping observations
# to units (e.g. by restricting it with `whatmap`), otherwise the accuracy
# below is optimistic.
test_list <- list(independent = test_mat, dependent = test_label)

ransomware_group.prediction <- predict(som_model2, newdata = test_list)

# Cross-tabulate true labels (rows) against predicted labels (columns).
table(test_set$label, ransomware_group.prediction$prediction[[2]])

# Confusion matrix: predictions as `data`, true labels as `reference`.
cm_labels <- confusionMatrix(
  data = ransomware_group.prediction$prediction[[2]],
  reference = test_set$label
)
cm_labels$overall["Accuracy"]
cm_labels

# Score the held-out validation set with the trained SOM.

# Same five numeric features as the training and test sets.
valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)

# The validation features must be rescaled with the training set's centre and
# scale, not with their own.
valid_mat <- as.matrix(
  scale(
    valid_num,
    center = attr(train_mat, "scaled:center"),
    scale = attr(train_mat, "scaled:scale")
  )
)

# True labels, kept as the plain label vector.
valid_label <- validation_blacks$label

# Bundle the validation data in the same shape as the test list: the dependent
# layer is encoded as a class matrix with classvec2classmat(), exactly as is
# done for the training and test labels, instead of passing the raw labels.
# NOTE(review): as with the test set, newdata carries the true labels; confirm
# predict() does not use that layer when mapping observations to units.
valid_list <- list(
  independent = valid_mat,
  dependent = classvec2classmat(valid_label)
)

ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)

# Cross-tabulate true labels (rows) against predicted labels (columns).
table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])

# Confusion matrix: predictions as `data`, true labels as `reference`.
cm_labels.validation <- confusionMatrix(
  data = ransomware_group.prediction.validation$prediction[[2]],
  reference = validation_blacks$label
)
cm_labels.validation$overall["Accuracy"]
cm_labels.validation

# Number of clusters = number of known ransomware groups, i.e. the distinct
# labels other than "white". Removing "white" by name (rather than subtracting
# one) stays correct even if "white" is not present in the data.
n_groups <- length(setdiff(as.character(unique(ransomware$label)), "white"))
n_groups

# K-Means Clustering of the SOM units
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/

# kmeans() picks its starting centres at random, so seed it like every other
# random step in this script; otherwise the clusters depend on whatever RNG
# state is left over from earlier code.
set.seed(5)

# Cluster the codebook vectors of the first (feature) layer; kmeans() accepts
# the numeric matrix directly.
som.cluster <- kmeans(som_model2$codes[[1]], centers = n_groups)

# Colour each SOM unit by its k-means cluster and outline the clusters.
plot(
  som_model2,
  main = "K-Means Clustering",
  type = "property",
  property = som.cluster$cluster,
  palette.name = topo.colors
)
add.cluster.boundaries(som_model2, som.cluster$cluster)

# Drop the intermediate objects that are no longer needed. The data splits,
# feature matrices, model, predictions and confusion matrices are kept.
rm(list = c(
  "grid_size",
  "blacks",
  "test_list",
  "valid_list",
  "temp",
  "som.cluster"
))