Code is mostly in a working state. Need to fix how the categorical data set is generated, and clean up and fix some of the images and graphs. Then the report can be written.

This commit is contained in:
shelldweller 2021-09-30 01:19:18 -06:00
parent 88d833e231
commit e4d3e52b12
11 changed files with 633 additions and 51 deletions

Binary_SOM.R Normal file

@@ -0,0 +1,123 @@
# Install kohonen package if needed
if(!require(kohonen)) install.packages("kohonen")
# Load kohonen library
library(kohonen)
# Install parallel package if needed
if(!require(parallel)) install.packages("parallel")
# Load parallel library
library(parallel)
# Keep only numeric columns, ignoring dates and looped.
train_num <- train_set %>% select(length, weight, count, neighbors, income)
# SOM function can only work on matrices
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_set$grey %>% classvec2classmat()
# Same for test set
test_grey <- test_set$grey %>% classvec2classmat()
# Create Data list for supervised SOM
#
train_list <- list(independent = train_mat, dependent = train_grey)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
grid_size
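# For intuition on method 1 (hypothetical row count, not taken from the data):
# with about 100,000 training rows the heuristic suggests a 40x40 map.
round(sqrt(5*sqrt(1e5)))   # = 40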
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
set.seed(5)
## Now build the model.
som_model <- xyf(train_mat, train_grey,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training
keep.data = TRUE
)
# Visualize clusters
plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
# Distance map
plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
# Visualize counts
plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
# Visualize fan diagram
plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 1
plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 2
plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 3
plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 4
plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 5
plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA
test_list <- list(independent = test_mat, dependent = test_grey)
ransomware.prediction <- predict(som_model, newdata = test_list)
table(test_set$grey, ransomware.prediction$prediction[[2]])
# Confusion Matrix
cm_grey <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$grey)
cm_grey$overall["Accuracy"]
cm_grey
# Now test predictions of validation set
# Switching to supervised SOMs
valid_num <- validation %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
valid_grey <- validation$grey
valid_list <- list(independent = valid_mat, dependent = valid_grey)
# Requires up to 16GB of RAM, skip if resources are limited
ransomware.prediction.validation <- predict(som_model, newdata = valid_list)
table(validation$grey, ransomware.prediction.validation$prediction[[2]])
# Confusion Matrix
cm_grey.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$grey)
cm_grey.validation$overall["Accuracy"]
cm_grey.validation
# Clean up environment
rm(dest_file, url, temp)

Categorical_SOM.R Normal file

@@ -0,0 +1,176 @@
# Try categorical SOMs on black-only addresses....
#!! This is NOT right, is it?
#!! It would be even MORE impressive if I removed all the PREDICTED whites from
#!! the test set instead and started there.
blacks <- ransomware %>% filter(!label=="white")
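# Sketch of the alternative noted above (hypothetical; assumes Binary_SOM.R has
# been run so that ransomware.prediction exists): keep only the addresses that
# the binary SOM predicted as black, rather than the truly labeled blacks.
#predicted_blacks <- test_set[ransomware.prediction$prediction[[2]] == "black", ]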
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
set.seed(5)
test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
workset_blacks <- blacks[-test_index,]
temp <- blacks[test_index,]
# Make sure addresses in validation set are also in working set...
# validation <- temp %>%
# semi_join(workset, by = "address")
# Add rows removed from validation set back into working set...
#removed <- anti_join(temp, validation)
#workset <- rbind(workset, removed)
# ... Or not
validation_blacks <- temp
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes (label)
set.seed(5)
test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
train_set <- workset_blacks[-test_index,]
temp <- workset_blacks[test_index,]
# Make sure addresses in validation set are also in working set....
#test_set <- temp %>%
# semi_join(train_set, by = "address")
# Add rows removed from validation set back into working set....
#removed <- anti_join(temp, test_set)
#train_set <- rbind(train_set, removed)
# ....Or not
test_set <- temp
##!! Data preparation is done, now focusing on Self Organizing Maps as our method
##!! Start here after reworking the data prep steps above.
# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
train_num <- train_set %>% select(length, weight, count, neighbors, income)
# SOM function can only work on matrices
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Categorical
train_label <- train_set$label %>% classvec2classmat()
# Same for test set
test_label <- test_set$label %>% classvec2classmat()
# Create Data list for supervised SOM
#
train_list <- list(independent = train_mat, dependent = train_label)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
grid_size
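# For intuition on method 2 (hypothetical class count, not taken from the data):
# with, say, 28 distinct ransomware families the map would be 6x6.
ceiling(sqrt(28))   # = 6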
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
set.seed(5)
## Now build the model.
som_model2 <- xyf(train_mat, train_label,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
cores = detectCores(), # detectCores() - 1 if system locks during calculation
keep.data = TRUE
)
# Visualize clusters
plot(som_model2, type = 'mapping', pch = 19, palette.name = topo.colors)
# Distance map
plot(som_model2, type = 'quality', pch = 19, palette.name = topo.colors)
# Visualize counts
plot(som_model2, type = 'counts', pch = 19, palette.name = topo.colors)
# Visualize fan diagram
plot(som_model2, type = 'codes', pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 1
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 2
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 3
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 4
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 5
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# Now test predictions of test set
# https://clarkdatalabs.github.io/soms/SOM_NBA
test_list <- list(independent = test_mat, dependent = test_label)
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
table(test_set$label, ransomware_group.prediction$prediction[[2]])
# Confusion Matrix
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
cm_labels$overall["Accuracy"]
cm_labels
# Now test predictions of validation set
# Switching to supervised SOMs
valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
valid_label <- validation_blacks$label
valid_list <- list(independent = valid_mat, dependent = valid_label)
ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
# Confusion Matrix
cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
cm_labels.validation$overall["Accuracy"]
cm_labels.validation
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
n_groups <- length(unique(ransomware$label)) - 1
n_groups
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
plot(som_model2,
main = 'K-Means Clustering',
type = "property",
property = som.cluster$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)
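# Sketch: cross-tabulate each training address's k-means cluster (looked up via
# its best-matching unit) against its known family; relies on keep.data = TRUE.
table(train_set$label, som.cluster$cluster[som_model2$unit.classif])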
# Clean up environment
rm(grid_size, blacks, test_list, valid_list, temp, som.cluster)


@@ -3,9 +3,9 @@
#To solve it we use the within-clusters sum of squares method:
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
wss <- (nrow(som.model$codes)-1)*sum(apply(data.frame(som.model$codes),2,var))
wss <- (nrow(som_model$codes[[1]])-1)*sum(apply(data.frame(som_model$codes[[1]]),2,var))
for (i in 2:12) {
wss[i] <- sum(kmeans(data.frame(som.model$codes), centers=i)$withinss)
wss[i] <- sum(kmeans(data.frame(som_model$codes[[1]]), centers=i)$withinss)
}
plot(wss,
@@ -25,17 +25,17 @@ library(factoextra)
library(NbClust)
# Elbow method
fviz_nbclust(data.frame(som.model$codes), kmeans, method = "wss") +
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) + # add line for better visualisation
labs(subtitle = "Elbow method") # add subtitle
# Silhouette method
fviz_nbclust(data.frame(som.model$codes), kmeans, method = "silhouette") +
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")
# Gap statistic
set.seed(5)
fviz_nbclust(data.frame(som.model$codes), kmeans,
fviz_nbclust(data.frame(som_model$codes[[1]]), kmeans,
nstart = 25,
method = "gap_stat",
nboot = 500 # reduce it for lower computation time (but less precise results)
@@ -43,7 +43,7 @@ fviz_nbclust(data.frame(som.model$codes), kmeans,
labs(subtitle = "Gap statistic method")
nbclust_out <- NbClust(
data = data.frame(som.model$codes),
data = data.frame(som_model$codes[[1]]),
distance = "euclidean",
min.nc = 2, # minimum number of clusters
max.nc = 12, # maximum number of clusters
@@ -80,3 +80,61 @@ ggplot(nbclust_plot) +
# k <- nbclust_plot$clusters[which.max(nbclust_plot$???)]
k <- as.numeric(names(table(nbclust_plot))[which.max(table(nbclust_plot))])
k
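# Illustration of the mode-picking line above with toy values (not actual
# NbClust output): recommendations of c(3, 3, 4, 2, 3) would give k = 3.
as.numeric(names(which.max(table(c(3, 3, 4, 2, 3)))))   # = 3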
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model$codes[[1]]), centers=k)
plot(som_model,
main = '',
type = "property",
property = som.cluster$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster$cluster)
# Hierarchical Clustering
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
som.cluster <- hclust(dist(data.frame(som_model$codes[[1]])), method="single")
# Cut the dendrogram into k clusters so each SOM unit gets a cluster assignment
som.cluster.k <- cutree(som.cluster, k)
plot(som_model,
main = '',
type = "property",
property = som.cluster.k,
palette.name = topo.colors)
add.cluster.boundaries(som_model, som.cluster.k)
# Hierarchical K-Means Clustering: Optimize Clusters
# https://www.datanovia.com/en/lessons/hierarchical-k-means-clustering-optimize-clusters/
# Install factoextra package if needed
if(!require(factoextra)) install.packages("factoextra", repos = "http://cran.us.r-project.org")
# Load factoextra library
library(factoextra)
res.hk <-hkmeans(data.frame(som_model$codes[[1]]), k)
# Elements returned by hkmeans()
names(res.hk)
# Print the results
str(res.hk)
# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)
# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE,
ggtheme = theme_classic())
# Use the hkmeans cluster assignments (one per SOM unit) as the plotted property
plot(som_model,
main = '',
type = "property",
property = res.hk$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model, res.hk$cluster)

Clusters_2.R Normal file

@@ -0,0 +1,32 @@
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
n_groups <- length(unique(ransomware$label)) - 1
n_groups
# K-Means Clustering
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
plot(som_model2,
main = '',
type = "property",
property = som.cluster$cluster,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster$cluster)
# Hierarchical Clustering
# https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
som.cluster <- hclust(dist(data.frame(som_model2$codes[[1]])), method="mcquitty")
# Cut the dendrogram into n_groups clusters so each SOM unit gets an assignment
som.cluster.k <- cutree(som.cluster, n_groups)
plot(som_model2,
main = '',
type = "property",
property = som.cluster.k,
palette.name = topo.colors)
add.cluster.boundaries(som_model2, som.cluster.k)
# Clean up environment
#rm(som.cluster)

Data_Prep.R Normal file

@@ -0,0 +1,38 @@
# Install necessary packages
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
# Load Libraries
library(tidyverse)
library(caret)
# Download data
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if(!dir.exists("data"))dir.create("data")
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
# Unzip
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")
# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
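# Quick look at the new binary factor (actual counts depend on the downloaded data)
table(ransomware$grey)
prop.table(table(ransomware$grey))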
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
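# Net effect of the two 50/50 splits (approximate, since createDataPartition
# balances classes): validation ~50% of rows, train_set and test_set ~25% each.
nrow(train_set) / nrow(ransomware)
nrow(test_set) / nrow(ransomware)
nrow(validation) / nrow(ransomware)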
# Clean up environment
rm(dest_file, url)


@@ -1,8 +1,7 @@
## Calculate predictions and accuracy, etc.
## Treat as binary first, then maybe switch to categorical?
# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA
# Binary outputs, black=ransomware, white=non-ransomware
train_grey <- train_set$grey
test_list <- list(independent = test_mat, dependent = test_grey)
# Categorical outputs, for each class of ransomware
train_label <- train_set$label
ransomware.prediction <- predict(som_model, newdata = test_list)
table(test_set$grey, ransomware.prediction$prediction[[2]])

RanFor.R Normal file

@@ -0,0 +1,55 @@
# Try Random Forest as Method #1, as suggested by original paper.
# Install randomForest package if needed
if(!require(randomForest)) install.packages("randomForest")
library(randomForest)
# Keep only numeric columns with highest coefficients of variation for dimension reduction
train_num <- train_samp %>% select(neighbors, income)
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_samp$grey
#Sample every nth row due to memory constraints
set.seed(5)
test_samp <- test_set[seq(1, nrow(test_set), 100), ]
# Dimension reduction again
test_num <- test_samp %>% select(neighbors, income)
# Same for test set
test_grey <- test_samp$grey
# 10-fold cross-validation; lower the number of folds if runtime is an issue
control <- trainControl(method="cv", number = 10)
# mtry cannot exceed the number of predictors (only two columns are kept above)
grid <- data.frame(mtry = seq(1, ncol(train_num)))
# Train Random Forests model
rf_model <- train(train_num, train_grey, method="rf", trControl = control, tuneGrid=grid)
# Check for best tuning parameters
ggplot(rf_model)
rf_model$bestTune
# Fit model
# Fit on the numeric predictors used for tuning (randomForest cannot handle the high-cardinality address factor)
fit_rf <- randomForest(train_num, train_grey,
                       mtry = rf_model$bestTune$mtry)
# Check for enough trees
plot(fit_rf)
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_samp)
cm <- confusionMatrix(y_hat_rf, test_grey)
cm$overall["Accuracy"]
cm
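# Optional sketch: check which of the two predictors the forest leans on
# (importance values depend on the fitted model).
importance(fit_rf)
varImpPlot(fit_rf)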
# Measure accuracy of model against full validation set
y_hat_rf <- predict(fit_rf, validation)
cm <- confusionMatrix(y_hat_rf, validation$grey)
cm$overall["Accuracy"]
cm
# Clean up environment
rm(cm, control, fit_rf, grid, y_hat_rf)


@@ -1,6 +1,6 @@
# Install necessary packages
if(!require(tidyverse)) install.packages("tidyverse", repos = "http://cran.us.r-project.org")
if(!require(caret)) install.packages("caret", repos = "http://cran.us.r-project.org")
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")
# Load Libraries
library(tidyverse)
@@ -19,16 +19,18 @@ if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.
ransomware <- read_csv("data/BitcoinHeistData.csv")
# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
ransomware <- ransomware %>% mutate(label=as.factor(label),
grey=as.factor(ifelse(label=="white", "white", "black")),
address=as.factor(address))
# Sample every other row (keeping it simple for now)
#ransomware <- ransomware[seq(1, nrow(ransomware), 2), ]
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
#test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE)
#test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
temp <- ransomware[test_index,]
@@ -37,18 +39,18 @@ temp <- ransomware[test_index,]
# validation <- temp %>%
# semi_join(workset, by = "address")
# ... Or not
validation <- temp
# Add rows removed from validation set back into working set...
#removed <- anti_join(temp, validation)
#workset <- rbind(workset, removed)
# ... Or not
validation <- temp
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
#test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
temp <- workset[test_index,]
@@ -65,13 +67,13 @@ temp <- workset[test_index,]
test_set <- temp
# Clean up environment
rm(dest_file, url, temp, ransomware)
rm(dest_file, url, temp)
# Inspect data frames
test_set %>% str()
test_set %>% head()
train_set %>% str()
train_set %>% head()
#test_set %>% str()
#test_set %>% head()
#train_set %>% str()
#train_set %>% head()
## Data preparation is done, now focusing on Self Organizing Maps as our method


@@ -153,7 +153,7 @@ ransomware <- read_csv("data/BitcoinHeistData.csv")
## Motivation
There is much interest in tracing tranactions on the Bitcoin blockchain. One reason is to identify possible ransonware wallet addresses before they are used. See references for further motivation.
There is much interest in tracing transactions on the Bitcoin blockchain. One reason is to identify possible ransomware wallet addresses before they are used. See references for further motivation.
## Data Set & Variables
@@ -167,7 +167,7 @@ Refer to topological descriptions of variables using DAG and reference paper.
# Data Exploration & Visualization
Self-organizing maps? kohonen package
Self-organizing maps. kohonen package
# Insights


@@ -1,73 +1,98 @@
# Install kohonen package if needed
if(!require(kohonen)) install.packages("kohonen", repos = "http://cran.us.r-project.org")
if(!require(kohonen)) install.packages("kohonen")
# Load kohonen library
library(kohonen)
# Keep only numeric columns, ignoring dates for now
train_num <- train_set %>% select(length, weight, count, looped, neighbors, income)
# Install parallel package if needed
if(!require(parallel)) install.packages("parallel")
# Load parallel library
library(parallel)
# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
train_num <- train_set %>% select(year, day, length, weight, count, looped, neighbors, income)
# SOM function can only work on matrices
train_SOM <- as.matrix(scale(train_num))
train_mat <- as.matrix(scale(train_num))
# Switching to supervised SOMs
test_num <- test_set %>% select(year, day, length, weight, count, looped, neighbors, income)
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
## Treat as binary first, then maybe switch to categorical?
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_set$grey %>% classvec2classmat()
# Same for test set
test_grey <- test_set$grey %>% classvec2classmat()
# Create Data list for supervised SOM
#
train_list <- list(independent = train_mat, dependent = train_grey)
# Calculate ideal grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
# Formulaic method 1
#grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
grid_size = length(unique(train_set$label))
#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
grid_size
# Create SOM grid
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = FALSE)
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
# Set magic seed for reproducibility
set.seed(5)
## Now build the ESOM!
som.model <- som(train_SOM,
## Now build the model.
som_model <- xyf(train_mat, train_grey,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
cores = 4, # how to dynamically set number of cores?
cores = detectCores(), # detectCores() - 1 if system locks during calculation
keep.data = TRUE
)
# Visualize clusters
plot(som.model, type = 'mapping', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
# Distance map
plot(som.model, type = 'quality', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
# Visualize counts
plot(som.model, type = 'counts', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
# Visualize fan diagram
plot(som.model, type = 'codes', pch = 19, palette.name = topo.colors)
plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 1
plot(som.model, type = 'property', property = getCodes(som.model)[,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 2
plot(som.model, type = 'property', property = getCodes(som.model)[,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 3
plot(som.model, type = 'property', property = getCodes(som.model)[,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 4
plot(som.model, type = 'property', property = getCodes(som.model)[,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 5
plot(som.model, type = 'property', property = getCodes(som.model)[,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 6
plot(som.model, type = 'property', property = getCodes(som.model)[,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,6], main=colnames(train_num)[6], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 7
#plot(som.model, type = 'property', property = getCodes(som.model)[,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,7], main=colnames(train_num)[7], pch = 19, palette.name = topo.colors)
# Visualize heatmap for variable 8
#plot(som.model, type = 'property', property = getCodes(som.model)[,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors)
plot(som_model, type = 'property', property = som_model$codes[[1]][,8], main=colnames(train_num)[8], pch = 19, palette.name = topo.colors)
##Different cluster methods branch off here...

Visuals.R Normal file

@@ -0,0 +1,74 @@
# Do some graphical exploration before applying any models.
# Look at the example work for some ideas.
# Add any compelling visuals as needed here.
# ?? Cluster graphs go at the end.
# Install matrixStats package if needed
if(!require(matrixStats)) install.packages("matrixStats")
# Load matrixStats library
library(matrixStats)
## Principal Component Analysis
names(ransomware)
str(ransomware)
#Sample every nth row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# What percentage of sample is ransomware?
mean(train_samp$grey=="black")
# Keep only numeric columns
train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
# Scale the numeric columns
train_scaled <- train_num %>% scale()
# Histograms of each of the columns to show skewness
train_num$year %>% hist(main = paste("Histogram of","year"))
train_num$day %>% hist(main = paste("Histogram of","day"))
train_num$length %>% hist(main = paste("Histogram of","length"))
train_num$weight %>% hist(main = paste("Histogram of","weight"))
train_num$count %>% hist(main = paste("Histogram of","count"))
train_num$looped %>% hist(main = paste("Histogram of","looped"))
train_num$neighbors %>% hist(main = paste("Histogram of","neighbors"))
train_num$income %>% hist(main = paste("Histogram of","income"))
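# Compact alternative to the eight calls above (same histograms, written as a loop)
for (v in names(train_num)) hist(train_num[[v]], main = paste("Histogram of", v))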
# Check for variability across numerical columns using coefficients of variation
sds <- train_num %>% as.matrix() %>% colSds()
means <- train_num %>% as.matrix() %>% colMeans()
coeff_vars <- sds / means
plot(coeff_vars)
coeff_vars
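# Label and rank the coefficients of variation (ordering depends on the data);
# the highest-variability columns are the ones kept for the random forest in RanFor.R
names(coeff_vars) <- colnames(train_num)
sort(coeff_vars, decreasing = TRUE)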
# View distances between points of a sample to look for patterns
x <- train_scaled %>% as.matrix()
d <- dist(x)
image(as.matrix(d), col = rev(RColorBrewer::brewer.pal(9, "RdBu"))) # Consider a different palette, e.g. orange/blue
# Principal Component Analysis
pca <- prcomp(train_scaled)
pca
summary(pca)
pc <- 1:ncol(train_scaled)
qplot(pc, pca$sdev)
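# Cumulative share of variance explained by the first components (values depend
# on the sample; this mirrors the sdev plot above)
cumsum(pca$sdev^2) / sum(pca$sdev^2)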
# Plot the first two PCs with color representing black/white
data.frame(pca$x[,1:2], Grey=train_samp$grey) %>%
sample_n(200) %>%
ggplot(aes(PC1,PC2, fill = Grey))+
geom_point(cex=3, pch=21) +
coord_fixed(ratio = 1)
# First two dimensions do NOT preserve distance very well
#d_approx <- dist(pca$x[, 1:2])
#qplot(d, d_approx) + geom_abline(color="red")
# Clean up environment
rm(pca, x, coeff_vars, d, means, pc, sds)