The script is complete. The report has all the code from the script, and it compiles. I still need to add the textual parts for chunks 3 and 4. Final step is still the visuals, although I am thinking less is more on that one.
This commit is contained in:
parent
b0b78b546c
commit
b1c41d674c
|
@ -14,6 +14,12 @@
|
|||
library(tictoc)
|
||||
tic(quiet = FALSE)
|
||||
|
||||
# Set the repository
|
||||
r = getOption("repos")
|
||||
r["CRAN"] = "http://cran.us.r-project.org"
|
||||
options(repos = r)
|
||||
rm(r)
|
||||
|
||||
# Install necessary packages if not already present
|
||||
if(!require(tidyverse)) install.packages("tidyverse")
|
||||
if(!require(caret)) install.packages("caret")
|
||||
|
@ -37,7 +43,7 @@ dest_file <- "data/data.zip"
|
|||
if(!dir.exists("data"))dir.create("data")
|
||||
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
|
||||
|
||||
# Unzip into CSV
|
||||
# Unzip as CSV
|
||||
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
|
||||
"BitcoinHeistData.csv",
|
||||
exdir="data")
|
||||
|
@ -50,25 +56,25 @@ ransomware <- ransomware %>%
|
|||
mutate(label=as.factor(label),
|
||||
bw=as.factor(ifelse(label=="white", "white", "black")))
|
||||
|
||||
# Validation set made from 50% of BitcoinHeist data
|
||||
# Validation set made from 50% of BitcoinHeist data, for RAM considerations
|
||||
test_index <- createDataPartition(y = ransomware$bw,
|
||||
times = 1, p = .5, list = FALSE)
|
||||
|
||||
workset <- ransomware[-test_index,]
|
||||
validation <- ransomware[test_index,]
|
||||
|
||||
# Split the working set into a training set and a test set @ 50%
|
||||
# Split the working set into a training set and a test set @ 50%, RAM dictated
|
||||
test_index <- createDataPartition(y = workset$bw,
|
||||
times = 1, p = .5, list = FALSE)
|
||||
|
||||
train_set <- workset[-test_index,]
|
||||
test_set <- workset[test_index,]
|
||||
|
||||
|
||||
###############################################################################
|
||||
## Data preparation is now done
|
||||
## Separate into "black" and "white" groups using Random Forests predictions
|
||||
###############################################################################
|
||||
|
||||
# Keep only numeric columns, ignoring temporal features
|
||||
ransomware_num <- ransomware %>%
|
||||
select(length, weight, count, looped, neighbors, income)
|
||||
|
@ -97,59 +103,61 @@ train_samp <- train_set[seq(1, nrow(train_set), 100), ]
|
|||
# Keep only numeric columns with highest coefficients of variation
|
||||
train_num <- train_samp %>% select(selected_features[1], selected_features[2])
|
||||
|
||||
# Binary outputs, black = ransomware, white = non-ransomware, train set
|
||||
# Binary labels, black = ransomware, white = non-ransomware, train set
|
||||
train_bw <- train_samp$bw
|
||||
|
||||
#Sample every nth row due to memory constraints
|
||||
#Sample every 100th row due to memory constraints to make test sample same size.
|
||||
# Index by test_set's own row count so the sequence can never run past the
# last row of test_set (an out-of-range row index yields an all-NA row).
test_samp <- test_set[seq(1, nrow(test_set), 100), ]
|
||||
|
||||
# Dimension reduction again
|
||||
# Dimension reduction again, selecting features with highest CVs
|
||||
test_num <- test_samp %>% select(selected_features[1], selected_features[2])
|
||||
|
||||
# Same for test set
|
||||
# Binary labels for test set
|
||||
test_bw <- test_samp$bw
|
||||
|
||||
# Lower CV numbers
|
||||
# Cross Validation, ten fold
|
||||
control <- trainControl(method="cv", number = 10)
|
||||
|
||||
# Control grid with variation on mtry
|
||||
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
|
||||
|
||||
# Train Random Forests model
|
||||
# Run Cross Validation using control and grid set above
|
||||
rf_model <- train(train_num, train_bw, method="rf",
|
||||
trControl = control, tuneGrid=grid)
|
||||
|
||||
# Fit model
|
||||
# Supervised fit of model using cross validated optimization
|
||||
# Fit the final forest on the two selected features only. train_samp still
# carries the bw and label columns, so using it as the predictor table would
# leak the target into the model. randomForest() has no minNode argument, so
# the cross-validated tuning value is passed as mtry, where it belongs.
# predict() later picks the trained columns out of newdata by name.
fit_rf <- randomForest(train_num, train_bw,
                       mtry = rf_model$bestTune$mtry)
|
||||
|
||||
# Measure accuracy of model against test sample
|
||||
y_hat_rf <- predict(fit_rf, test_samp)
|
||||
cm <- confusionMatrix(y_hat_rf, test_bw)
|
||||
cm_test <- confusionMatrix(y_hat_rf, test_bw)
|
||||
message("Overall accuracy for the binary separation is ",
|
||||
cm$overall["Accuracy"])
|
||||
cm
|
||||
cm_test$overall["Accuracy"])
|
||||
cm_test
|
||||
|
||||
# Measure accuracy of model against full ransomware set
|
||||
ransomware_y_hat_rf <- predict(fit_rf, ransomware)
|
||||
cm_ransomware <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
|
||||
message("Overall accuracy for the full data set is ",
|
||||
cm_ransomware$overall["Accuracy"])
|
||||
cm_ransomware
|
||||
|
||||
##############################################################################
|
||||
## Now we use the Random Forest model to exclude the "white" addresses from
|
||||
## the full ransomware set, to categorize the "black" addresses into families.
|
||||
message("Now we further categorize black address into ransomware families.")
|
||||
##############################################################################
|
||||
|
||||
# Measure accuracy of model against full ransomware set
|
||||
ransomware_y_hat_rf <- predict(fit_rf, ransomware)
|
||||
cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
|
||||
message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
|
||||
cm
|
||||
|
||||
# Now use this prediction to reduce the original set to only "black" addresses
|
||||
# First append the full set of predictions to the original set.
|
||||
ransomware$predictions <- ransomware_y_hat_rf
|
||||
ransomware$prediction <- ransomware_y_hat_rf
|
||||
|
||||
# Filter out all the predicted "white" addresses,
|
||||
# leaving only predicted "black" addresses
|
||||
black_addresses <- ransomware %>% filter(predictions=="black")
|
||||
black_addresses <- ransomware %>% filter(prediction=="black")
|
||||
|
||||
# Split the reduced black-predictions into a training set and a test set @ 50%
|
||||
test_index <- createDataPartition(y = black_addresses$predictions,
|
||||
test_index <- createDataPartition(y = black_addresses$prediction,
|
||||
times = 1, p = .5, list = FALSE)
|
||||
|
||||
train_set <- black_addresses[-test_index,]
|
||||
|
@ -160,10 +168,10 @@ test_set <- black_addresses[test_index,]
|
|||
train_num <- train_set %>%
|
||||
select(length, weight, count, looped, neighbors, income)
|
||||
|
||||
# SOM function can only work on matrices
|
||||
# SOM function can only work on matrices.
|
||||
train_mat <- as.matrix(scale(train_num))
|
||||
|
||||
# Switching to supervised SOMs
|
||||
# Select non-temporal numerical features only
|
||||
test_num <- test_set %>%
|
||||
select(length, weight, count, looped, neighbors, income)
|
||||
|
||||
|
@ -172,64 +180,65 @@ test_mat <- as.matrix(scale(test_num,
|
|||
center = attr(train_mat, "scaled:center"),
|
||||
scale = attr(train_mat, "scaled:scale")))
|
||||
|
||||
# Categorical
|
||||
# Categorical labels for training set
|
||||
train_label <- train_set$label %>% classvec2classmat()
|
||||
|
||||
# Same for test set
|
||||
test_label <- test_set$label %>% classvec2classmat()
|
||||
|
||||
# Create Data list for supervised SOM
|
||||
# Create data list for supervised SOM
|
||||
train_list <- list(independent = train_mat, dependent = train_label)
|
||||
|
||||
# Calculate ideal grid size according to:
|
||||
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
|
||||
|
||||
# Formulaic method 1
|
||||
# Formulaic method 1, makes a larger graph in this case
|
||||
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
|
||||
|
||||
# Based on categorical number, method 2
|
||||
# Based on categorical number, method 2, smaller graph with fewer cells
|
||||
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
|
||||
|
||||
grid_size
|
||||
message("A grid size of ", grid_size, " has been chosen.")
|
||||
|
||||
# Create SOM grid
|
||||
train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
|
||||
topo="hexagonal", toroidal = TRUE)
|
||||
|
||||
## Now build the model.
|
||||
## Now build the SOM model using the supervised method xyf()
|
||||
som_model2 <- xyf(train_mat, train_label,
|
||||
grid = train_grid,
|
||||
rlen = 100,
|
||||
mode="pbatch",
|
||||
cores = detectCores(),
|
||||
cores = detectCores(), # Use all cores
|
||||
# cores = detectCores() - 1, # Leave one core for system
|
||||
keep.data = TRUE
|
||||
)
|
||||
|
||||
# Now test predictions of test set
|
||||
|
||||
# Now test predictions of test set, create data list for test set
|
||||
test_list <- list(independent = test_mat, dependent = test_label)
|
||||
|
||||
# Generate predictions
|
||||
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
|
||||
table(test_set$label, ransomware_group.prediction$prediction[[2]])
|
||||
|
||||
# Confusion Matrix
|
||||
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
|
||||
test_set$label)
|
||||
message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
|
||||
cm_labels
|
||||
|
||||
|
||||
#############################################################################
|
||||
## K-Means Clustering to visualize the categorization of the SOM
|
||||
## For a good tutorial, visit:
|
||||
## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
|
||||
#############################################################################
|
||||
|
||||
# Set number of clusters to be equal to number of known ransomware groups
|
||||
n_groups <- length(unique(ransomware$label)) - 1
|
||||
n_groups
|
||||
|
||||
# Generate k-means clustering
|
||||
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
|
||||
|
||||
# Plot clustering results
|
||||
plot(som_model2,
|
||||
main = 'K-Means Clustering',
|
||||
type = "property",
|
||||
|
@ -237,6 +246,9 @@ plot(som_model2,
|
|||
palette.name = topo.colors)
|
||||
add.cluster.boundaries(som_model2, som.cluster$cluster)
|
||||
|
||||
message("Overall accuracy is ", cm_labels$overall["Accuracy"])
|
||||
|
||||
|
||||
#End timer
|
||||
toc()
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
---
|
||||
title: \vspace{1in}Detecting Ransomware Addresses on the Bitcoin Blockchain using Random Forests and Self Organizing Maps
|
||||
subtitle: \vspace{.5in}HarvardX Final Capstone CYO Project
|
||||
subtitle: \vspace{.5in}HarvardX PH125.9x Final Capstone CYO Project
|
||||
\vspace{.5in}
|
||||
author: "Kaylee Robert Tejeda"
|
||||
date: "10/31/2021"
|
||||
|
@ -79,31 +79,41 @@ These variables are defined rather abstractly, viewing the blockchain as a topol
|
|||
This data set was discovered while exploring the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php)$^{[4]}$ as suggested in the project instructions. The author of this report, interested in Bitcoin and other cryptocurrencies since (unsuccessfully) mining them on an ASUS netbook in rural Peru in late 2010, used *cryptocurrency* as a preliminary search term. This brought up a single data set entitled ["BitcoinHeist: Ransomware Address Data Set"](https://archive.ics.uci.edu/ml/datasets/BitcoinHeistRansomwareAddressDataset#). The data set was downloaded and the exploration began.
|
||||
|
||||
|
||||
```{r data-prep, echo=FALSE, include=FALSE}
|
||||
```{r install-load-libraries&download-data, echo=FALSE, include=FALSE}
|
||||
|
||||
# Set the repository to a known working mirror just in case it has not already been set
|
||||
cat("Setting Seattle repository")
|
||||
# Set the repository
|
||||
r = getOption("repos")
|
||||
r["CRAN"] = "http://cran.fhcrc.org/"
|
||||
r["CRAN"] = "http://cran.us.r-project.org"
|
||||
options(repos = r)
|
||||
rm(r)
|
||||
|
||||
# Install necessary packages
|
||||
# Install necessary packages if not already present
|
||||
if(!require(tidyverse)) install.packages("tidyverse")
|
||||
if(!require(caret)) install.packages("caret")
|
||||
if(!require(randomForest)) install.packages("randomForest")
|
||||
if(!require(kohonen)) install.packages("kohonen")
|
||||
if(!require(parallel)) install.packages("parallel")
|
||||
if(!require(matrixStats)) install.packages("matrixStats")
|
||||
|
||||
# Load Libraries
|
||||
library(tidyverse)
|
||||
library(caret)
|
||||
library(randomForest)
|
||||
library(kohonen)
|
||||
library(parallel)
|
||||
library(matrixStats)
|
||||
|
||||
# Download data
|
||||
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
|
||||
url <-
|
||||
"https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
|
||||
dest_file <- "data/data.zip"
|
||||
if(!dir.exists("data"))dir.create("data")
|
||||
if(!file.exists(dest_file))download.file(url, destfile = dest_file)
|
||||
|
||||
# Unzip
|
||||
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
|
||||
# Unzip as CSV
|
||||
if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
|
||||
"BitcoinHeistData.csv",
|
||||
exdir="data")
|
||||
|
||||
# Import data from CSV
|
||||
ransomware <- read_csv("data/BitcoinHeistData.csv")
|
||||
|
@ -171,57 +181,80 @@ The original research team downloaded and parsed the entire Bitcoin transaction
|
|||
|
||||
It is immediately apparent that this is a rather large data set. The usual practice of partitioning out eighty to ninety percent of the data for a training set results in a data set that is too large to process given the hardware available. For reasons that no longer apply, the original data set was first split in half with 50% reserved as "validation set" and the other 50% used as the "working set". This working set was again split in half, to give a "training set" that was of a reasonable size to deal with. At this point the partitions were small enough to work with, so the sample partitions were not further refined. This is a potential area for later optimization. Careful sampling was carried out to ensure that the ransomware groups were represented in each sample.
|
||||
|
||||
```{r dataprep, echo=FALSE, include=FALSE}
|
||||
```{r data-prep, echo=FALSE, include=FALSE}
|
||||
|
||||
# ?? Cluster graphs go at the end.
|
||||
# Turn labels into factors, "bw" is binary factor for ransomware/non-ransomware
|
||||
ransomware <- ransomware %>%
|
||||
mutate(label=as.factor(label),
|
||||
bw=as.factor(ifelse(label=="white", "white", "black")))
|
||||
|
||||
# Install matrixStats package if needed
|
||||
if(!require(matrixStats)) install.packages("matrixStats")
|
||||
|
||||
# Load matrixStats library
|
||||
library(matrixStats)
|
||||
|
||||
|
||||
# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
|
||||
ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
|
||||
|
||||
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
|
||||
test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
|
||||
# Validation set made from 50% of BitcoinHeist data, for RAM considerations
|
||||
test_index <- createDataPartition(y = ransomware$bw,
|
||||
times = 1, p = .5, list = FALSE)
|
||||
|
||||
workset <- ransomware[-test_index,]
|
||||
validation <- ransomware[test_index,]
|
||||
|
||||
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
|
||||
test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
|
||||
# Split the working set into a training set and a test set @ 50%, RAM dictated
|
||||
test_index <- createDataPartition(y = workset$bw,
|
||||
times = 1, p = .5, list = FALSE)
|
||||
|
||||
train_set <- workset[-test_index,]
|
||||
test_set <- workset[test_index,]
|
||||
|
||||
|
||||
#Sample every nth row due to memory constraints
|
||||
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
|
||||
|
||||
# Keep only numeric columns
|
||||
train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
|
||||
|
||||
# Scale the numeric columns
|
||||
train_scaled <- train_num %>% scale()
|
||||
|
||||
# Find proportion of full data set that is ransomware
|
||||
|
||||
ransomprop <- mean(ransomware$bw=="black")
|
||||
|
||||
# Check for NAs
|
||||
|
||||
no_nas <- sum(is.na(ransomware))
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Exploration and Visualization (do this part last....)
|
||||
### Exploration and Visualization ( Chunk #2, do this part last....)
|
||||
|
||||
The ransomware addresses make up less than 2% of the overall data set. This presents a challenge as the target observations are sparse within the data set, especially when we consider that this is then divided into 29 subsets. In fact, some of the ransomware groups have only a single member, making categorization a dubious task. At least there are no missing values to worry about.
|
||||
|
||||
```{r cv-calcs, echo=FALSE}
|
||||
|
||||
# Keep only numeric columns, ignoring temporal features
|
||||
ransomware_num <- ransomware %>%
|
||||
select(length, weight, count, looped, neighbors, income)
|
||||
|
||||
# Check for variation across numerical columns using coefficients of variation
|
||||
#
|
||||
# Calculate standard deviations for each column
|
||||
sds <- ransomware_num %>% as.matrix() %>% colSds()
|
||||
|
||||
# Calculate means for each column
|
||||
means <- ransomware_num %>% as.matrix() %>% colMeans()
|
||||
|
||||
# Calculate CVs for each column
|
||||
# Coefficient of variation = standard deviation / mean. Use true division:
# %/% is integer division and would truncate every ratio to a whole number.
coeff_vars <- sds / means
|
||||
|
||||
# Select the two features with the highest coefficients of variation
|
||||
selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
|
||||
|
||||
#Sample every 100th row due to memory constraints
|
||||
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
|
||||
|
||||
# Keep only numeric columns with highest coefficients of variation
|
||||
train_num <- train_samp %>% select(selected_features[1], selected_features[2])
|
||||
|
||||
# Binary labels, black = ransomware, white = non-ransomware, train set
|
||||
train_bw <- train_samp$bw
|
||||
|
||||
#Sample every 100th row due to memory constraints to make test sample same size.
|
||||
# Index by test_set's own row count so the sequence can never run past the
# last row of test_set (an out-of-range row index yields an all-NA row).
test_samp <- test_set[seq(1, nrow(test_set), 100), ]
|
||||
|
||||
# Dimension reduction again, selecting features with highest CVs
|
||||
test_num <- test_samp %>% select(selected_features[1], selected_features[2])
|
||||
|
||||
# Binary labels for test set
|
||||
test_bw <- test_samp$bw
|
||||
|
||||
```
|
||||
|
||||
```{r data-sparsness, echo=FALSE}
|
||||
|
||||
|
@ -238,7 +271,6 @@ knitr::kable(
|
|||
|
||||
)
|
||||
|
||||
|
||||
```
|
||||
|
||||
Let's take a look at the distribution of the different features. Note how skewed the non-temporal features are, some of them being bimodal:
|
||||
|
@ -271,17 +303,12 @@ ggp2
|
|||
|
||||
Now we can compare the relative spread of each feature by calculating the coefficient of variation for each column. Larger coefficients of variation indicate larger relative spread compared to other columns.
|
||||
|
||||
```{r sds, echo=FALSE}
|
||||
# Check for variability across numerical columns using coefficients of variation
|
||||
|
||||
# Calculate standard deviations for each column
|
||||
sds <- train_num %>% as.matrix() %>% colSds()
|
||||
```{r cv-results, echo=FALSE}
|
||||
|
||||
# Calculate means for each column
|
||||
means <- train_num %>% as.matrix() %>% colMeans()
|
||||
|
||||
# Calculate CVs for each column
|
||||
# Coefficient of variation = standard deviation / mean. Use true division:
# %/% is integer division and would truncate every ratio to a whole number.
coeff_vars <- sds / means
|
||||
# Report the two selected features; the explicit " and " keeps the two
# feature names from being run together in the printed sentence.
message("The features with the highest coefficients of variation are ",
        selected_features[1], " and ", selected_features[2],
        ", which will be used to train the binary model.")
|
||||
|
||||
# Summarize results in a table and a plot
|
||||
knitr::kable(coeff_vars)
|
||||
|
@ -293,22 +320,25 @@ plot(coeff_vars)
|
|||
From this, it appears that *income* has the widest range of variability, followed by *neighbors*. These are also the features that are most strongly skewed to the right, meaning that a few addresses have really high values for each of these features while the bulk of the data set has very low values for these numbers.
|
||||
|
||||
|
||||
Now do the following (after filling in methods, results, and conclusions, since those are done already:
|
||||
Now do the following (after filling in methods, results, and conclusions, since those are done already):
|
||||
|
||||
|
||||
6) Break into groups somehow. Graph variables per group? Show how the variables are distributed for each ransomware group? Percent ransomware per each day of the week, for example. Is ransomware more prevalent on a particular day of the week? Break other numerical values into bins, and graph percentage per bin. Look for trends and correlations between groups/variables, and display them here. MORE OF THIS....
|
||||
|
||||
|
||||
```{r percent per column, echo=FALSE}
|
||||
# Do this here
|
||||
```{r shrimp-percentage, echo=FALSE, include=FALSE}
|
||||
# Count how many wallets have less than one full bitcoin
|
||||
|
||||
shrimp <- train_samp %>% filter(income < 10^8 )
|
||||
|
||||
mean(shrimp$bw == "black")
|
||||
|
||||
```
|
||||
|
||||
|
||||
```{r shrimp-output, echo=FALSE}
|
||||
# Print the proportion of wallets with less than one full bitcoin that are ransomware ("black")
|
||||
|
||||
mean(shrimp$bw == "black")
|
||||
|
||||
```
|
||||
7) Principal Component Analysis can go here. See "Interlinkages of Malaysian Banking Systems" for an example of detailed PCA. Is it exploratory analysis, or is it a predictive method? I was under the assumption that it is a form of analysis, but the paper mentioned extends it to a form of predictive modeling. How to do this *right* (?!?!)
|
||||
|
||||
|
||||
|
@ -320,6 +350,8 @@ mean(shrimp$bw == "black")
|
|||
#d <- dist(x)
|
||||
#image(as.matrix(d), col = rev(RColorBrewer::brewer.pal(9, "RdBu"))) # Change colors or Orange/Blue
|
||||
|
||||
train_scaled <- scale(train_num)
|
||||
|
||||
# Principal Component Analysis
|
||||
pca <- prcomp(train_scaled)
|
||||
pca
|
||||
|
@ -345,427 +377,176 @@ data.frame(pca$x[,1:2], bw=train_samp$bw) %>%
|
|||
### Insights Gained from Exploration
|
||||
|
||||
From the previous visual and statistical exploration of the data, it becomes clear what the challenge is. Ransomware addresses are very sparse in the data set, making up less than 2% of the addresses. That small percentage is also further classified into 28 groups. Perhaps the original paper was a bit too ambitious in trying to categorize all the addresses into 29 categories, including the "white" addresses. To simplify our approach, we will categorize the addresses in a binary way, either "white" or "black", where "black" signifies an association with ransomware transactions. Asking this as a "ransomware or not-ransomware" question allows for application of methods that are impractical otherwise.
|
||||
|
||||
|
||||
## Modeling approach (chunk #3, needs rewriting of text parts only)
|
||||
|
||||
Akcora et al. mention that they tried to model the data using a Random Forests method, but that the complexity of the data set led to problems with that approach.[3] Switching to a binary perspective on the problem might alleviate some of that complexity, and is worth another look. The topological nature of the way the data set has been described numerically led me to search for topological machine learning methods. Searching for *topo* in the documentation for the `caret` package [6] resulted in the entry for Self Organizing Maps, supplied by the `kohonen` package. The description at CRAN [7] was intriguing enough for me to investigate further.
|
||||
|
||||
|
||||
Describe how you started with categorical SOMs, switched to binary SOMs, then applied randomForest to the binary problem, and were surprised by the results. Decided to re-apply categorical SOMs to black-only addresses, as predicted by the binary Random Forests approach. The result is the following two-step approach, with the optional clustering visualizations at the end.
|
||||
|
||||
## Modeling approach
|
||||
### Method Part 1: Binary Random Forests to isolate ransomware addresses first.
|
||||
```{r random-forest-prep, echo=FALSE, include=FALSE}
|
||||
|
||||
Akcora et al. mention that they tried to model the data using a Random Forests method, but that the complexity of the data set led to problems with that approach.[3] Switching to a binary perspective on the problem might alleviate some of that complexity, and is worth another look. The topological nature of the way the data set has been described numerically led me to search for topological machine learning methods. Searching for *topo* in the documentation for the `caret` package [6] resulted in the entry for Self Organizing Maps, supplied by the `kohonen` package. The description at CRAN [7] was intriguing enough for me to investigate further.
|
||||
|
||||
### Method 1: Binary Random Forests
|
||||
|
||||
Using the `randomForest` library, we train a model on our training set and test against the "black/white" categorization on our test set.
|
||||
|
||||
```{r binary_random_forests, echo=FALSE, include=FALSE}
|
||||
|
||||
# Install randomForest package if needed
|
||||
if(!require(randomForest)) install.packages("randomForest")
|
||||
library(randomForest)
|
||||
|
||||
# Keep only numeric columns with highest coefficients of variation for dimension reduction
|
||||
train_num <- train_samp %>% select(neighbors, income)
|
||||
|
||||
# Binary outputs, black=ransomware, white=non-ransomware, train set
|
||||
train_bw <- train_samp$bw
|
||||
|
||||
#Sample every nth row due to memory constraints
|
||||
set.seed(23)
|
||||
test_samp <- test_set[seq(1, nrow(train_set), 100), ]
|
||||
|
||||
# Dimension reduction again
|
||||
test_num <- test_samp %>% select(neighbors, income)
|
||||
|
||||
# Same for test set
|
||||
test_bw <- test_samp$bw
|
||||
|
||||
# Lower CV numbers
|
||||
# Cross Validation, ten fold
|
||||
control <- trainControl(method="cv", number = 10)
|
||||
|
||||
# Control grid with variation on mtry
|
||||
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
|
||||
|
||||
# Train Random Forests model
|
||||
rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
|
||||
# Run Cross Validation using control and grid set above
|
||||
rf_model <- train(train_num, train_bw, method="rf",
|
||||
trControl = control, tuneGrid=grid)
|
||||
|
||||
|
||||
# Fit model
|
||||
# Supervised fit of model using cross validated optimization
|
||||
# Fit the final forest on the two selected features only. train_samp still
# carries the bw and label columns, so using it as the predictor table would
# leak the target into the model. randomForest() has no minNode argument, so
# the cross-validated tuning value is passed as mtry, where it belongs.
# predict() later picks the trained columns out of newdata by name.
fit_rf <- randomForest(train_num, train_bw,
                       mtry = rf_model$bestTune$mtry)
|
||||
|
||||
```
|
||||
|
||||
We can see that the results are quite good against the smaller test set and the larger validation set.
|
||||
|
||||
```{r binary_random_forests-validation, echo=FALSE}
|
||||
|
||||
# Check for best tuning parameters
|
||||
ggplot(rf_model)
|
||||
rf_model$bestTune
|
||||
|
||||
# Check for enough trees
|
||||
plot(fit_rf)
|
||||
|
||||
# Measure accuracy of model against test sample
|
||||
y_hat_rf <- predict(fit_rf, test_samp)
|
||||
cm_test <- confusionMatrix(y_hat_rf, test_bw)
|
||||
|
||||
message("Confusion Matrix for test set:")
|
||||
|
||||
# Measure accuracy of model against full ransomware set
|
||||
ransomware_y_hat_rf <- predict(fit_rf, ransomware)
|
||||
cm_ransomware <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
|
||||
|
||||
```
|
||||
|
||||
```{r random-forest-output, echo=FALSE}
|
||||
|
||||
|
||||
message("Overall accuracy for the binary separation is ",
|
||||
cm_test$overall["Accuracy"])
|
||||
cm_test
|
||||
|
||||
# Measure accuracy of model against full validation set
|
||||
|
||||
y_hat_rf <- predict(fit_rf, validation)
|
||||
cm_validation <- confusionMatrix(y_hat_rf, validation$bw)
|
||||
|
||||
message("Confusion Matrix for validation set:")
|
||||
cm_validation
|
||||
|
||||
message("Overall accuracy for the full data set is ",
|
||||
cm_ransomware$overall["Accuracy"])
|
||||
cm_ransomware
|
||||
|
||||
```
|
||||
|
||||
### Method Part 2: Categorical SOMs to categorize predicted ransomware addresses.
|
||||
|
||||
### Method 2: Binary SOMs
|
||||
```{r soms-prep, echo=FALSE, include=FALSE}
|
||||
|
||||
If we ask the same question of a more sophisticated and topological approach, how good is the model? Mention how the original paper was topological in nature, and how this led to the investigation of SOMs. Repeat the binary "b/w" approach using SOMs. This accuracy is still pretty good, but not *as* good as the random forest method. Point out how SOMs are really used for classification into _many_ groups. This leads to an Insight! (see above) What if we first _isolate_ the "black" addresses using Random Forest, and then categorize the black only subset (< 2%) using categorical SOMs? This leads to a 2-part system...
|
||||
##############################################################################
|
||||
## Now we use the Random Forest model to exclude the "white" addresses from
|
||||
## the full ransomware set, to categorize the "black" addresses into families.
|
||||
##############################################################################
|
||||
|
||||
Note to self: I don't even use this part in the final script. Should I leave it out of the paper too?
|
||||
# Now use this prediction to reduce the original set to only "black" addresses
|
||||
# First append the full set of predictions to the original set.
|
||||
ransomware$prediction <- ransomware_y_hat_rf
|
||||
|
||||
```{r binary_soms, echo=FALSE, include=FALSE}
|
||||
# Install kohonen package if needed
|
||||
if(!require(kohonen)) install.packages("kohonen")
|
||||
# Filter out all the predicted "white" addresses,
|
||||
# leaving only predicted "black" addresses
|
||||
black_addresses <- ransomware %>% filter(prediction=="black")
|
||||
|
||||
# Load kohonen library
|
||||
library(kohonen)
|
||||
# Split the reduced black-predictions into a training set and a test set @ 50%
|
||||
test_index <- createDataPartition(y = black_addresses$prediction,
|
||||
times = 1, p = .5, list = FALSE)
|
||||
|
||||
# Install parallel package if needed
|
||||
if(!require(parallel)) install.packages("parallel")
|
||||
train_set <- black_addresses[-test_index,]
|
||||
test_set <- black_addresses[test_index,]
|
||||
|
||||
# Load parallel library
|
||||
library(parallel)
|
||||
|
||||
# Keep only numeric columns, ignoring dates and looped.
|
||||
#train_num <- train_set %>% select(length, weight, count, neighbors, income)
|
||||
# Keep only numeric columns, ignoring temporal variables.
|
||||
train_num <- train_set %>%
|
||||
select(length, weight, count, looped, neighbors, income)
|
||||
|
||||
# SOM function can only work on matrices
|
||||
#train_mat <- as.matrix(scale(train_num))
|
||||
|
||||
# Switching to supervised SOMs
|
||||
#test_num <- test_set %>% select(length, weight, count, neighbors, income)
|
||||
|
||||
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
|
||||
#test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
|
||||
# "scaled:center"), scale = attr(train_mat, "scaled:scale")))
|
||||
|
||||
# Binary outputs, black=ransomware, white=non-ransomware, train set
|
||||
#train_bw <- train_set$bw %>% classvec2classmat()
|
||||
|
||||
# Same for test set
|
||||
#test_bw <- test_set$bw %>% classvec2classmat()
|
||||
|
||||
# Create Data list for supervised SOM
|
||||
#
|
||||
#train_list <- list(independent = train_mat, dependent = train_bw)
|
||||
|
||||
# Calculate ideal grid size according to:
|
||||
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
|
||||
|
||||
# Formulaic method 1
|
||||
#grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
|
||||
# Based on categorical number, method 2
|
||||
#grid_size = ceiling(sqrt(length(unique(ransomware$bw))))
|
||||
#grid_size
|
||||
|
||||
# Create SOM grid
|
||||
#train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
|
||||
|
||||
# Set magic seed for reproducibility
|
||||
#set.seed(23)
|
||||
|
||||
## Now build the model.
|
||||
#som_model <- xyf(train_mat, train_bw,
|
||||
# grid = train_grid,
|
||||
# rlen = 100,
|
||||
# mode="pbatch", # or: alpha = c(0.05,0.01),
|
||||
# cores = detectCores(), # detectCores() - 1 if system becomes unresponsive during training
|
||||
# keep.data = TRUE
|
||||
#)
|
||||
|
||||
|
||||
# Now test predictions
|
||||
# https://clarkdatalabs.github.io/soms/SOM_NBA
|
||||
|
||||
#test_list <- list(independent = test_mat, dependent = test_bw)
|
||||
|
||||
#ransomware.prediction <- predict(som_model, newdata = test_list)
|
||||
|
||||
|
||||
|
||||
# Now test predictions of validation set
|
||||
|
||||
# Switching to supervised SOMs
|
||||
#valid_num <- validation %>% select(length, weight, count, neighbors, income)
|
||||
|
||||
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
|
||||
#valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
|
||||
# "scaled:center"), scale = attr(train_mat, "scaled:scale")))
|
||||
|
||||
#valid_bw <- validation$bw
|
||||
|
||||
#valid_list <- list(independent = valid_mat, dependent = valid_bw)
|
||||
|
||||
# Requires up to 16GB of RAM, skip if resources are limited
|
||||
#ransomware.prediction.validation <- predict(som_model, newdata = valid_list)
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
```{r binary_soms-cms, echo=FALSE}
|
||||
|
||||
#table(test_set$bw, ransomware.prediction$prediction[[2]]) %>% knitr::kable()
|
||||
|
||||
#table(validation$bw, ransomware.prediction.validation$prediction[[2]]) %>% knitr::kable()
|
||||
|
||||
# These are bogging down the pdf. Choose only a few?
|
||||
# Visualize clusters
|
||||
#plot(som_model, type = 'mapping', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Distance map
|
||||
#plot(som_model, type = 'quality', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize counts
|
||||
#plot(som_model, type = 'counts', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize fan diagram
|
||||
#plot(som_model, type = 'codes', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 1
|
||||
#plot(som_model, type = 'property', property = som_model$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 2
|
||||
#plot(som_model, type = 'property', property = som_model$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 3
|
||||
#plot(som_model, type = 'property', property = som_model$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 4
|
||||
#plot(som_model, type = 'property', property = som_model$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 5
|
||||
#plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Confusion Matrix
|
||||
#cm_bw <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$bw)
|
||||
#cm_bw$overall
|
||||
|
||||
# Now test predictions of validation set
|
||||
|
||||
# Confusion Matrix
|
||||
#cm_bw.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$bw)
|
||||
#cm_bw.validation$overall
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
### Method 3: Categorical SOMs
|
||||
|
||||
Describe categorical SOM work here, show results. This is where the pretty colored hex-graphs show up.
|
||||
|
||||
```{r categorical_soms, echo=FALSE, include=FALSE}
|
||||
# Do this here
|
||||
# Try categorical SOMs on black-only addresses....
|
||||
#!! This is NOT right, is it?
|
||||
#!! It would be even MORE impressive if I removed all the PREDICTED whites from
|
||||
#!! the test set instead and started there.
|
||||
|
||||
blacks <- ransomware %>% filter(!label=="white")
|
||||
|
||||
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
|
||||
set.seed(23)
|
||||
test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
|
||||
|
||||
workset_blacks <- blacks[-test_index,]
|
||||
temp <- blacks[test_index,]
|
||||
|
||||
# Make sure addresses in validation set are also in working set...
|
||||
# validation <- temp %>%
|
||||
# semi_join(workset, by = "address")
|
||||
|
||||
# Add rows removed from validation set back into working set...
|
||||
#removed <- anti_join(temp, validation)
|
||||
#workset <- rbind(workset, removed)
|
||||
|
||||
# ... Or not
|
||||
validation_blacks <- temp
|
||||
|
||||
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes (label)
|
||||
set.seed(23)
|
||||
test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
|
||||
|
||||
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
|
||||
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
|
||||
|
||||
train_set <- workset_blacks[-test_index,]
|
||||
temp <- workset_blacks[test_index,]
|
||||
|
||||
# Make sure addresses in test set are also in training set....
|
||||
#test_set <- temp %>%
|
||||
# semi_join(train_set, by = "address")
|
||||
|
||||
# Add rows removed from test set back into training set....
|
||||
#removed <- anti_join(temp, test_set)
|
||||
#train_set <- rbind(train_set, removed)
|
||||
|
||||
# ....Or not
|
||||
test_set <- temp
|
||||
|
||||
##!! Data preparation is done, now focusing on Self Organizing Maps as our method
|
||||
##!! Start here after reworking the data prep steps above.
|
||||
|
||||
# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
|
||||
train_num <- train_set %>% select(length, weight, count, neighbors, income)
|
||||
|
||||
# SOM function can only work on matrices
|
||||
# SOM function can only work on matrices.
|
||||
train_mat <- as.matrix(scale(train_num))
|
||||
|
||||
# Switching to supervised SOMs
|
||||
test_num <- test_set %>% select(length, weight, count, neighbors, income)
|
||||
# Select non-temporal numerical features only
|
||||
test_num <- test_set %>%
|
||||
select(length, weight, count, looped, neighbors, income)
|
||||
|
||||
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
|
||||
test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
|
||||
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
|
||||
# Testing data is scaled according to how we scaled our training data.
|
||||
test_mat <- as.matrix(scale(test_num,
|
||||
center = attr(train_mat, "scaled:center"),
|
||||
scale = attr(train_mat, "scaled:scale")))
|
||||
|
||||
# Categorical
|
||||
# Categorical labels for training set
|
||||
train_label <- train_set$label %>% classvec2classmat()
|
||||
|
||||
# Same for test set
|
||||
test_label <- test_set$label %>% classvec2classmat()
|
||||
|
||||
# Create Data list for supervised SOM
|
||||
#
|
||||
# Create data list for supervised SOM
|
||||
train_list <- list(independent = train_mat, dependent = train_label)
|
||||
|
||||
# Calculate ideal grid size according to:
|
||||
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
|
||||
|
||||
# Formulaic method 1
|
||||
# Formulaic method 1, makes a larger graph in this case
|
||||
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
|
||||
# Based on categorical number, method 2
|
||||
|
||||
# Based on categorical number, method 2, smaller graph with fewer cells
|
||||
#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
|
||||
grid_size
|
||||
|
||||
# Create SOM grid
|
||||
train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
|
||||
train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
|
||||
topo="hexagonal", toroidal = TRUE)
|
||||
|
||||
# Set magic seed for reproducibility
|
||||
set.seed(23)
|
||||
|
||||
## Now build the model.
|
||||
## Now build the SOM model using the supervised method xyf()
|
||||
som_model2 <- xyf(train_mat, train_label,
|
||||
grid = train_grid,
|
||||
rlen = 100,
|
||||
mode="pbatch", # or: alpha = c(0.05,0.01),
|
||||
cores = detectCores(), # detectCores() - 1 if system locks during calculation
|
||||
keep.data = TRUE
|
||||
grid = train_grid,
|
||||
rlen = 100,
|
||||
mode="pbatch",
|
||||
cores = detectCores(), # Use all cores
|
||||
# cores = detectCores() - 1, # Leave one core for system
|
||||
keep.data = TRUE
|
||||
)
|
||||
|
||||
# Now test predictions of test set
|
||||
# https://clarkdatalabs.github.io/soms/SOM_NBA
|
||||
|
||||
# Now test predictions of test set, create data list for test set
|
||||
test_list <- list(independent = test_mat, dependent = test_label)
|
||||
|
||||
# Generate predictions
|
||||
ransomware_group.prediction <- predict(som_model2, newdata = test_list)
|
||||
|
||||
# Confusion Matrix
|
||||
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
|
||||
test_set$label)
|
||||
|
||||
|
||||
# Now test predictions of validation set
|
||||
|
||||
# Switching to supervised SOMs
|
||||
valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
|
||||
|
||||
# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
|
||||
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
|
||||
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
|
||||
|
||||
|
||||
valid_label <- validation_blacks$label
|
||||
|
||||
valid_list <- list(independent = valid_mat, dependent = valid_label)
|
||||
|
||||
ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r categorical_soms_cms, echo=FALSE}
|
||||
```{r soms-output, echo=FALSE}
|
||||
|
||||
#table(test_set$label, ransomware_group.prediction$prediction[[2]]) %>% knitr::kable()
|
||||
message("A grid size of ", grid_size, " has been chosen.")
|
||||
|
||||
#table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]]) %>% knitr::kable()
|
||||
table(test_set$label, ransomware_group.prediction$prediction[[2]])
|
||||
|
||||
# These are good plots; fix their display somehow...
|
||||
# Visualize clusters
|
||||
#plot(som_model2, type = 'mapping', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
cm_labels
|
||||
|
||||
# Distance map
|
||||
#plot(som_model2, type = 'quality', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize counts
|
||||
#plot(som_model2, type = 'counts', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize fan diagram
|
||||
#plot(som_model2, type = 'codes', pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 1
|
||||
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,1], main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 2
|
||||
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,2], main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 3
|
||||
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,3], main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 4
|
||||
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,4], main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
|
||||
# Visualize heatmap for variable 5
|
||||
#plot(som_model2, type = 'property', property = som_model2$codes[[1]][,5], main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
|
||||
# cat(" \n")
|
||||
message("Overall accuracy is ", cm_labels$overall["Accuracy"])
|
||||
|
||||
|
||||
# Confusion Matrix
|
||||
cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
|
||||
cm_labels$overall
|
||||
```
|
||||
### Clustering Visualizations: K-means clustering
|
||||
|
||||
```{r clustering-setup, echo=FALSE, include=FALSE}
|
||||
#############################################################################
|
||||
## K-Means Clustering to visualize the categorization of the SOM
|
||||
## For a good tutorial, visit:
|
||||
## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
|
||||
#############################################################################
|
||||
|
||||
# Confusion Matrix
|
||||
cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
|
||||
cm_labels.validation$overall
|
||||
|
||||
# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
|
||||
# Set number of clusters to be equal to number of known ransomware groups
|
||||
n_groups <- length(unique(ransomware$label)) - 1
|
||||
n_groups
|
||||
|
||||
# K-Means Clustering
|
||||
# https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
|
||||
|
||||
# Generate k-means clustering
|
||||
som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
|
||||
|
||||
```
|
||||
|
||||
```{r clustering-plot, echo=FALSE}
|
||||
# Plot clustering results
|
||||
plot(som_model2,
|
||||
main = 'K-Means Clustering',
|
||||
type = "property",
|
||||
|
@ -775,19 +556,7 @@ add.cluster.boundaries(som_model2, som.cluster$cluster)
|
|||
|
||||
```
|
||||
|
||||
### Final Method: Combined Methods 1 and 3
|
||||
|
||||
Using the results from Random Forest, isolate the black addresses first, and then run that subset through an SOM algorithm. Compare final results to original paper. These go in a "results" section. (below)
|
||||
|
||||
```{r combined_methods, echo=FALSE}
|
||||
# Do this here
|
||||
|
||||
# Still need to put it all into one script, and then reproduce the results here....
|
||||
|
||||
|
||||
```
|
||||
|
||||
## Results & Performance
|
||||
## Results & Performance (chunk #4, write up after chunk #3 is done)
|
||||
|
||||
### Results
|
||||
|
||||
|
@ -839,7 +608,7 @@ Kyle Hogan, Jason Hennessey, Andrew Miller, Arvind Narayanan, and Nicolas Christ
|
|||
|
||||
```{r end timer, echo=FALSE}
|
||||
# End timer
|
||||
message("....that's all, folks!")
|
||||
toc()
|
||||
|
||||
|
||||
```
|
Binary file not shown.
Loading…
Reference in New Issue