changed instances of "grey" to "bw". Started visualization section of report. Mars needs better graphs!

This commit is contained in:
shelldweller 2021-10-14 23:55:01 -06:00
parent d89266b7a4
commit 76f6ffb677
7 changed files with 153 additions and 96 deletions

View File

@ -24,14 +24,14 @@ test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_set$grey %>% classvec2classmat()
train_bw <- train_set$bw %>% classvec2classmat()
# Same for test set
test_grey <- test_set$grey %>% classvec2classmat()
test_bw <- test_set$bw %>% classvec2classmat()
# Create Data list for supervised SOM
#
train_list <- list(independent = train_mat, dependent = train_grey)
train_list <- list(independent = train_mat, dependent = train_bw)
# Calculate idea grid size according to:
# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
@ -39,7 +39,7 @@ train_list <- list(independent = train_mat, dependent = train_grey)
# Formulaic method 1
grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
# Based on categorical number, method 2
#grid_size = ceiling(sqrt(length(unique(ransomware$grey))))
#grid_size = ceiling(sqrt(length(unique(ransomware$bw))))
grid_size
# Create SOM grid
@ -49,7 +49,7 @@ train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal
set.seed(5)
## Now build the model.
som_model <- xyf(train_mat, train_grey,
som_model <- xyf(train_mat, train_bw,
grid = train_grid,
rlen = 100,
mode="pbatch", # or: alpha = c(0.05,0.01),
@ -87,15 +87,15 @@ plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=col
# Now test predictions
# https://clarkdatalabs.github.io/soms/SOM_NBA
test_list <- list(independent = test_mat, dependent = test_grey)
test_list <- list(independent = test_mat, dependent = test_bw)
ransomware.prediction <- predict(som_model, newdata = test_list)
table(test_set$grey, ransomware.prediction$prediction[[2]])
table(test_set$bw, ransomware.prediction$prediction[[2]])
# Confusion Matrix
cm_grey <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$grey)
cm_grey$overall["Accuracy"]
cm_grey
cm_bw <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$bw)
cm_bw$overall["Accuracy"]
cm_bw
# Now test predictions of validation set
@ -106,18 +106,18 @@ valid_num <- validation %>% select(length, weight, count, neighbors, income)
valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
"scaled:center"), scale = attr(train_mat, "scaled:scale")))
valid_grey <- validation$grey
valid_bw <- validation$bw
valid_list <- list(independent = valid_mat, dependent = valid_grey)
valid_list <- list(independent = valid_mat, dependent = valid_bw)
# Requires up to 16GB of RAM, skip if resources are limited
ransomware.prediction.validation <- predict(som_model, newdata = valid_list)
table(validation$grey, ransomware.prediction.validation$prediction[[2]])
table(validation$bw, ransomware.prediction.validation$prediction[[2]])
# Confusion Matrix
cm_grey.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$grey)
cm_grey.validation$overall["Accuracy"]
cm_grey.validation
cm_bw.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$bw)
cm_bw.validation$overall["Accuracy"]
cm_bw.validation
# Clean up environment
rm(dest_file, url, temp, grid_size)

View File

@ -23,7 +23,7 @@ temp <- blacks[test_index,]
# ... Or not
validation_blacks <- temp
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
set.seed(5)
test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)

View File

@ -18,17 +18,17 @@ if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.
# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")
# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]

View File

@ -55,18 +55,18 @@ knitr::knit_hooks$set(chunk = function(x, options) {
With the graph defined as such, the following six numerical features$^{[2]}$ are associated with a given address:
1) Income - the total amount of coins sent to an address (decimal value with 8 decimal places)
1) *Income* - the total amount of coins sent to an address (decimal value with 8 decimal places)
2) Neighbors - the number of transactions that have this address as one of its output addresses (integer)
2) *Neighbors* - the number of transactions that have this address as one of its output addresses (integer)
3) Weight - the sum of fraction of coins that reach this address from address that do not have any other inputs within the 24-hour window, which are referred to as "starter transactions" (decimal value)
3) *Weight* - the sum of fraction of coins that reach this address from address that do not have any other inputs within the 24-hour window, which are referred to as "starter transactions" (decimal value)
4) Length - the number of non-starter transactions on its longest chain, where a chain is defined as an
4) *Length* - the number of non-starter transactions on its longest chain, where a chain is defined as an
acyclic directed path originating from any starter transaction and ending at the address in question (integer)
5) Count - The number of starter addresses connected to this address through a chain (integer)
5) *Count* - The number of starter addresses connected to this address through a chain (integer)
6) Loop - The number of starter addresses connected to this address by more than one path (integer)
6) *Loop* - The number of starter addresses connected to this address by more than one path (integer)
These variables are defined rather abstractly, viewing the blockchain as a topological graph with nodes and edges. The rationale behind this approach is to quantify specific transaction patterns. Akcora$^{[3]}$ gives a thorough explanation in the original paper of how and why these features were chosen. We shall treat the features as general numerical variables and will not seek to justify their definitions. Several machine learning methods will be applied to the original data set from the paper by Akcora$^{[3]}$, and the results will be compared.
@ -144,42 +144,23 @@ The original research team downloaded and parsed the entire Bitcoin transaction
---
## Data Analysis (chunk #2)
## Data Analysis
### Hardware
### Hardware Specification
List computer specs here. Laptop, OS, and R versions.
All of the analysis in this report was conducted on a single laptop computer, a Lenovo Yoga S1 from late 2013 with the following specs:
- CPU: Intel i7-4600U @ 3.300GHz (4th Gen quad-core i7)
- RAM: 8217MB DDR3L @ 1600 MHz (8 GB)
- OS: Slackware64-current (15.0 RC1) `x86_64-slackware-linux-gnu` (64-bit GNU/Linux)
- R version 4.0.0 (2020-04-24) -- "Arbor Day" (built from source using scripts from [slackbuilds.org](https://slackbuilds.org/))
- RStudio Version 1.4.1106 "Tiger Daylily" (2389bc24, 2021-02-11) for CentOS 8 (converted using `rpm2tgz`)
### Data Preparation
What did I do to prepare the data? Factoring the labels. Adding the b/w label. Splitting into partitions (twice) to reduce set size. Etc..... (see code).
It is immediately apparent that this is a rather large data set. The usual practice of partitioning out eighty to ninety percent of the data for a training set results in a data set that is too large to process given the hardware available. For reasons that no longer apply, the original data set was first split in half with 50% reserved as "validation set" and the other 50% used as the "working set". This working set was again split in half, to give a "training set" that was of a reasonable size to deal with. At this point the partitions were small enough to work with, so the sample partitions were not further refined. This is a potential area for later optimization. Careful sampling was carried out to ensure that the ransomware groups were represented in each sample.
### Exploration and Visualization
I need better graphs. I have plenty, but I need them to look better and/or have more labels, etc.
Ideas:
1) Show skewness of the non-temporal variables.
2) Show the rarity of the target addresses.
3) Note how sparse some of the groups are.
4) List group counts in a table
5) Check for missing values / NAs.
6) Break into groups somehow. Graph variables per group? Show how the variables are distributed for each ransomware group? Percent ransomware per each day of the week, for example. Is ransomware more prevalent on a particular day of the week? Break other numerical values into bins, and graph percentage per bin. Look for trends and correlations between groups/variables, and display them here.
7) Principle Component Analysis can go here. See "Interlinkages of Malaysian Banking Systems" for an example of detailed PCA. Is it exploratory analysis, or is it a predictive method? I was under the assumption that it is a form of analysis, but the paper mentioned extends it to a form of predictive modeling. How to do this *right* (?!?!)
```{r visuals, echo=FALSE, include=FALSE}
# Do some graphical exploration before applying any models.
# Look at the example work for some ideas.
# Add any compelling visuals as needed here.
```{r dataprep, echo=FALSE, include=FALSE}
# ?? Cluster graphs go at the end.
@ -190,17 +171,17 @@ if(!require(matrixStats)) install.packages("matrixStats")
library(matrixStats)
# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black")))
# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
workset <- ransomware[-test_index,]
validation <- ransomware[test_index,]
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
train_set <- workset[-test_index,]
test_set <- workset[test_index,]
@ -208,47 +189,121 @@ test_set <- workset[test_index,]
# Clean up environment
rm(dest_file, url)
## Principle Component Analysis
names(ransomware)
str(ransomware)
#Sample every nth row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# What percentage of sample is ransomware?
mean(train_samp$grey=="black")
# Keep only numeric columns
train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
# Keep only numeric columns
train_scaled <- train_num %>% scale()
# Find proportion of full data set that is ransomware
ransomprop <- mean(ransomware$bw=="black")
# Check for NAs
no_nas <- sum(is.na(ransomware))
```
### Exploration and Visualization
The ransomware addresses make up less than 2% of the overall data set. This presents a challenge as the target observations are sparse within the data set, especially when we consider that this is then divided into 29 subsets. In fact, some of the ransomware groups have only a single member, making categorization a dubious task. At least there are no missing values to worry about.
```{r data-sparsness, echo=FALSE}
message("The proportion of ransomware addresses in the original data set is ", ransomprop, ".")
message("The total number of NA or missing values in the original data set is ", no_nas, ".")
labels <- ransomware$label %>% summary()
knitr::kable(
list(labels[1:15], labels[16:29]),
caption = 'Ransomware group labels and frequency counts for full data set',
booktabs = TRUE
)
```
Let's take a look at the distribution of the different features. Note how skewed the non-temporal features are, some of them being bimodal:
```{r histograms, echo=FALSE}
# Histograms of each of the columns to show skewness
train_num$year %>% hist(main = paste("Histogram of","year"))
# Plot histograms for each column using facet wrap
train_num$day %>% hist(main = paste("Histogram of","day"))
train_long <- train_num %>% # Apply pivot_longer function
pivot_longer(colnames(train_num)) %>%
as.data.frame()
train_num$length %>% hist(main = paste("Histogram of","length"))
# Histograms per column
ggp1 <- ggplot(train_long, aes(x = value)) + # Draw each column as histogram
geom_histogram(aes(y = ..density..), bins=20) +
geom_density(col = "green", size = .5) +
facet_wrap(~ name, scales = "free")
ggp1
train_num$weight %>% hist(main = paste("Histogram of","weight"))
# Log scale on value axis, does not make sense for temporal variables
ggp2 <- ggplot(train_long, aes(x = value)) + # Draw each column as histogram
geom_histogram(aes(y = ..density..), bins=20) +
geom_density(col = "green", size = .5) +
scale_x_continuous(trans='log2') +
facet_wrap(~ name, scales = "free")
ggp2
train_num$count %>% hist(main = paste("Histogram of","count"))
```
train_num$looped %>% hist(main = paste("Histogram of","looped"))
train_num$neighbors %>% hist(main = paste("Histogram of","neighbors"))
train_num$income %>% hist(main = paste("Histogram of","income"))
Now we can compare the relative spread of each feature by calculating the coefficient of variation for each column. Larger coefficients of variation indicate larger relative spread compared to other columns.
```{r sds, echo=FALSE}
# Check for variability across numerical columns using coefficients of variation
# Calculate standard deviations for each column
sds <- train_num %>% as.matrix() %>% colSds()
# Calculate means for each column
means <- train_num %>% as.matrix() %>% colMeans()
# Calculate CVs for each column
coeff_vars <- sds %/% means
# Summarize results in a table and a plot
knitr::kable(coeff_vars)
plot(coeff_vars)
coeff_vars
```
From this, it appears that *income* has the widest range of variability, followed by *neighbors*. These are also the features that are most strongly skewed to the right, meaning that a few addresses have really high values for each of these features while the bulk of the data set has very low values for these numbers.
Now do the following:
6) Break into groups somehow. Graph variables per group? Show how the variables are distributed for each ransomware group? Percent ransomware per each day of the week, for example. Is ransomware more prevalent on a particular day of the week? Break other numerical values into bins, and graph percentage per bin. Look for trends and correlations between groups/variables, and display them here. MORE OF THIS....
```{r percent per column, echo=FALSE}
# Do this here
shrimp <- train_samp %>% filter(income < 10^8 )
mean(shrimp$bw == "black")
```
7) Principle Component Analysis can go here. See "Interlinkages of Malaysian Banking Systems" for an example of detailed PCA. Is it exploratory analysis, or is it a predictive method? I was under the assumption that it is a form of analysis, but the paper mentioned extends it to a form of predictive modeling. How to do this *right* (?!?!)
```{r pca, echo=FALSE, include=FALSE}
# View distances between points of a sample to look for patterns
# This one seems to be problematic unless I can make the image smaller somehow...
@ -265,9 +320,9 @@ pc <- 1:ncol(train_scaled)
qplot(pc, pca$sdev)
# Plot the first two PCs with color representing black/white
data.frame(pca$x[,1:2], Grey=train_samp$grey) %>%
data.frame(pca$x[,1:2], bw=train_samp$bw) %>%
sample_n(200) %>%
ggplot(aes(PC1,PC2, fill = Grey))+
ggplot(aes(PC1,PC2, fill = bw))+
geom_point(cex=3, pch=21) +
coord_fixed(ratio = 1)
@ -283,6 +338,8 @@ rm(pca, x, coeff_vars, d, means, pc, sds)
### Insights Gained from Exploration
Maybe its better to approach this as a binary problem? At least at first, lets see how far that gets us....
## Modeling approach (Chunk #3, mostly done, just need to clean up a bit)
@ -294,7 +351,7 @@ rm(pca, x, coeff_vars, d, means, pc, sds)
### Method 1: Binary Random Forests
If we ask a simpler question, is this a useful approach? Mentioned to not work well in original paper. Try it using a binary black/white approach. change all instances of "grey" in the code to "bw". show how this simplification leads to (near)-perfect accuracy. Confusion Matrix?
If we ask a simpler question, is this a useful approach? Mentioned to not work well in original paper. Try it using a binary black/white approach. change all instances of "bw" in the code to "bw". show how this simplification leads to (near)-perfect accuracy. Confusion Matrix?
### Method 2: Binary SOMs

Binary file not shown.

View File

@ -8,7 +8,7 @@ library(randomForest)
train_num <- train_samp %>% select(neighbors, income)
# Binary outputs, black=ransomware, white=non-ransomware, train set
train_grey <- train_samp$grey
train_bw <- train_samp$bw
#Sample every nth row due to memory constraints
set.seed(5)
@ -18,21 +18,21 @@ test_samp <- test_set[seq(1, nrow(train_set), 100), ]
test_num <- test_samp %>% select(neighbors, income)
# Same for test set
test_grey <- test_samp$grey
test_bw <- test_samp$bw
# Lower CV numbers
control <- trainControl(method="cv", number = 10)
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
# Train Random Forests model
rf_model <- train(train_num, train_grey, method="rf", trControl = control, tuneGrid=grid)
rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
# Check for best tuning parameters
ggplot(rf_model)
rf_model$bestTune
# Fit model
fit_rf <- randomForest(train_samp, train_grey,
fit_rf <- randomForest(train_samp, train_bw,
minNode = rf_model$bestTune$mtry)
# Check for enough trees
@ -40,14 +40,14 @@ plot(fit_rf)
# Measure accuracy of model against test sample
y_hat_rf <- predict(fit_rf, test_samp)
cm <- confusionMatrix(y_hat_rf, test_grey)
cm <- confusionMatrix(y_hat_rf, test_bw)
cm$overall["Accuracy"]
cm
# Measure accuracy of model against full validation set
y_hat_rf <- predict(fit_rf, validation)
cm <- confusionMatrix(y_hat_rf, validation$grey)
cm <- confusionMatrix(y_hat_rf, validation$bw)
cm$overall["Accuracy"]
cm

View File

@ -19,7 +19,7 @@ str(ransomware)
train_samp <- train_set[seq(1, nrow(train_set), 100), ]
# What percentage of sample is ransomware?
mean(train_samp$grey=="black")
mean(train_samp$bw=="black")
# Keep only numeric columns
train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income)
@ -59,9 +59,9 @@ pc <- 1:ncol(train_scaled)
qplot(pc, pca$sdev)
# Plot the first two PCs with color representing black/white
data.frame(pca$x[,1:2], Grey=train_samp$grey) %>%
data.frame(pca$x[,1:2], bw=train_samp$bw) %>%
sample_n(200) %>%
ggplot(aes(PC1,PC2, fill = Grey))+
ggplot(aes(PC1,PC2, fill = bw))+
geom_point(cex=3, pch=21) +
coord_fixed(ratio = 1)