diff --git a/Binary_SOM.R b/Binary_SOM.R index f7b0b43..f9ce895 100644 --- a/Binary_SOM.R +++ b/Binary_SOM.R @@ -24,14 +24,14 @@ test_mat <- as.matrix(scale(test_num, center = attr(train_mat, "scaled:center"), scale = attr(train_mat, "scaled:scale"))) # Binary outputs, black=ransomware, white=non-ransomware, train set -train_grey <- train_set$grey %>% classvec2classmat() +train_bw <- train_set$bw %>% classvec2classmat() # Same for test set -test_grey <- test_set$grey %>% classvec2classmat() +test_bw <- test_set$bw %>% classvec2classmat() # Create Data list for supervised SOM # -train_list <- list(independent = train_mat, dependent = train_grey) +train_list <- list(independent = train_mat, dependent = train_bw) # Calculate idea grid size according to: # https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps @@ -39,7 +39,7 @@ train_list <- list(independent = train_mat, dependent = train_grey) # Formulaic method 1 grid_size <- round(sqrt(5*sqrt(nrow(train_set)))) # Based on categorical number, method 2 -#grid_size = ceiling(sqrt(length(unique(ransomware$grey)))) +#grid_size = ceiling(sqrt(length(unique(ransomware$bw)))) grid_size # Create SOM grid @@ -49,7 +49,7 @@ train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal set.seed(5) ## Now build the model. -som_model <- xyf(train_mat, train_grey, +som_model <- xyf(train_mat, train_bw, grid = train_grid, rlen = 100, mode="pbatch", # or: alpha = c(0.05,0.01), @@ -87,15 +87,15 @@ plot(som_model, type = 'property', property = som_model$codes[[1]][,5], main=col # Now test predictions # https://clarkdatalabs.github.io/soms/SOM_NBA -test_list <- list(independent = test_mat, dependent = test_grey) +test_list <- list(independent = test_mat, dependent = test_bw) ransomware.prediction <- predict(som_model, newdata = test_list) -table(test_set$grey, ransomware.prediction$prediction[[2]]) +table(test_set$bw, ransomware.prediction$prediction[[2]]) # Confusion Matrix -cm_grey <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$grey) -cm_grey$overall["Accuracy"] -cm_grey +cm_bw <- confusionMatrix(ransomware.prediction$prediction[[2]], test_set$bw) +cm_bw$overall["Accuracy"] +cm_bw # Now test predictions of validation set @@ -106,18 +106,18 @@ valid_num <- validation %>% select(length, weight, count, neighbors, income) valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat, "scaled:center"), scale = attr(train_mat, "scaled:scale"))) -valid_grey <- validation$grey +valid_bw <- validation$bw -valid_list <- list(independent = valid_mat, dependent = valid_grey) +valid_list <- list(independent = valid_mat, dependent = valid_bw) # Requires up to 16GB of RAM, skip if resources are limited ransomware.prediction.validation <- predict(som_model, newdata = valid_list) -table(validation$grey, ransomware.prediction.validation$prediction[[2]]) +table(validation$bw, ransomware.prediction.validation$prediction[[2]]) # Confusion Matrix -cm_grey.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$grey) -cm_grey.validation$overall["Accuracy"] -cm_grey.validation +cm_bw.validation <- confusionMatrix(ransomware.prediction.validation$prediction[[2]], validation$bw) +cm_bw.validation$overall["Accuracy"] +cm_bw.validation # Clean up environment rm(dest_file, url, temp, grid_size) \ No newline at end of file diff --git a/Categorical_SOM.R b/Categorical_SOM.R index af69a3b..58ee21a 100644 --- a/Categorical_SOM.R +++ b/Categorical_SOM.R @@ -23,7 +23,7 @@ temp <- blacks[test_index,] # ... Or not validation_blacks <- temp -# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey) +# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw) set.seed(5) test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE) diff --git a/Data_Prep.R b/Data_Prep.R index 1f8fe8c..326b905 100644 --- a/Data_Prep.R +++ b/Data_Prep.R @@ -18,17 +18,17 @@ if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData. # Import data from CSV ransomware <- read_csv("data/BitcoinHeistData.csv") -# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware -ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black"))) +# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware +ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black"))) -# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey) -test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE) +# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw) +test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE) workset <- ransomware[-test_index,] validation <- ransomware[test_index,] -# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey) -test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE) +# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw) +test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE) train_set <- workset[-test_index,] test_set <- workset[test_index,] diff --git a/Detecting_Bitcoin_Ransomware.Rmd b/Detecting_Bitcoin_Ransomware.Rmd index ddae1d9..a81625d 100644 --- a/Detecting_Bitcoin_Ransomware.Rmd +++ b/Detecting_Bitcoin_Ransomware.Rmd @@ -55,18 +55,18 @@ knitr::knit_hooks$set(chunk = function(x, options) { With the graph defined as such, the following six numerical features$^{[2]}$ are associated with a given address: - 1) Income - the total amount of coins sent to an address (decimal value with 8 decimal places) + 1) *Income* - the total amount of coins sent to an address (decimal value with 8 decimal places) - 2) Neighbors - the number of transactions that have this address as one of its output addresses (integer) + 2) *Neighbors* - the number of transactions that have this address as one of its output addresses (integer) - 3) Weight - the sum of fraction of coins that reach this address from address that do not have any other inputs within the 24-hour window, which are referred to as "starter transactions" (decimal value) + 3) *Weight* - the sum of fraction of coins that reach this address from address that do not have any other inputs within the 24-hour window, which are referred to as "starter transactions" (decimal value) - 4) Length - the number of non-starter transactions on its longest chain, where a chain is defined as an + 4) *Length* - the number of non-starter transactions on its longest chain, where a chain is defined as an acyclic directed path originating from any starter transaction and ending at the address in question (integer) - 5) Count - The number of starter addresses connected to this address through a chain (integer) + 5) *Count* - The number of starter addresses connected to this address through a chain (integer) - 6) Loop - The number of starter addresses connected to this address by more than one path (integer) + 6) *Loop* - The number of starter addresses connected to this address by more than one path (integer) These variables are defined rather abstractly, viewing the blockchain as a topological graph with nodes and edges. The rationale behind this approach is to quantify specific transaction patterns. Akcora$^{[3]}$ gives a thorough explanation in the original paper of how and why these features were chosen. We shall treat the features as general numerical variables and will not seek to justify their definitions. Several machine learning methods will be applied to the original data set from the paper by Akcora$^{[3]}$, and the results will be compared. @@ -144,42 +144,23 @@ The original research team downloaded and parsed the entire Bitcoin transaction --- -## Data Analysis (chunk #2) +## Data Analysis -### Hardware +### Hardware Specification -List computer specs here. Laptop, OS, and R versions. + All of the analysis in this report was conducted on a single laptop computer, a Lenovo Yoga S1 from late 2013 with the following specs: + + - CPU: Intel i7-4600U @ 3.300GHz (4th Gen quad-core i7) + - RAM: 8217MB DDR3L @ 1600 MHz (8 GB) + - OS: Slackware64-current (15.0 RC1) `x86_64-slackware-linux-gnu` (64-bit GNU/Linux) + - R version 4.0.0 (2020-04-24) -- "Arbor Day" (built from source using scripts from [slackbuilds.org](https://slackbuilds.org/)) + - RStudio Version 1.4.1106 "Tiger Daylily" (2389bc24, 2021-02-11) for CentOS 8 (converted using `rpm2tgz`) ### Data Preparation - What did I do to prepare the data? Factoring the labels. Adding the b/w label. Splitting into partitions (twice) to reduce set size. Etc..... (see code). + It is immediately apparent that this is a rather large data set. The usual practice of partitioning out eighty to ninety percent of the data for a training set results in a data set that is too large to process given the hardware available. For reasons that no longer apply, the original data set was first split in half with 50% reserved as "validation set" and the other 50% used as the "working set". This working set was again split in half, to give a "training set" that was of a reasonable size to deal with. At this point the partitions were small enough to work with, so the sample partitions were not further refined. This is a potential area for later optimization. Careful sampling was carried out to ensure that the ransomware groups were represented in each sample. -### Exploration and Visualization - - I need better graphs. I have plenty, but I need them to look better and/or have more labels, etc. - - Ideas: - - 1) Show skewness of the non-temporal variables. - - 2) Show the rarity of the target addresses. - - 3) Note how sparse some of the groups are. - - 4) List group counts in a table - - 5) Check for missing values / NAs. - - 6) Break into groups somehow. Graph variables per group? Show how the variables are distributed for each ransomware group? Percent ransomware per each day of the week, for example. Is ransomware more prevalent on a particular day of the week? Break other numerical values into bins, and graph percentage per bin. Look for trends and correlations between groups/variables, and display them here. - - - 7) Principle Component Analysis can go here. See "Interlinkages of Malaysian Banking Systems" for an example of detailed PCA. Is it exploratory analysis, or is it a predictive method? I was under the assumption that it is a form of analysis, but the paper mentioned extends it to a form of predictive modeling. How to do this *right* (?!?!) - - -```{r visuals, echo=FALSE, include=FALSE} -# Do some graphical exploration before applying any models. -# Look at the example work for some ideas. -# Add any compelling visuals as needed here. +```{r dataprep, echo=FALSE, include=FALSE} # ?? Cluster graphs go at the end. @@ -190,17 +171,17 @@ if(!require(matrixStats)) install.packages("matrixStats") library(matrixStats) -# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware -ransomware <- ransomware %>% mutate(label=as.factor(label), grey=as.factor(ifelse(label=="white", "white", "black"))) +# Turn labels into factors, bw is a binary factor for ransomware/non-ransomware +ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black"))) -# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey) -test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE) +# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw) +test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE) workset <- ransomware[-test_index,] validation <- ransomware[test_index,] -# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey) -test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE) +# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw) +test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE) train_set <- workset[-test_index,] test_set <- workset[test_index,] @@ -208,47 +189,121 @@ test_set <- workset[test_index,] # Clean up environment rm(dest_file, url) -## Principle Component Analysis - -names(ransomware) -str(ransomware) - #Sample every nth row due to memory constraints train_samp <- train_set[seq(1, nrow(train_set), 100), ] -# What percentage of sample is ransomware? -mean(train_samp$grey=="black") - # Keep only numeric columns train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income) # Keep only numeric columns train_scaled <- train_num %>% scale() +# Find proportion of full data set that is ransomware + +ransomprop <- mean(ransomware$bw=="black") + +# Check for NAs + +no_nas <- sum(is.na(ransomware)) + + +``` + + +### Exploration and Visualization + + The ransomware addresses make up less than 2% of the overall data set. This presents a challenge as the target observations are sparse within the data set, especially when we consider that this is then divided into 29 subsets. In fact, some of the ransomware groups have only a single member, making categorization a dubious task. At least there are no missing values to worry about. + +```{r data-sparsness, echo=FALSE} + +message("The proportion of ransomware addresses in the original data set is ", ransomprop, ".") + +message("The total number of NA or missing values in the original data set is ", no_nas, ".") + +labels <- ransomware$label %>% summary() + +knitr::kable( + list(labels[1:15], labels[16:29]), + caption = 'Ransomware group labels and frequency counts for full data set', + booktabs = TRUE + +) + + +``` + +Let's take a look at the distribution of the different features. Note how skewed the non-temporal features are, some of them being bimodal: + +```{r histograms, echo=FALSE} # Histograms of each of the columns to show skewness -train_num$year %>% hist(main = paste("Histogram of","year")) +# Plot histograms for each column using facet wrap -train_num$day %>% hist(main = paste("Histogram of","day")) +train_long <- train_num %>% # Apply pivot_longer function + pivot_longer(colnames(train_num)) %>% + as.data.frame() -train_num$length %>% hist(main = paste("Histogram of","length")) +# Histograms per column +ggp1 <- ggplot(train_long, aes(x = value)) + # Draw each column as histogram + geom_histogram(aes(y = ..density..), bins=20) + + geom_density(col = "green", size = .5) + + facet_wrap(~ name, scales = "free") +ggp1 -train_num$weight %>% hist(main = paste("Histogram of","weight")) +# Log scale on value axis, does not make sense for temporal variables +ggp2 <- ggplot(train_long, aes(x = value)) + # Draw each column as histogram + geom_histogram(aes(y = ..density..), bins=20) + + geom_density(col = "green", size = .5) + + scale_x_continuous(trans='log2') + + facet_wrap(~ name, scales = "free") +ggp2 -train_num$count %>% hist(main = paste("Histogram of","count")) +``` -train_num$looped %>% hist(main = paste("Histogram of","looped")) - -train_num$neighbors %>% hist(main = paste("Histogram of","neighbors")) - -train_num$income %>% hist(main = paste("Histogram of","income")) +Now we can compare the relative spread of each feature by calculating the coefficient of variation for each column. Larger coefficients of variation indicate larger relative spread compared to other columns. +```{r sds, echo=FALSE} # Check for variability across numerical columns using coefficients of variation + +# Calculate standard deviations for each column sds <- train_num %>% as.matrix() %>% colSds() + +# Calculate means for each column means <- train_num %>% as.matrix() %>% colMeans() + +# Calculate CVs for each column coeff_vars <- sds %/% means + +# Summarize results in a table and a plot +knitr::kable(coeff_vars) + plot(coeff_vars) -coeff_vars + +``` + +From this, it appears that *income* has the widest range of variability, followed by *neighbors*. These are also the features that are most strongly skewed to the right, meaning that a few addresses have really high values for each of these features while the bulk of the data set has very low values for these numbers. + + +Now do the following: + + + 6) Break into groups somehow. Graph variables per group? Show how the variables are distributed for each ransomware group? Percent ransomware per each day of the week, for example. Is ransomware more prevalent on a particular day of the week? Break other numerical values into bins, and graph percentage per bin. Look for trends and correlations between groups/variables, and display them here. MORE OF THIS.... + + +```{r percent per column, echo=FALSE} +# Do this here + +shrimp <- train_samp %>% filter(income < 10^8 ) + +mean(shrimp$bw == "black") + +``` + + + 7) Principle Component Analysis can go here. See "Interlinkages of Malaysian Banking Systems" for an example of detailed PCA. Is it exploratory analysis, or is it a predictive method? I was under the assumption that it is a form of analysis, but the paper mentioned extends it to a form of predictive modeling. How to do this *right* (?!?!) + + +```{r pca, echo=FALSE, include=FALSE} # View distances between points of a sample to look for patterns # This one seems to be problematic unless I can make the image smaller somehow... @@ -265,9 +320,9 @@ pc <- 1:ncol(train_scaled) qplot(pc, pca$sdev) # Plot the first two PCs with color representing black/white -data.frame(pca$x[,1:2], Grey=train_samp$grey) %>% +data.frame(pca$x[,1:2], bw=train_samp$bw) %>% sample_n(200) %>% - ggplot(aes(PC1,PC2, fill = Grey))+ + ggplot(aes(PC1,PC2, fill = bw))+ geom_point(cex=3, pch=21) + coord_fixed(ratio = 1) @@ -283,6 +338,8 @@ rm(pca, x, coeff_vars, d, means, pc, sds) ### Insights Gained from Exploration Maybe its better to approach this as a binary problem? At least at first, lets see how far that gets us.... + + ## Modeling approach (Chunk #3, mostly done, just need to clean up a bit) @@ -294,7 +351,7 @@ rm(pca, x, coeff_vars, d, means, pc, sds) ### Method 1: Binary Random Forests -If we ask a simpler question, is this a useful approach? Mentioned to not work well in original paper. Try it using a binary black/white approach. change all instances of "grey" in the code to "bw". show how this simplification leads to (near)-perfect accuracy. Confusion Matrix? +If we ask a simpler question, is this a useful approach? Mentioned to not work well in original paper. Try it using a binary black/white approach. change all instances of "bw" in the code to "bw". show how this simplification leads to (near)-perfect accuracy. Confusion Matrix? ### Method 2: Binary SOMs diff --git a/Detecting_Bitcoin_Ransomware.pdf b/Detecting_Bitcoin_Ransomware.pdf index e883f86..0abeb77 100644 Binary files a/Detecting_Bitcoin_Ransomware.pdf and b/Detecting_Bitcoin_Ransomware.pdf differ diff --git a/RanFor.R b/RanFor.R index 7d1bd31..31ca6df 100644 --- a/RanFor.R +++ b/RanFor.R @@ -8,7 +8,7 @@ library(randomForest) train_num <- train_samp %>% select(neighbors, income) # Binary outputs, black=ransomware, white=non-ransomware, train set -train_grey <- train_samp$grey +train_bw <- train_samp$bw #Sample every nth row due to memory constraints set.seed(5) @@ -18,21 +18,21 @@ test_samp <- test_set[seq(1, nrow(train_set), 100), ] test_num <- test_samp %>% select(neighbors, income) # Same for test set -test_grey <- test_samp$grey +test_bw <- test_samp$bw # Lower CV numbers control <- trainControl(method="cv", number = 10) grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12)) # Train Random Forests model -rf_model <- train(train_num, train_grey, method="rf", trControl = control, tuneGrid=grid) +rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid) # Check for best tuning parameters ggplot(rf_model) rf_model$bestTune # Fit model -fit_rf <- randomForest(train_samp, train_grey, +fit_rf <- randomForest(train_samp, train_bw, minNode = rf_model$bestTune$mtry) # Check for enough trees @@ -40,14 +40,14 @@ plot(fit_rf) # Measure accuracy of model against test sample y_hat_rf <- predict(fit_rf, test_samp) -cm <- confusionMatrix(y_hat_rf, test_grey) +cm <- confusionMatrix(y_hat_rf, test_bw) cm$overall["Accuracy"] cm # Measure accuracy of model against full validation set y_hat_rf <- predict(fit_rf, validation) -cm <- confusionMatrix(y_hat_rf, validation$grey) +cm <- confusionMatrix(y_hat_rf, validation$bw) cm$overall["Accuracy"] cm diff --git a/Visuals.R b/Visuals.R index 28f7d6e..4738e70 100644 --- a/Visuals.R +++ b/Visuals.R @@ -19,7 +19,7 @@ str(ransomware) train_samp <- train_set[seq(1, nrow(train_set), 100), ] # What percentage of sample is ransomware? -mean(train_samp$grey=="black") +mean(train_samp$bw=="black") # Keep only numeric columns train_num <- train_samp %>% select(year, day, length, weight, count, looped, neighbors, income) @@ -59,9 +59,9 @@ pc <- 1:ncol(train_scaled) qplot(pc, pca$sdev) # Plot the first two PCs with color representing black/white -data.frame(pca$x[,1:2], Grey=train_samp$grey) %>% +data.frame(pca$x[,1:2], bw=train_samp$bw) %>% sample_n(200) %>% - ggplot(aes(PC1,PC2, fill = Grey))+ + ggplot(aes(PC1,PC2, fill = bw))+ geom_point(cex=3, pch=21) + coord_fixed(ratio = 1)