2021-09-20 05:52:11 +02:00
|
|
|
# Install necessary packages
|
|
|
|
# Install tidyverse only when it is not already available. requireNamespace()
# checks availability without attaching the package, and the HTTPS mirror
# avoids fetching packages over plain HTTP.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse", repos = "https://cloud.r-project.org")
}
|
|
|
|
# Install caret only when it is not already available. requireNamespace()
# checks availability without attaching the package, and the HTTPS mirror
# avoids fetching packages over plain HTTP.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", repos = "https://cloud.r-project.org")
}
|
|
|
|
|
|
|
|
# Load Libraries
|
|
|
|
library(tidyverse)
|
|
|
|
library(caret)
|
|
|
|
|
|
|
|
# Download data
|
|
|
|
# Location of the zipped BitcoinHeist data set on the UCI repository
url <- paste0(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/",
  "00526/data.zip"
)
|
|
|
|
# Local path the archive is saved to (evaluates to "data/data.zip")
dest_file <- file.path("data", "data.zip")
|
|
|
|
# Create the local data directory on first run
if (!dir.exists("data")) {
  dir.create("data")
}
|
|
|
|
# Download the archive only if it is not already cached locally.
# mode = "wb" forces a binary transfer so the zip file is not corrupted by a
# text-mode download on Windows.
if (!file.exists(dest_file)) {
  download.file(url, destfile = dest_file, mode = "wb")
}
|
|
|
|
|
|
|
|
# Unzip
|
|
|
|
# Extract the CSV from the archive unless it has already been extracted
if (!file.exists("data/BitcoinHeistData.csv")) {
  unzip(dest_file, files = "BitcoinHeistData.csv", exdir = "data")
}
|
|
|
|
|
|
|
|
# Import data from CSV
|
|
|
|
# Read the extracted CSV into a tibble
ransomware <- read_csv(file = "data/BitcoinHeistData.csv")
|
|
|
|
|
2021-09-27 11:12:00 +02:00
|
|
|
# Turn labels into factors, grey is a binary factor for ransomware/non-ransomware
|
|
|
|
# label becomes a factor; grey collapses it to two classes:
# "white" stays "white", every other label becomes "black"
ransomware <- ransomware %>%
  mutate(
    label = as.factor(label),
    grey = as.factor(ifelse(label != "white", "black", "white"))
  )
|
|
|
|
|
|
|
|
# Sample every other row (keeping it simple for now)
|
|
|
|
#ransomware <- ransomware[seq(1, nrow(ransomware), 2), ]
|
|
|
|
|
|
|
|
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (grey)
|
|
|
|
#test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)
|
|
|
|
|
|
|
|
# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
|
|
|
|
# Fix the RNG seed so the validation split is reproducible between runs
set.seed(1)
# Row indices for a 50% partition on the categorical label
test_index <- createDataPartition(
  y = ransomware$label,
  times = 1,
  p = 0.5,
  list = FALSE
)
|
|
|
|
|
2021-09-20 05:52:11 +02:00
|
|
|
# Rows not selected by the partition form the working set
workset <- ransomware[-test_index, ]
|
|
|
|
# Rows selected by the partition are held aside for validation
temp <- ransomware[test_index, ]
|
|
|
|
|
2021-09-27 11:12:00 +02:00
|
|
|
# Make sure addresses in validation set are also in working set...
|
|
|
|
# validation <- temp %>%
|
|
|
|
# semi_join(workset, by = "address")
|
|
|
|
|
|
|
|
# ... Or not
|
|
|
|
# Use the held-aside rows unchanged as the validation set
validation <- temp
|
|
|
|
|
|
|
|
# Add rows removed from validation set back into working set...
|
|
|
|
#removed <- anti_join(temp, validation)
|
|
|
|
#workset <- rbind(workset, removed)
|
2021-09-20 05:52:11 +02:00
|
|
|
|
2021-09-27 11:12:00 +02:00
|
|
|
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (grey)
|
|
|
|
#test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)
|
|
|
|
|
|
|
|
# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
|
|
|
|
# Fix the RNG seed so the train/test split is reproducible between runs
set.seed(1)
# Row indices for a 50% partition of the working set on the categorical label
test_index <- createDataPartition(
  y = workset$label,
  times = 1,
  p = 0.5,
  list = FALSE
)
|
2021-09-20 05:52:11 +02:00
|
|
|
|
|
|
|
# Working-set rows not selected by the partition form the training set
train_set <- workset[-test_index, ]
|
2021-09-27 11:12:00 +02:00
|
|
|
# Working-set rows selected by the partition are held aside for testing
temp <- workset[test_index, ]
|
|
|
|
|
|
|
|
# Make sure addresses in test set are also in training set....
|
|
|
|
#test_set <- temp %>%
|
|
|
|
# semi_join(train_set, by = "address")
|
|
|
|
|
|
|
|
# Add rows removed from test set back into training set....
|
|
|
|
#removed <- anti_join(temp, test_set)
|
|
|
|
#train_set <- rbind(train_set, removed)
|
|
|
|
|
|
|
|
# ....Or not
|
|
|
|
# Use the held-aside rows unchanged as the test set
test_set <- temp
|
2021-09-20 05:52:11 +02:00
|
|
|
|
|
|
|
# Clean up environment
|
2021-09-27 11:12:00 +02:00
|
|
|
# Drop intermediate objects that are no longer needed
rm(list = c("dest_file", "url", "temp", "ransomware"))
|
2021-09-20 05:52:11 +02:00
|
|
|
|
|
|
|
# Inspect data frames
|
|
|
|
# Structure of the test set
str(test_set)
|
|
|
|
# First rows of the test set
head(test_set)
|
|
|
|
# Structure of the training set
str(train_set)
|
|
|
|
# First rows of the training set
head(train_set)
|
2021-09-20 06:00:47 +02:00
|
|
|
|
2021-09-27 11:12:00 +02:00
|
|
|
## Data preparation is done, now focusing on Self Organizing Maps as our method
|
|
|
|
|