# Install necessary packages
if(!require(tidyverse)) install.packages("tidyverse")
if(!require(caret)) install.packages("caret")

# Load libraries
library(tidyverse)
library(caret)

# Download data
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
dest_file <- "data/data.zip"
if(!dir.exists("data")) dir.create("data")
if(!file.exists(dest_file)) download.file(url, destfile = dest_file)

# Unzip
if(!file.exists("data/BitcoinHeistData.csv")) unzip(dest_file, "BitcoinHeistData.csv", exdir = "data")

# Import data from CSV
ransomware <- read_csv("data/BitcoinHeistData.csv")

# Turn labels into factors; grey is a binary factor for ransomware/non-ransomware
ransomware <- ransomware %>%
  mutate(label = as.factor(label),
         grey = as.factor(ifelse(label == "white", "white", "black")),
         address = as.factor(address))

# Sample every other row (keeping it simple for now)
#ransomware <- ransomware[seq(1, nrow(ransomware), 2), ]

# Validation set made from 50% of the BitcoinHeist data; reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = ransomware$grey, times = 1, p = .5, list = FALSE)

# Validation set made from 50% of the BitcoinHeist data; reduce later if possible. Categorical outcomes
#test_index <- createDataPartition(y = ransomware$label, times = 1, p = .5, list = FALSE)

workset <- ransomware[-test_index, ]
temp <- ransomware[test_index, ]

# Make sure addresses in the validation set are also in the working set...
#validation <- temp %>%
#  semi_join(workset, by = "address")

# Add rows removed from the validation set back into the working set...
#removed <- anti_join(temp, validation)
#workset <- rbind(workset, removed)

# ...or not
validation <- temp

# Split the working set into a training set and a test set at 50%; reduce later if possible. Binary outcomes (grey)
test_index <- createDataPartition(y = workset$grey, times = 1, p = .5, list = FALSE)

# Split the working set into a training set and a test set at 50%; reduce later if possible. Categorical outcomes
#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)

train_set <- workset[-test_index, ]
temp <- workset[test_index, ]

# Make sure addresses in the test set are also in the training set...
#test_set <- temp %>%
#  semi_join(train_set, by = "address")

# Add rows removed from the test set back into the training set...
#removed <- anti_join(temp, test_set)
#train_set <- rbind(train_set, removed)

# ...or not
test_set <- temp

# Clean up the environment
rm(dest_file, url, temp)

# Inspect the data frames
#test_set %>% str()
#test_set %>% head()
#train_set %>% str()
#train_set %>% head()

## Data preparation is done, now focusing on Self Organizing Maps as our method
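
## Below is a minimal, untuned sketch of what the SOM step could look like,
## using the 'kohonen' package. It assumes the standard BitcoinHeist numeric
## columns (year, day, length, weight, count, looped, neighbors, income);
## the 10x10 grid and rlen = 100 are illustrative starting values, not tuned
## choices from this analysis.
if(!require(kohonen)) install.packages("kohonen")
library(kohonen)

# Select the numeric predictors and scale them; SOMs are distance-based,
# so features need to be on comparable scales
som_features <- train_set %>%
  select(year, day, length, weight, count, looped, neighbors, income) %>%
  as.matrix() %>%
  scale()

# Fit the SOM on a hexagonal grid of nodes
som_grid <- somgrid(xdim = 10, ydim = 10, topo = "hexagonal")
som_model <- som(som_features, grid = som_grid, rlen = 100)

# Quick diagnostic plots: training progress and samples per node
#plot(som_model, type = "changes")
#plot(som_model, type = "counts")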