56 lines
1.5 KiB
R
56 lines
1.5 KiB
R
# Try Random Forest as Method #1, as suggested by original paper.
|
|
|
|
# Install the randomForest package if it is not already available, then
# attach it. (The original comment said "foreach", but the package being
# installed here is randomForest.)
# requireNamespace() only checks availability without attaching, which is
# the recommended way to gate an install; library() then attaches and
# errors loudly if loading still fails, unlike require().
if (!requireNamespace("randomForest", quietly = TRUE)) install.packages("randomForest")

library(randomForest)
|
|
|
|
# Dimension reduction: keep only the two numeric predictors with the
# highest coefficients of variation (neighbors, income) from the
# training sample.
train_num <- train_samp[, c("neighbors", "income")]

# Binary outcome vector for the training sample:
# black = ransomware, white = non-ransomware.
train_bw <- train_samp[["bw"]]
|
|
|
|
# Sample every 100th row of the test set due to memory constraints.
# BUG FIX: the original indexed test_set with seq(1, nrow(train_set), 100).
# Whenever the train and test sets differ in length that either skips the
# tail of the test set or produces all-NA rows (out-of-range indices).
# The sequence must be driven by the test set's own row count.
set.seed(5)

test_samp <- test_set[seq(1, nrow(test_set), 100), ]

# Dimension reduction again: the same two predictors kept for training
test_num <- test_samp %>% select(neighbors, income)

# Binary outcome vector for the test sample (black/white labels)
test_bw <- test_samp$bw
|
|
|
|
# 10-fold cross-validation keeps the resampling cost manageable
control <- trainControl(method = "cv", number = 10)

# Candidate values for mtry (variables tried at each split).
# NOTE(review): train_num holds only two predictors, so values above 2
# presumably get capped by randomForest with a warning — confirm this is
# the intended grid.
grid <- data.frame(mtry = seq(2, 12, by = 2))

# Tune a random forest over the mtry grid with caret
rf_model <- train(train_num, train_bw,
                  method = "rf",
                  trControl = control,
                  tuneGrid = grid)

# Visualize accuracy across the grid, then report the winning mtry
ggplot(rf_model)

rf_model$bestTune
|
|
|
|
# Fit the final random forest using the cross-validated tuning result.
# BUG FIXES:
#  1. The original call trained on train_samp (the full sampled frame,
#     which still contains the outcome and the non-selected columns)
#     instead of the reduced train_num that the tuning step used.
#  2. The tuned value was passed as `minNode`, which is not a
#     randomForest() argument — it was silently swallowed by `...`, so
#     the selected mtry was never applied. The caret-tuned parameter is
#     `mtry` and must be passed as such.
fit_rf <- randomForest(train_num, train_bw,
                       mtry = rf_model$bestTune$mtry)

# Inspect the error-vs-trees curve to confirm the default ntree suffices
plot(fit_rf)

# Measure accuracy against the test sample. Predict on test_num so the
# predictor columns match exactly what the model was trained on.
y_hat_rf <- predict(fit_rf, test_num)

cm <- confusionMatrix(y_hat_rf, test_bw)

# Overall accuracy first, then the full confusion-matrix summary
cm$overall["Accuracy"]

cm
|
|
|
|
# Measure accuracy of the model against the full validation set.
# NOTE(review): the whole validation frame is passed to predict();
# presumably randomForest's predict matches predictor columns by name,
# so this works only if validation contains the training features —
# confirm against how fit_rf was trained.

y_hat_rf <- predict(fit_rf, validation)

# Confusion matrix comparing predictions to the true black/white labels
cm <- confusionMatrix(y_hat_rf, validation$bw)

# Overall accuracy first, then the full per-class summary
cm$overall["Accuracy"]

cm
|
|
|
|
# Tidy up: drop the intermediate modelling objects from the environment
# (same set of names as before; rm() order is irrelevant)
rm(y_hat_rf, cm, fit_rf, grid, control)
|