56 lines
1.5 KiB
R
56 lines
1.5 KiB
R
# Try Random Forest as Method #1, as suggested by original paper.
|
|
|
|
# Install the randomForest package if it is not already available, then
# attach it. (The original comment said "foreach", but the package being
# installed here is randomForest.)
# requireNamespace() only checks availability without attaching, which is
# the recommended way to gate an install; library() then attaches and
# errors loudly if loading still fails, unlike require().
if (!requireNamespace("randomForest", quietly = TRUE)) install.packages("randomForest")

library(randomForest)
|
|
|
|
# Dimension reduction: keep only the two numeric predictors with the
# highest coefficients of variation (neighbors, income) from the
# training sample.
train_num <- train_samp[, c("neighbors", "income")]

# Binary outcome vector for the training sample:
# black = ransomware, white = non-ransomware.
train_bw <- train_samp[["bw"]]
|
|
|
|
# Sample every 100th row of the test set due to memory constraints.
# BUG FIX: the original indexed test_set with seq(1, nrow(train_set), 100).
# Whenever the train and test sets differ in length that either skips the
# tail of the test set or produces all-NA rows (out-of-range indices).
# The sequence must be driven by the test set's own row count.
set.seed(5)

test_samp <- test_set[seq(1, nrow(test_set), 100), ]

# Dimension reduction again: the same two predictors kept for training
test_num <- test_samp %>% select(neighbors, income)

# Binary outcome vector for the test sample (black/white labels)
test_bw <- test_samp$bw
|
|
|
|
# 10-fold cross-validation keeps the resampling cost manageable
control <- trainControl(method = "cv", number = 10)

# Candidate values for mtry (variables tried at each split).
# NOTE(review): train_num holds only two predictors, so values above 2
# presumably get capped by randomForest with a warning — confirm this is
# the intended grid.
grid <- data.frame(mtry = seq(2, 12, by = 2))

# Tune a random forest over the mtry grid with caret
rf_model <- train(train_num, train_bw,
                  method = "rf",
                  trControl = control,
                  tuneGrid = grid)

# Visualize accuracy across the grid, then report the winning mtry
ggplot(rf_model)

rf_model$bestTune
|
|
|
|
# Fit the final random forest using the cross-validated tuning result.
# BUG FIXES:
#  1. The original call trained on train_samp (the full sampled frame,
#     which still contains the outcome and the non-selected columns)
#     instead of the reduced train_num that the tuning step used.
#  2. The tuned value was passed as `minNode`, which is not a
#     randomForest() argument — it was silently swallowed by `...`, so
#     the selected mtry was never applied. The caret-tuned parameter is
#     `mtry` and must be passed as such.
fit_rf <- randomForest(train_num, train_bw,
                       mtry = rf_model$bestTune$mtry)

# Inspect the error-vs-trees curve to confirm the default ntree suffices
plot(fit_rf)

# Measure accuracy against the test sample. Predict on test_num so the
# predictor columns match exactly what the model was trained on.
y_hat_rf <- predict(fit_rf, test_num)

cm <- confusionMatrix(y_hat_rf, test_bw)

# Overall accuracy first, then the full confusion-matrix summary
cm$overall["Accuracy"]

cm
|
|
|
|
# Measure accuracy of the model against the full validation set.
# NOTE(review): the whole validation frame is passed to predict();
# presumably randomForest's predict matches predictor columns by name,
# so this works only if validation contains the training features —
# confirm against how fit_rf was trained.

y_hat_rf <- predict(fit_rf, validation)

# Confusion matrix comparing predictions to the true black/white labels
cm <- confusionMatrix(y_hat_rf, validation$bw)

# Overall accuracy first, then the full per-class summary
cm$overall["Accuracy"]

cm
|
|
|
|
# Tidy up: drop the intermediate modelling objects from the environment
# (same set of names as before; rm() order is irrelevant)
rm(y_hat_rf, cm, fit_rf, grid, control)
|