# ransomware/scratch/RanFor.R
#
# 56 lines
# 1.5 KiB
# R

# Method #1: Random Forest, as suggested by the original paper.
# Install the randomForest package if it is not already available.
# requireNamespace() only checks availability; library() below does the
# attaching and fails loudly if the install did not succeed.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)
# Dimension reduction: keep only the two predictors used by this model
# (the numeric columns with the highest coefficients of variation).
train_num <- train_samp %>% select(neighbors, income)
# Binary outputs for the train sample: black = ransomware,
# white = non-ransomware
train_bw <- train_samp$bw
# Seed the RNG so the cross-validation and forest fits further down are
# reproducible. (The row sampling below is deterministic and does not use it.)
set.seed(5)
# Keep every 100th row of the test set due to memory constraints.
# The index must be built from test_set's own row count: sizing it from
# train_set either indexes past the end of test_set (NA rows or an error,
# depending on the table class) or silently truncates the sample.
test_samp <- test_set[seq(1, nrow(test_set), by = 100), ]
# Same dimension reduction for the test sample
test_num <- test_samp %>% select(neighbors, income)
# Binary outputs for the test sample
test_bw <- test_samp$bw
# 10-fold cross-validation for tuning
control <- trainControl(method = "cv", number = 10)
# Candidate mtry values (number of predictors tried at each split).
# mtry cannot exceed the number of predictors, so derive the grid from the
# predictor table instead of hard-coding values up to 12: with only
# ncol(train_num) columns, larger candidates are all reset by randomForest to
# the same valid value and the tuning ends up comparing identical models.
grid <- data.frame(mtry = seq_len(ncol(train_num)))
# Train Random Forests model
rf_model <- train(
  train_num, train_bw,
  method = "rf", trControl = control, tuneGrid = grid
)
# Check for best tuning parameters
ggplot(rf_model)
rf_model$bestTune
# Fit the final model on the reduced predictor set.
# - Use train_num, not train_samp: train_samp still contains the bw label
#   column (and every unselected column), which leaks the answer into the
#   predictors.
# - Pass the tuned value as mtry. randomForest() has no minNode argument, so
#   the previous call never applied the tuning result.
# - Clamp to the number of predictors so an out-of-range tuned value cannot
#   trigger randomForest's "invalid mtry" reset.
fit_rf <- randomForest(
  train_num, train_bw,
  mtry = min(rf_model$bestTune$mtry, ncol(train_num))
)
# Check for enough trees
plot(fit_rf)
# Score the fitted forest on the sampled test rows
y_hat_rf <- predict(fit_rf, newdata = test_samp)
cm <- confusionMatrix(data = y_hat_rf, reference = test_bw)
cm$overall["Accuracy"]
cm
# Repeat the scoring on the complete validation set
y_hat_rf <- predict(fit_rf, newdata = validation)
cm <- confusionMatrix(data = y_hat_rf, reference = validation$bw)
cm$overall["Accuracy"]
cm
# Drop the temporary modelling objects from the workspace
rm(list = c("cm", "control", "fit_rf", "grid", "y_hat_rf"))