# Try Random Forest as Method #1, as suggested by original paper.

# Install randomForest package if needed
# (NOTE: original comment said "foreach" but the package installed is randomForest)
if (!require(randomForest)) install.packages("randomForest")
library(randomForest)

# Keep only numeric columns with highest coefficients of variation
# for dimension reduction
train_num <- train_samp %>% select(neighbors, income)

# Binary outputs, black = ransomware, white = non-ransomware, train set
train_bw <- train_samp$bw

# Sample every nth row due to memory constraints.
# BUG FIX: the sequence must be bounded by nrow(test_set), not nrow(train_set),
# since it is the test set being subsampled here.
set.seed(5)
test_samp <- test_set[seq(1, nrow(test_set), 100), ]

# Dimension reduction again
test_num <- test_samp %>% select(neighbors, income)

# Same for test set
test_bw <- test_samp$bw

# 10-fold cross-validation, tuning mtry over a small grid
control <- trainControl(method = "cv", number = 10)
grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))

# Train Random Forests model (caret must be loaded earlier in the
# script for train()/trainControl()/confusionMatrix())
rf_model <- train(train_num, train_bw, method = "rf",
                  trControl = control, tuneGrid = grid)

# Check for best tuning parameters
ggplot(rf_model)
rf_model$bestTune

# Fit final model.
# BUG FIX: randomForest() has no `minNode` argument — the original value was
# silently swallowed by `...` and ignored. The tuned value belongs in `mtry`.
# Also fit on train_num (the same reduced predictors used for tuning) rather
# than train_samp, which contains the outcome column `bw` (data leakage).
fit_rf <- randomForest(train_num, train_bw, mtry = rf_model$bestTune$mtry)

# Check for enough trees
plot(fit_rf)

# Measure accuracy of model against test sample
# (predict with the same reduced columns the model was fit on)
y_hat_rf <- predict(fit_rf, test_num)
cm <- confusionMatrix(y_hat_rf, test_bw)
cm$overall["Accuracy"]
cm

# Measure accuracy of model against full validation set,
# reduced to the same predictor columns
y_hat_rf <- predict(fit_rf, validation %>% select(neighbors, income))
cm <- confusionMatrix(y_hat_rf, validation$bw)
cm$overall["Accuracy"]
cm

# Clean up environment
rm(cm, control, fit_rf, grid, y_hat_rf)