# Do some graphical exploration before applying any models.
# Look at the example work for some ideas.
# Add any compelling visuals as needed here.
# ?? Cluster graphs go at the end.

# Install matrixStats package if needed
if (!require(matrixStats)) install.packages("matrixStats")

# Load matrixStats library (provides colSds)
library(matrixStats)

## Principal Component Analysis

# Inspect the column names and structure of the full data set
names(ransomware)
str(ransomware)

# Sample every 100th row due to memory constraints
train_samp <- train_set[seq(1, nrow(train_set), 100), ]

# What percentage of sample is ransomware?
mean(train_samp$bw == "black")

# Keep only numeric columns
train_num <- train_samp %>%
  select(year, day, length, weight, count, looped, neighbors, income)

# Center and scale the numeric columns (scale() returns a matrix)
train_scaled <- train_num %>% scale()

# Histograms of each of the columns to show skewness.
# One plot per column; the x axis is labelled with the column name.
invisible(lapply(names(train_num), function(col_name) {
  hist(
    train_num[[col_name]],
    main = paste("Histogram of", col_name),
    xlab = col_name
  )
}))

# Check for variability across numerical columns using coefficients of
# variation (standard deviation divided by mean). This must be true division:
# %/% is integer division and would truncate every ratio to a whole number.
sds <- train_num %>% as.matrix() %>% colSds()
means <- train_num %>% as.matrix() %>% colMeans()
coeff_vars <- sds / means
plot(coeff_vars)
coeff_vars

# View distances between points of a sample to look for patterns
x <- train_scaled %>% as.matrix()
d <- dist(x)
image(as.matrix(d), col = rev(RColorBrewer::brewer.pal(9, "RdBu"))) # Change colors or Orange/Blue

# Principal Component Analysis
pca <- prcomp(train_scaled)
pca
summary(pca)

# Plot the standard deviation of each principal component against its index
pc <- seq_len(ncol(train_scaled))
qplot(pc, pca$sdev)

# Plot the first two PCs with color representing black/white
data.frame(pca$x[, 1:2], bw = train_samp$bw) %>%
  sample_n(200) %>%
  ggplot(aes(PC1, PC2, fill = bw)) +
  geom_point(cex = 3, pch = 21) +
  coord_fixed(ratio = 1)

# First two dimensions do NOT preserve distance very well
#d_approx <- dist(pca$x[, 1:2])
#qplot(d, d_approx) + geom_abline(color="red")

# Clean up environment
rm(pca, x, coeff_vars, d, means, pc, sds)