# ransomware/scratch/Visuals.R
#
# 101 lines
# 3.2 KiB
# R

# Do some graphical exploration before applying any models.
# Look at the example work for some ideas.
# Add any compelling visuals as needed here.
# ?? Cluster graphs go at the end.
# Install matrixStats package if needed
if(!require(matrixStats)) install.packages("matrixStats")
# Load matrixStats library
library(matrixStats)
## Prepare a sample of the training data for exploration and PCA
# Inspect the column names and structure of the full data set
names(ransomware)
str(ransomware)
# Memory is limited, so keep only every 100th row of the training set
train_samp <- train_set[seq(from = 1, to = nrow(train_set), by = 100), ]
# Proportion of the sampled rows that are ransomware (bw == "black")
mean(train_samp$bw == "black")
# Restrict the sample to its numeric columns
train_num <- select(
  train_samp,
  year, day, length, weight, count, looped, neighbors, income
)
# Center and scale every numeric column
train_scaled <- scale(train_num)
# Histogram of every numeric column to show skewness.
# A single loop replaces eight copy-pasted calls, and each x axis is labelled
# with the column name (piping the bare vector into hist() labelled it ".").
# lapply() with an anonymous function keeps the loop variable out of the
# global environment; invisible() suppresses printing of the returned list.
invisible(lapply(names(train_num), function(col_name) {
  hist(
    train_num[[col_name]],
    main = paste("Histogram of", col_name),
    xlab = col_name
  )
}))
# Check for variability across numerical columns using the coefficient of
# variation (standard deviation divided by mean) of each column
sds <- train_num %>% as.matrix() %>% colSds()
means <- train_num %>% as.matrix() %>% colMeans()
# Use true division: %/% is integer division and would floor every ratio
# (e.g. a coefficient of 0.8 would be reported as 0)
coeff_vars <- sds / means
plot(coeff_vars)
coeff_vars
# Visualize the pairwise distances between sampled points to look for patterns
x <- as.matrix(train_scaled)
d <- dist(x)
# Change colors or Orange/Blue
image(
  as.matrix(d),
  col = rev(RColorBrewer::brewer.pal(9, "RdBu"))
)
# Principal Component Analysis
pca <- prcomp(train_scaled)
pca
summary(pca)
# Index of each principal component; seq_len() stays correct (empty) for
# zero columns, where 1:ncol() would yield c(1, 0)
pc <- seq_len(ncol(train_scaled))
# Plot the standard deviation of each component; built with ggplot() because
# qplot() is deprecated as of ggplot2 3.4.0
data.frame(pc = pc, sdev = pca$sdev) %>%
  ggplot(aes(pc, sdev)) +
  geom_point()
# Plot the first two PCs with color representing black/white
data.frame(pca$x[, 1:2], bw = train_samp$bw) %>%
  sample_n(200) %>%
  ggplot(aes(PC1, PC2, fill = bw)) +
  geom_point(cex = 3, pch = 21) +
  coord_fixed(ratio = 1)
# First two dimensions do NOT preserve distance very well
#d_approx <- dist(pca$x[, 1:2])
#qplot(d, d_approx) + geom_abline(color="red")
########################################################
## Per-column histograms to show skewness,
## drawn in one figure with a facet for each column
########################################################
# Reshape to long format: one row per (column name, value) pair
train_long <- as.data.frame(
  pivot_longer(train_num, colnames(train_num))
)
# Density-scaled histogram of each column with a density curve on top
ggp1 <- ggplot(train_long, aes(x = value)) +
  geom_histogram(aes(y = ..density..), bins = 20) +
  geom_density(col = "green", size = .5) +
  facet_wrap(~ name, scales = "free")
ggp1
# Same figure with a log2-transformed value axis;
# does not make sense for temporal variables
ggp2 <- ggp1 +
  scale_x_continuous(trans = "log2")
ggp2 + theme(axis.text.x = element_text(size = 8))
# Remove the temporary objects created above from the workspace
rm(list = c("pca", "x", "coeff_vars", "d", "means", "pc", "sds"))