Visualizations are finally in place and looking good enough to show others. Now all that is needed is to fill in the text and clean up the flow. One or two more sessions, and then I can send out reading drafts.

This commit is contained in:
shelldweller 2021-11-02 11:34:50 -06:00
parent 310cb800f8
commit 1c4ef5bb15
5 changed files with 289 additions and 23 deletions

View File

@ -303,18 +303,268 @@ Now we can compare the relative spread of each feature by calculating the coeffi
```{r cv-results, echo=FALSE, fig.align="center"}
# Summarize results in a table and a plot
# Summarize results in a table
# (coeff_vars is rendered as a table via knitr::kable, then passed to plot().)
knitr::kable(coeff_vars)
plot(coeff_vars)
# Scatterplot, not very interesting
# NOTE(review): a live plot(coeff_vars) call sits directly above this
# commented-out copy — confirm which of the two is meant to remain.
# plot(coeff_vars)
```
From this, it appears that `r selected_features[1]` has the widest range of variability, followed by `r selected_features[2]`. These are also the features that are most strongly skewed to the right, meaning that a few addresses have really high values for each of these features while the bulk of the data set has very low values for these numbers.
Now do the following (after filling in methods, results, and conclusions, since those are done already):
1) Break into groups somehow. Graph variables per group? Show how the variables are distributed for each ransomware group? Percent ransomware per each day of the week, for example. Is ransomware more prevalent on a particular day of the week? Break other numerical values into bins, and graph percentage per bin. Look for trends and correlations between groups/variables, and display them here. MORE OF THIS....
Taking the feature with the highest variation, `r selected_features[1]`, let us take a look at the distribution for individual ransomware families. Perhaps there is a similarity across families.
```{r variation histograms, echo=FALSE, fig.show="hold", out.width='35%', warning=FALSE}
# Density plots of the feature with the highest variation, one plot per
# ransomware family (i.e. per level of `label`).
selected_feature1 <- selected_features[1]

# Coerce the selected feature column itself to numeric.
# The earlier mutate(selected_feature1 = as.numeric(selected_feature1))
# converted the feature *name* (a string) instead, which added an all-NA
# column called "selected_feature1" and left the real column untouched.
ransomware_big_families <- ransomware
ransomware_big_families[[selected_feature1]] <-
  as.numeric(ransomware_big_families[[selected_feature1]])

family_levels <- levels(ransomware_big_families$label)

# Build the density plot of the selected feature for the i-th family.
#   i: index into levels(ransomware_big_families$label)
# Returns a ggplot object with a log2-transformed x axis, titled with the
# family name. The x aesthetic follows `selected_feature1` rather than a
# hard-coded column, so the plots match the feature named in the text.
plot_family_density <- function(i) {
  ransomware_big_families %>%
    filter(label == family_levels[i]) %>%
    # `..density..` is the legacy spelling of after_stat(density); kept so the
    # chunk also runs on ggplot2 versions that predate after_stat().
    ggplot(aes(x = .data[[selected_feature1]], y = ..density..)) +
    geom_density(col = "green") +
    ggtitle(family_levels[i]) +
    scale_x_continuous(name = selected_feature1, trans = "log2")
}

# Note: Putting these graphs into a for loop breaks some of the formatting,
# so every plot stays a separate top-level call that knitr prints on its own.
# Low membership makes some of the graphs not very informative.
# Relatively boring graphs have been commented out to save time and space.
# These can be uncommented if one wishes.
plot_family_density(1)
# plot_family_density(2)
# plot_family_density(3)
plot_family_density(4)
plot_family_density(5)
plot_family_density(6)
plot_family_density(7)
plot_family_density(8)
# plot_family_density(9)
plot_family_density(10)
plot_family_density(11)
plot_family_density(12)
plot_family_density(13)
plot_family_density(14)
plot_family_density(15)
plot_family_density(16)
# plot_family_density(17)
plot_family_density(18)
# plot_family_density(19)
plot_family_density(20)
# plot_family_density(21)
plot_family_density(22)
plot_family_density(23)
plot_family_density(24)
# plot_family_density(25)
# plot_family_density(26)
plot_family_density(27)
plot_family_density(28)
plot_family_density(29)
```
```{r shrimp-percentage, echo=FALSE, include=FALSE}
@ -681,55 +931,55 @@ cm_labels$byClass %>% knitr::kable()
Here are some graphs, tell a bit more about them.
```{r binary som graphs, echo=FALSE, fig.align="center"}
```{r binary som graphs, echo=FALSE, fig.show="hold", out.width='35%'}
# Diagnostic plots for som_model2: four whole-map views (mapping, quality,
# counts, codes) followed by one 'property' heatmap per variable 1-6.
# Be careful with these, some are really large and take a long time to produce.
# NOTE(review): each live cat(" \n") below is immediately followed by a
# commented-out copy — confirm which of the two is meant to remain.
# Visualize neural network mapping
plot(som_model2, type = 'mapping', pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Distance map
plot(som_model2, type = 'quality', pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Visualize counts
plot(som_model2, type = 'counts', pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Visualize fan diagram
plot(som_model2, type = 'codes', pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Each heatmap below colours the map by column k of the first codes matrix
# (som_model2$codes[[1]][, k]) and is titled with the k-th column name of
# train_num.
# Visualize heatmap for variable 1
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,1],
main=colnames(train_num)[1], pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Visualize heatmap for variable 2
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,2],
main=colnames(train_num)[2], pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Visualize heatmap for variable 3
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,3],
main=colnames(train_num)[3], pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Visualize heatmap for variable 4
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,4],
main=colnames(train_num)[4], pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Visualize heatmap for variable 5
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,5],
main=colnames(train_num)[5], pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
# Visualize heatmap for variable 6
plot(som_model2, type = 'property', property = som_model2$codes[[1]][,6],
main=colnames(train_num)[6], pch = 19, palette.name = topo.colors)
cat(" \n")
#cat(" \n")
```

Binary file not shown.

View File

@ -1,8 +1,24 @@
# Density plot of `income` for every ransomware family (level of `label`).
#
# The superseded `for(i in 1:30){ print( ... ) }` lines that were interleaved
# with this loop have been removed: they left an unclosed `print(` and stray
# `)` / `}` that made the script unparseable.

# i <- 1

# Keep only the families (labels) that occur more than once, and make sure
# income is numeric before it is put on a log2 axis.
ransomware_big_families <- ransomware %>%
  group_by(label) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  mutate(income = as.numeric(income))

# NOTE(review): filter() does not drop unused factor levels, so families
# removed by the n() > 1 filter still appear here and are plotted from zero
# rows — confirm whether droplevels() is wanted.
family_names <- levels(ransomware_big_families$label)

# seq_along() instead of 1:length(): the latter yields c(1, 0) when there are
# no levels and would run the body with invalid indices.
for (i in seq_along(family_names)) {
  ggp <- ransomware_big_families %>%
    filter(label == family_names[i]) %>%
    select(income) %>%
    ggplot(aes(x = income, y = ..density..)) +
    # geom_histogram(bins="30") +
    # geom_bar() +
    geom_density(col = "green", size = .5) +
    ggtitle(family_names[i]) +
    scale_x_continuous(trans = "log2")
  # Explicit print(): ggplot objects are not auto-printed inside a for loop.
  print(ggp)
  # Progress output: index and family name of the plot just drawn.
  print(i)
  print(family_names[i])
}