diff --git a/Detecting_Bitcoin_Ransomware.R b/Detecting_Bitcoin_Ransomware.R
new file mode 100644
index 0000000..f4f81e1
--- /dev/null
+++ b/Detecting_Bitcoin_Ransomware.R
@@ -0,0 +1,270 @@
+###############################################################################
+## Ransomware Detection on the Bitcoin Blockchain
+## using Random Forests and Self Organizing Maps
+##
+## Kaylee Robert Tejeda
+## October 31, 2021
+##
+## Submitted as part of final CYO project for
+## HarvardX PH125.9x Capstone Course
+##
+###############################################################################
+
+# Start timer (stopped by the matching toc() call at the end of the script)
+library(tictoc)
+tic(quiet = FALSE)
+
+# Install necessary packages if not already present
+if(!require(tidyverse)) install.packages("tidyverse")
+if(!require(caret)) install.packages("caret")
+if(!require(randomForest)) install.packages("randomForest")
+if(!require(kohonen)) install.packages("kohonen")
+if(!require(parallel)) install.packages("parallel")
+if(!require(matrixStats)) install.packages("matrixStats")
+
+# Load Libraries
+library(tidyverse)
+library(caret)
+library(randomForest)
+library(kohonen)
+library(parallel)
+library(matrixStats)
+
+# Download data
+url <-
+  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
+dest_file <- "data/data.zip"
+if(!dir.exists("data"))dir.create("data")
+if(!file.exists(dest_file))download.file(url, destfile = dest_file)
+
+# Unzip into CSV
+if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
+                                                   "BitcoinHeistData.csv",
+                                                   exdir="data")
+
+# Import data from CSV
+ransomware <- read_csv("data/BitcoinHeistData.csv")
+
+# Turn labels into factors, "bw" is binary factor for ransomware/non-ransomware
+ransomware <- ransomware %>%
+  mutate(label=as.factor(label),
+         bw=as.factor(ifelse(label=="white", "white", "black")))
+
+# Validation set made from 50% of BitcoinHeist data
+test_index <- createDataPartition(y = ransomware$bw,
+                                  times = 1, p = .5, list = FALSE)
+
+workset <- ransomware[-test_index,]
+validation <- ransomware[test_index,]
+
+# Split the working set into a training set and a test set @ 50%
+test_index <- createDataPartition(y = workset$bw,
+                                  times = 1, p = .5, list = FALSE)
+
+train_set <- workset[-test_index,]
+test_set <- workset[test_index,]
+
+###############################################################################
+## Data preparation is now done
+## Separate into "black" and "white" groups using Random Forests predictions
+###############################################################################
+
+# Keep only numeric columns, ignoring temporal features
+ransomware_num <- ransomware %>%
+  select(length, weight, count, looped, neighbors, income)
+
+# Check for variation across numerical columns using coefficients of variation
+#
+# Calculate standard deviations for each column
+sds <- ransomware_num %>% as.matrix() %>% colSds()
+
+# Calculate means for each column
+means <- ransomware_num %>% as.matrix() %>% colMeans()
+
+# Calculate CVs for each column (coefficient of variation = sd/mean)
+coeff_vars <- sds / means
+
+# Select the two features with the highest coefficients of variation
+selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
+
+message("The features with the highest coefficients of variation are ",
+        selected_features[1], " and ", selected_features[2],
+        ", which will be used to train the binary model.")
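+
+# Aside (illustration only): the coefficient of variation is just sd/mean,
+# a scale-free measure of spread. For example, with a hypothetical toy frame:
+#   toy <- data.frame(a = c(1, 5, 9), b = c(100, 101, 102))
+#   colSds(as.matrix(toy)) / colMeans(toy)   # "a" varies far more than "b"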
+
+# Sample every 100th row due to memory constraints
+train_samp <- train_set[seq(1, nrow(train_set), 100), ]
+
+# Keep only numeric columns with highest coefficients of variation
+train_num <- train_samp %>% select(all_of(selected_features))
+
+# Binary outputs, black = ransomware, white = non-ransomware, train set
+train_bw <- train_samp$bw
+
+# Sample every 100th row due to memory constraints
+test_samp <- test_set[seq(1, nrow(test_set), 100), ]
+
+# Dimension reduction again
+test_num <- test_samp %>% select(all_of(selected_features))
+
+# Same for test set
+test_bw <- test_samp$bw
+
+# Set up 10-fold cross-validation and a tuning grid for mtry
+control <- trainControl(method="cv", number = 10)
+grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
+
+# Train Random Forests model
+rf_model <- train(train_num, train_bw, method="rf",
+                  trControl = control, tuneGrid=grid)
+
+# Fit final model on the selected features, using the best tuned mtry
+fit_rf <- randomForest(train_num, train_bw,
+                       mtry = rf_model$bestTune$mtry)
+
+# Measure accuracy of model against test sample
+y_hat_rf <- predict(fit_rf, test_samp)
+cm <- confusionMatrix(y_hat_rf, test_bw)
+message("Overall accuracy for the binary separation is ",
+        cm$overall["Accuracy"])
+cm
+
+##############################################################################
+## Now we use the Random Forest model to exclude the "white" addresses from
+## the full ransomware set, to categorize the "black" addresses into families.
+message("Now we further categorize black addresses into ransomware families.")
+##############################################################################
+
+# Measure accuracy of model against full ransomware set
+ransomware_y_hat_rf <- predict(fit_rf, ransomware)
+cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
+message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
+cm
+
+# Now use this prediction to reduce the original set to only "black" addresses
+# First append the full set of predictions to the original set.
+ransomware$predictions <- ransomware_y_hat_rf
+
+# Filter out all the predicted "white" addresses,
+# leaving only predicted "black" addresses
+black_addresses <- ransomware %>% filter(predictions=="black")
+
+# Split the reduced black-predictions into a training set and a test set @ 50%
+test_index <- createDataPartition(y = black_addresses$predictions,
+                                  times = 1, p = .5, list = FALSE)
+
+train_set <- black_addresses[-test_index,]
+test_set <- black_addresses[test_index,]
+
+
+# Keep only numeric columns, ignoring temporal variables.
+train_num <- train_set %>%
+  select(length, weight, count, looped, neighbors, income)
+
+# SOM function can only work on matrices
+train_mat <- as.matrix(scale(train_num))
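+
+# Aside (illustration only): scale() records the values it used as
+# attributes of its result, which is how we reuse them on the test set below:
+#   m <- scale(matrix(1:6, ncol = 2))
+#   attr(m, "scaled:center")   # column means used for centering
+#   attr(m, "scaled:scale")    # column standard deviations used for scaling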
+
+# Switching to supervised SOMs
+test_num <- test_set %>%
+  select(length, weight, count, looped, neighbors, income)
+
+# Testing data is scaled according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num,
+                            center = attr(train_mat, "scaled:center"),
+                            scale = attr(train_mat, "scaled:scale")))
+
+# Categorical labels, one-hot encoded for the SOM
+train_label <- train_set$label %>% classvec2classmat()
+
+# Same for test set
+test_label <- test_set$label %>% classvec2classmat()
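+
+# Aside (illustration only): classvec2classmat() turns a factor into an
+# indicator matrix with one column per class, e.g.
+#   classvec2classmat(factor(c("a", "b", "a")))
+#   # gives a 3x2 matrix with rows (1,0), (0,1), (1,0)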
+
+# Create Data list for supervised SOM
+train_list <- list(independent = train_mat, dependent = train_label)
+
+# Calculate ideal grid size according to:
+# https://www.researchgate.net/post/How-many-nodes-for-self-organizing-maps
+
+# Formulaic method 1
+grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
+
+# Based on number of categories, method 2
+#grid_size = ceiling(sqrt(length(unique(ransomware$label))))
+
+grid_size
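+
+# For example, a training set of 10,000 rows would give
+# round(sqrt(5*sqrt(10000))) = round(sqrt(500)) = 22, i.e. a 22x22 grid.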
+
+# Create SOM grid
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
+                      topo="hexagonal", toroidal = TRUE)
+
+## Now build the model.
+som_model2 <- xyf(train_mat, train_label,
+                  grid = train_grid,
+                  rlen = 100,
+                  mode="pbatch",
+                  cores = detectCores(),
+                  keep.data = TRUE
+)
+
+# Now test predictions of test set
+
+test_list <- list(independent = test_mat, dependent = test_label)
+
+ransomware_group.prediction <- predict(som_model2, newdata = test_list)
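+
+# The prediction object holds one element per SOM layer: element 1 is the
+# independent (numeric) layer and element 2 is the dependent (label) layer,
+# i.e. the predicted ransomware family for each test row.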
+table(test_set$label, ransomware_group.prediction$prediction[[2]])
+
+# Confusion Matrix
+cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
+                             test_set$label)
+message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
+cm_labels
+
+
+#############################################################################
+## K-Means Clustering to visualize the categorization of the SOM
+## https://www.polarmicrobes.org/microbial-community-segmentation-with-r/
+#############################################################################
+
+# Set number of clusters equal to the number of known ransomware families
+# (subtracting 1 drops the "white" level from the label count)
+n_groups <- length(unique(ransomware$label)) - 1
+n_groups
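+
+# Note that k-means runs on the SOM's codebook vectors (one per map node,
+# grid_size^2 in total) rather than on the raw addresses, so each node is
+# assigned to one of n_groups clusters for plotting.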
+
+som.cluster <- kmeans(data.frame(som_model2$codes[[1]]), centers=n_groups)
+
+plot(som_model2,
+     main = 'K-Means Clustering',
+     type = "property",
+     property = som.cluster$cluster,
+     palette.name = topo.colors)
+add.cluster.boundaries(som_model2, som.cluster$cluster)
+
+# End timer
+toc()
+
diff --git a/Final_method.R b/Final_method.R
index 0449f59..43b00a8 100644
--- a/Final_method.R
+++ b/Final_method.R
@@ -4,8 +4,10 @@
 ##
 ## Kaylee Robert Tejeda
 ## October 31, 2021
+##
+## Submitted as part of final CYO project for
+## HarvardX PH125.9x Capstone Course
 ##
-## Make this header better!!!!
 #################################################
 
 # Timer command, uncomment following lines to time script
@@ -18,6 +20,7 @@ if(!require(caret)) install.packages("caret")
 if(!require(randomForest)) install.packages("randomForest")
 if(!require(kohonen)) install.packages("kohonen")
 if(!require(parallel)) install.packages("parallel")
+if(!require(matrixStats)) install.packages("matrixStats")
 
 # Load Libraries
 library(tidyverse)
@@ -25,53 +28,80 @@ library(caret)
 library(randomForest)
 library(kohonen)
 library(parallel)
+library(matrixStats)
 
 # Download data
-url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
+url <-
+  "https://archive.ics.uci.edu/ml/machine-learning-databases/00526/data.zip"
 dest_file <- "data/data.zip"
 if(!dir.exists("data"))dir.create("data")
 if(!file.exists(dest_file))download.file(url, destfile = dest_file)
 
 # Unzip into CSV
-if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file, "BitcoinHeistData.csv", exdir="data")
+if(!file.exists("data/BitcoinHeistData.csv"))unzip(dest_file,
+                                                   "BitcoinHeistData.csv",
+                                                   exdir="data")
 
 # Import data from CSV
 ransomware <- read_csv("data/BitcoinHeistData.csv")
 
 # Turn labels into factors, bw is a binary factor for ransomware/non-ransomware
-ransomware <- ransomware %>% mutate(label=as.factor(label), bw=as.factor(ifelse(label=="white", "white", "black")))
+ransomware <- ransomware %>%
+  mutate(label=as.factor(label),
+         bw=as.factor(ifelse(label=="white", "white", "black")))
 
-# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Binary outcomes (bw)
-test_index <- createDataPartition(y = ransomware$bw, times = 1, p = .5, list = FALSE)
+# Validation set made from 50% of BitcoinHeist data
+test_index <- createDataPartition(y = ransomware$bw,
+                                  times = 1, p = .5, list = FALSE)
 
 workset <- ransomware[-test_index,]
 validation <- ransomware[test_index,]
 
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
-test_index <- createDataPartition(y = workset$bw, times = 1, p = .5, list = FALSE)
+# Split the working set into a training set and a test set @ 50%
+test_index <- createDataPartition(y = workset$bw,
+                                  times = 1, p = .5, list = FALSE)
 
 train_set <- workset[-test_index,]
 test_set <- workset[test_index,]
 
-# Separate into Black and White groups using Random Forests
+## Separate into "black" and "white" groups using Random Forests
 
-message("First to separate in to black and white groups.")
+# Keep only numeric columns, ignoring temporal features
+ransomware_num <- ransomware %>%
+  select(length, weight, count, looped, neighbors, income)
 
-#Sample every nth row due to memory constraints
+# Check for variation across numerical columns using coefficients of variation
+
+# Calculate standard deviations for each column
+sds <- ransomware_num %>% as.matrix() %>% colSds()
+
+# Calculate means for each column
+means <- ransomware_num %>% as.matrix() %>% colMeans()
+
+# Calculate CVs for each column (coefficient of variation = sd/mean)
+coeff_vars <- sds / means
+
+# Select the two features with the highest coefficients of variation
+selected_features <- names(sort(coeff_vars, decreasing=TRUE))[1:2]
+
+message("The features with the highest coefficients of variation are ",
+        selected_features[1], " and ", selected_features[2],
+        ", which will be used to train the binary model.")
+
+# Sample every 100th row due to memory constraints
 train_samp <- train_set[seq(1, nrow(train_set), 100), ]
 
-# Keep only numeric columns with highest coefficients of variation for dimension reduction
-train_num <- train_samp %>% select(neighbors, income)
+# Keep only numeric columns with highest coefficients of variation
+train_num <- train_samp %>% select(all_of(selected_features))
 
-# Binary outputs, black=ransomware, white=non-ransomware, train set
+# Binary outputs, black = ransomware, white = non-ransomware, train set
 train_bw <- train_samp$bw
 
 #Sample every nth row due to memory constraints
-set.seed(23)
 test_samp <- test_set[seq(1, nrow(train_set), 100), ]
 
 # Dimension reduction again
-test_num <- test_samp %>% select(neighbors, income)
+test_num <- test_samp %>% select(all_of(selected_features))
 
 # Same for test set
 test_bw <- test_samp$bw
 
@@ -81,7 +111,8 @@ control <- trainControl(method="cv", number = 10)
 grid <- data.frame(mtry = c(2, 4, 6, 8, 10, 12))
 
 # Train Random Forests model
-rf_model <- train(train_num, train_bw, method="rf", trControl = control, tuneGrid=grid)
+rf_model <- train(train_num, train_bw, method="rf",
+                  trControl = control, tuneGrid=grid)
 
 # Fit model
 fit_rf <- randomForest(train_samp, train_bw,
@@ -90,81 +121,53 @@ fit_rf <- randomForest(train_samp, train_bw,
 # Measure accuracy of model against test sample
 y_hat_rf <- predict(fit_rf, test_samp)
 cm <- confusionMatrix(y_hat_rf, test_bw)
-message("Overall accuracy for the test set is ", cm$overall["Accuracy"])
+message("Overall accuracy for the binary separation is ",
+        cm$overall["Accuracy"])
 cm
 
-# Measure accuracy of model against full validation set
-
-y_hat_rf <- predict(fit_rf, validation)
-cm <- confusionMatrix(y_hat_rf, validation$bw)
-message("Overall accuracy for the validation set is ", cm$overall["Accuracy"])
-cm
 
 # From here, trim down set to ONLY the black addresses and apply SOMs...
 
 message("Now we further categorize black address into ransomware families.")
 
-# Try categorical SOMs on black-only addresses....
-#!! This is NOT right, is it?
-#!! It would be even MORE impressive if I removed all the PREDICTED whites from
-#!! the test set instead and started there.
+# Measure accuracy of model against full ransomware set
+ransomware_y_hat_rf <- predict(fit_rf, ransomware)
+cm <- confusionMatrix(ransomware_y_hat_rf, ransomware$bw)
+message("Overall accuracy for the full data set is ", cm$overall["Accuracy"])
+cm
 
-blacks <- ransomware %>% filter(!label=="white")
+# Now use this prediction to reduce the original set to only "black" addresses
 
-# Validation set made from 50% of BitcoinHeist data, reduce later if possible. Categorical outcomes
-set.seed(23)
-test_index <- createDataPartition(y = blacks$label, times = 1, p = .5, list = FALSE)
+ransomware$predictions <- ransomware_y_hat_rf
 
-workset_blacks <- blacks[-test_index,]
-temp <- blacks[test_index,]
+black_addresses <- ransomware %>% filter(predictions=="black")
 
-# Make sure addresses in validation set are also in working set...
-# validation <- temp %>%
-#   semi_join(workset, by = "address")
+# Keep the validation set's black addresses for validating the SOM below
+validation_blacks <- validation %>% filter(label != "white")
 
-# Add rows removed from validation set back into working set...
-#removed <- anti_join(temp, validation)
-#workset <- rbind(workset, removed)
+# Split the reduced black-predictions into a training set and a test set @ 50%
+test_index <- createDataPartition(y = black_addresses$predictions,
+                                  times = 1, p = .5, list = FALSE)
 
-# ... Or not
-validation_blacks <- temp
+train_set <- black_addresses[-test_index,]
+test_set <- black_addresses[test_index,]
 
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Binary outcomes (bw)
-set.seed(5)
-test_index <- createDataPartition(y = workset_blacks$label, times = 1, p = .5, list = FALSE)
-
-# Split the working set into a training set and a test set @ 50%, reduce later if possible. Categorical outcomes
-#test_index <- createDataPartition(y = workset$label, times = 1, p = .5, list = FALSE)
-
-train_set <- workset_blacks[-test_index,]
-temp <- workset_blacks[test_index,]
-
-# Make sure addresses in validation set are also in working set....
-#test_set <- temp %>%
-#  semi_join(train_set, by = "address")
-
-# Add rows removed from validation set back into working set....
-#removed <- anti_join(temp, test_set)
-#train_set <- rbind(train_set, removed)
-
-# ....Or not
-test_set <- temp
-
-##!! Data preparation is done, now focusing on Self Organizing Maps as our method
-##!! Start here after reworking the data prep steps above.
-
-# Keep only numeric columns, ignoring dates and looped for now (insert factor analysis impVar here?)
-train_num <- train_set %>% select(length, weight, count, neighbors, income)
+# Keep only numeric columns, ignoring temporal variables.
+train_num <- train_set %>%
+  select(length, weight, count, looped, neighbors, income)
 
 # SOM function can only work on matrices
 train_mat <- as.matrix(scale(train_num))
 
 # Switching to supervised SOMs
-test_num <- test_set %>% select(length, weight, count, neighbors, income)
+test_num <- test_set %>%
+  select(length, weight, count, looped, neighbors, income)
 
-# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
-test_mat <- as.matrix(scale(test_num, center = attr(train_mat,
-  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+# Testing data is scaled according to how we scaled our training data.
+test_mat <- as.matrix(scale(test_num,
+                            center = attr(train_mat, "scaled:center"),
+                            scale = attr(train_mat, "scaled:scale")))
 
 # Categorical
 train_label <- train_set$label %>% classvec2classmat()
@@ -186,17 +189,15 @@ grid_size <- round(sqrt(5*sqrt(nrow(train_set))))
 grid_size
 
 # Create SOM grid
-train_grid <- somgrid(xdim=grid_size, ydim=grid_size, topo="hexagonal", toroidal = TRUE)
-
-# Set magic seed number
-set.seed(23)
+train_grid <- somgrid(xdim=grid_size, ydim=grid_size,
+                      topo="hexagonal", toroidal = TRUE)
 
 ## Now build the model.
 som_model2 <- xyf(train_mat, train_label,
                   grid = train_grid,
                   rlen = 100,
-                  mode="pbatch", # or: alpha = c(0.05,0.01),
-                  cores = detectCores(), # detectCores() - 1 if system locks during calculation
+                  mode="pbatch",
+                  cores = detectCores(),
                   keep.data = TRUE
 )
 
@@ -209,32 +210,40 @@ ransomware_group.prediction <- predict(som_model2, newdata = test_list)
 #table(test_set$label, ransomware_group.prediction$prediction[[2]])
 
 # Confusion Matrix
-cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]], test_set$label)
+cm_labels <- confusionMatrix(ransomware_group.prediction$prediction[[2]],
+                             test_set$label)
 message("Overall accuracy for the test set is ", cm_labels$overall["Accuracy"])
 #cm_labels
 
 # Now test predictions of validation set
 
 # Switching to supervised SOMs
-valid_num <- validation_blacks %>% select(length, weight, count, neighbors, income)
+valid_num <- validation_blacks %>%
+  select(length, weight, count, looped, neighbors, income)
 
-# Note that when we rescale our testing data we need to scale it according to how we scaled our training data.
-valid_mat <- as.matrix(scale(valid_num, center = attr(train_mat,
-  "scaled:center"), scale = attr(train_mat, "scaled:scale")))
+# Validation data is scaled using center and spread of training data.
+valid_mat <- as.matrix(scale(valid_num,
+                             center = attr(train_mat, "scaled:center"),
+                             scale = attr(train_mat, "scaled:scale")))
 
 valid_label <- validation_blacks$label
 
 valid_list <- list(independent = valid_mat, dependent = valid_label)
 
-ransomware_group.prediction.validation <- predict(som_model2, newdata = valid_list)
-#table(validation_blacks$label, ransomware_group.prediction.validation$prediction[[2]])
+ransomware_group.prediction.validation <- predict(som_model2,
                                                   newdata = valid_list)
+#table(validation_blacks$label,
+#      ransomware_group.prediction.validation$prediction[[2]])
 
 # Confusion Matrix
-cm_labels.validation <- confusionMatrix(ransomware_group.prediction.validation$prediction[[2]], validation_blacks$label)
-message("Overall accuracy for the validation set is ",cm_labels.validation$overall["Accuracy"])
+cm_labels.validation <-
+  confusionMatrix(ransomware_group.prediction.validation$prediction[[2]],
+                  validation_blacks$label)
+message("Overall accuracy for the validation set is ",
+        cm_labels.validation$overall["Accuracy"])
 #cm_labels.validation
 
-# Set number of clusters to be equal to number of known ransomware groups (ignoring the whites)
+# Set number of clusters to be equal to number of known ransomware groups
 n_groups <- length(unique(ransomware$label)) - 1
 n_groups