# ---- Drop sparsely rated movies and users, then split train/test ----

# Rating count sitting at the 10th percentile of the per-movie rating counts.
movieId_cutoff <- edx %>%
  group_by(movieId) %>%
  summarise(n = n()) %>%
  pull(n) %>%
  quantile(0.10)
paste(
  "Movies below the 10th percentile with fewer than",
  as.character(movieId_cutoff),
  "ratings will be ignored."
)

# Same statistic computed over the per-user rating counts.
userId_cutoff <- edx %>%
  group_by(userId) %>%
  summarise(n = n()) %>%
  pull(n) %>%
  quantile(0.10)
paste(
  "Users below the 10th percentile with fewer than",
  as.character(userId_cutoff),
  "ratings will be ignored."
)

# Keep only movies with at least `movieId_cutoff` ratings and then, counting
# within what is left, only users with at least `userId_cutoff` ratings.
edx2 <- edx %>%
  group_by(movieId) %>%
  filter(n() >= movieId_cutoff) %>%
  group_by(userId) %>%
  filter(n() >= userId_cutoff) %>%
  ungroup()

# Hold out 20% of the filtered data (edx2) as a test set; the rest is used
# for training.
test_index <- createDataPartition(
  y = edx2$rating,
  times = 1,
  p = 0.2,
  list = FALSE
)
test_set <- edx2[test_index, ]
train_set <- edx2[-test_index, ]

# Loss function: root mean squared error between true and predicted ratings.
# Pairs whose squared error is NA (for example, a missing prediction) are
# dropped before averaging.
RMSE <- function(true_ratings, predicted_ratings) {
  squared_error <- (true_ratings - predicted_ratings)^2
  sqrt(mean(squared_error, na.rm = TRUE))
}

# Average rating over the training set.
mu_hat <- mean(train_set$rating)
paste(
  "Penalized User and Movie Effect Least Squares approach with a sample mean rating of",
  as.character(mu_hat),
  ", optimizing lambda to the nearest integer."
)

# Using movie and user effects with penalization on both, optimize lambda to
# the nearest integer.
# ---- Penalized movie + user effects: tune lambda, then score validation ----

# Fit the penalized movie effects (b_i) and user effects (b_u) on `data`.
#
# data:   data frame with columns movieId, userId and rating.
# mu:     overall mean rating subtracted before estimating the effects.
# lambda: penalty added to each group's rating count in the denominator.
#
# Returns a list with two data frames: `b_i` (one row per movieId) and
# `b_u` (one row per userId). User effects are estimated on the residuals
# left after removing `mu` and the movie effect.
fit_penalized_effects <- function(data, mu, lambda) {
  movie_effects <- data %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - mu) / (n() + lambda))

  user_effects <- data %>%
    left_join(movie_effects, by = "movieId") %>%
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - mu) / (n() + lambda))

  list(b_i = movie_effects, b_u = user_effects)
}

# Predict ratings for `newdata` as mu + b_i + b_u.
#
# newdata: data frame with columns movieId and userId.
# effects: list returned by fit_penalized_effects().
# mu:      overall mean rating used when the effects were fitted.
#
# Returns a numeric vector with one prediction per row of `newdata`. A row
# whose movieId or userId has no fitted effect gets NA, because the left
# joins leave b_i / b_u missing for unmatched keys.
predict_penalized <- function(newdata, effects, mu) {
  newdata %>%
    left_join(effects$b_i, by = "movieId") %>%
    left_join(effects$b_u, by = "userId") %>%
    mutate(pred = mu + b_i + b_u) %>%
    pull(pred)
}

# Candidate penalties: the integers 1 through 10.
lambdas <- seq(1, 10, 1)

# Test-set RMSE for each candidate lambda. vapply() guarantees a numeric
# vector with exactly one value per lambda.
RMSEs <- vapply(lambdas, function(lambda) {
  effects <- fit_penalized_effects(train_set, mu_hat, lambda)
  predicted_ratings <- predict_penalized(test_set, effects, mu_hat)
  RMSE(test_set$rating, predicted_ratings)
}, numeric(1))

# which.min() returns integer(0) when no RMSE is finite, which would leave
# lambda_best empty and silently poison everything below; fail loudly instead.
if (!any(is.finite(RMSEs))) {
  stop(
    "No candidate lambda produced a finite RMSE on the test set.",
    call. = FALSE
  )
}

lambda_best <- lambdas[which.min(RMSEs)]
paste(
  "Penalized LSE Model with lambda =",
  as.character(lambda_best),
  "gives an RMSE of",
  as.character(min(RMSEs)),
  "on the test set."
)

# Use this optimized lambda value to test against the validation set.
paste("Now testing this optimized lambda against the vaildation set.")

# Refit on the training set at the selected lambda. `l`, `b_i` and `b_u` are
# kept as top-level objects so code that relied on them still finds them.
l <- lambda_best
final_effects <- fit_penalized_effects(train_set, mu_hat, l)
b_i <- final_effects$b_i
b_u <- final_effects$b_u

final_predicted_ratings <- predict_penalized(validation, final_effects, mu_hat)
final_rmse <- RMSE(validation$rating, final_predicted_ratings)
paste(
  "Penalized LSE Model with optimized lambda =",
  as.character(lambda_best),
  "gives an RMSE of",
  as.character(final_rmse),
  "on the validation set."
)