123 lines
3.2 KiB
R
123 lines
3.2 KiB
R
# Calculate 10th percentile of movieId by rating
|
|
movieId_cutoff <-
|
|
edx %>%
|
|
group_by(movieId) %>%
|
|
summarize(n = n()) %>%
|
|
.$n %>%
|
|
quantile(.10)
|
|
|
|
paste("Movies below the 10th percentile with fewer than",
|
|
as.character(movieId_cutoff),
|
|
"ratings will be ignored.",
|
|
sep=" ")
|
|
|
|
#Calculate 10th percentile of userId by rating
|
|
userId_cutoff <-
|
|
edx %>%
|
|
group_by(userId) %>%
|
|
summarize(n = n()) %>%
|
|
.$n %>%
|
|
quantile(.10)
|
|
|
|
paste("Users below the 10th percentile with fewer than",
|
|
as.character(userId_cutoff),
|
|
"ratings will be ignored.",
|
|
sep=" ")
|
|
|
|
# Remove any movie below the 10th percentile in terms of ratins
|
|
edx2 <-
|
|
edx %>%
|
|
group_by(movieId) %>%
|
|
filter(n()>=movieId_cutoff) %>%
|
|
ungroup()
|
|
|
|
# Remove any user below the 10th percentile in terms of ratings
|
|
edx2 <-
|
|
edx2 %>%
|
|
group_by(userId) %>%
|
|
filter(n()>=userId_cutoff) %>%
|
|
ungroup()
|
|
|
|
# Create partition, reserving 20% of edx set for testing purposes
|
|
test_index <-
|
|
createDataPartition(y = edx2$rating,
|
|
times = 1, p = 0.2, list = FALSE)
|
|
train_set <- edx2[-test_index,]
|
|
test_set <- edx2[test_index,]
|
|
|
|
# Define loss function, and throw away any NA values that result from
|
|
RMSE <-
|
|
function(true_ratings, predicted_ratings){
|
|
sqrt(mean((true_ratings - predicted_ratings)^2, na.rm=TRUE))}
|
|
|
|
# Calcualte the average rating for the training set.
|
|
mu_hat <- mean(train_set$rating)
|
|
|
|
paste("Penalized User and Movie Effect Least Squares approach with a sample mean rating of",
|
|
as.character(mu_hat),
|
|
", optimizing lambda to the nearest integer.",
|
|
sep=" ")
|
|
|
|
# Using Movie and User Effects with Penalization on both, optimize lambda to the nearest integer.
|
|
|
|
lambdas <- seq(1, 10, 1)
|
|
|
|
RMSEs <- sapply(lambdas, function(l){
|
|
b_i <-
|
|
train_set %>%
|
|
group_by(movieId) %>%
|
|
summarize(b_i = sum(rating - mu_hat)/(n()+l))
|
|
|
|
b_u <-
|
|
train_set %>%
|
|
left_join(b_i, by="movieId") %>%
|
|
group_by(userId) %>%
|
|
summarize(b_u = sum(rating - b_i - mu_hat)/(n()+l))
|
|
|
|
predicted_ratings <-
|
|
test_set %>%
|
|
left_join(b_i, by = "movieId") %>%
|
|
left_join(b_u, by = "userId") %>%
|
|
mutate(pred = mu_hat + b_i + b_u) %>%
|
|
pull(pred)
|
|
|
|
return(RMSE(predicted_ratings, test_set$rating))
|
|
|
|
})
|
|
|
|
lambda_best <- lambdas[which.min(RMSEs)]
|
|
|
|
paste("Penalized LSE Model with lambda =",
|
|
as.character(lambda_best),
|
|
"gives an RMSE of",
|
|
as.character(min(RMSEs)),
|
|
"on the test set.",
|
|
sep=" ")
|
|
|
|
# Use this optimized lambda value to test against the validation set.
|
|
paste("Now testing this optimized lambda against the vaildation set.")
|
|
|
|
l <- lambda_best
|
|
b_i <- train_set %>%
|
|
group_by(movieId) %>%
|
|
summarize(b_i = sum(rating - mu_hat)/(n()+l))
|
|
b_u <- train_set %>%
|
|
left_join(b_i, by="movieId") %>%
|
|
group_by(userId) %>%
|
|
summarize(b_u = sum(rating - b_i - mu_hat)/(n()+l))
|
|
final_predicted_ratings <-
|
|
validation %>%
|
|
left_join(b_i, by = "movieId") %>%
|
|
left_join(b_u, by = "userId") %>%
|
|
mutate(pred = mu_hat + b_i + b_u) %>%
|
|
pull(pred)
|
|
final_rmse <-
|
|
RMSE(final_predicted_ratings, validation$rating)
|
|
|
|
paste("Penalized LSE Model with optimized lambda =",
|
|
as.character(lambda_best),
|
|
"gives an RMSE of",
|
|
as.character(final_rmse),
|
|
"on the validation set.",
|
|
sep=" ")
|