movielens/movielens-recommendation-sy...

123 lines
3.2 KiB
R

# Calculate 10th percentile of movieId by rating
movieId_cutoff <-
edx %>%
group_by(movieId) %>%
summarize(n = n()) %>%
.$n %>%
quantile(.10)
paste("Movies below the 10th percentile with fewer than",
as.character(movieId_cutoff),
"ratings will be ignored.",
sep=" ")
#Calculate 10th percentile of userId by rating
userId_cutoff <-
edx %>%
group_by(userId) %>%
summarize(n = n()) %>%
.$n %>%
quantile(.10)
paste("Users below the 10th percentile with fewer than",
as.character(userId_cutoff),
"ratings will be ignored.",
sep=" ")
# Remove any movie below the 10th percentile in terms of ratins
edx2 <-
edx %>%
group_by(movieId) %>%
filter(n()>=movieId_cutoff) %>%
ungroup()
# Remove any user below the 10th percentile in terms of ratings
edx2 <-
edx2 %>%
group_by(userId) %>%
filter(n()>=userId_cutoff) %>%
ungroup()
# Create partition, reserving 20% of edx set for testing purposes
test_index <-
createDataPartition(y = edx2$rating,
times = 1, p = 0.2, list = FALSE)
train_set <- edx2[-test_index,]
test_set <- edx2[test_index,]
# Define loss function, and throw away any NA values that result from
RMSE <-
function(true_ratings, predicted_ratings){
sqrt(mean((true_ratings - predicted_ratings)^2, na.rm=TRUE))}
# Calcualte the average rating for the training set.
mu_hat <- mean(train_set$rating)
paste("Penalized User and Movie Effect Least Squares approach with a sample mean rating of",
as.character(mu_hat),
", optimizing lambda to the nearest integer.",
sep=" ")
# Using Movie and User Effects with Penalization on both, optimize lambda to the nearest integer.
lambdas <- seq(1, 10, 1)
RMSEs <- sapply(lambdas, function(l){
b_i <-
train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu_hat)/(n()+l))
b_u <-
train_set %>%
left_join(b_i, by="movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu_hat)/(n()+l))
predicted_ratings <-
test_set %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(pred = mu_hat + b_i + b_u) %>%
pull(pred)
return(RMSE(predicted_ratings, test_set$rating))
})
lambda_best <- lambdas[which.min(RMSEs)]
paste("Penalized LSE Model with lambda =",
as.character(lambda_best),
"gives an RMSE of",
as.character(min(RMSEs)),
"on the test set.",
sep=" ")
# Use this optimized lambda value to test against the validation set.
paste("Now testing this optimized lambda against the vaildation set.")
l <- lambda_best
b_i <- train_set %>%
group_by(movieId) %>%
summarize(b_i = sum(rating - mu_hat)/(n()+l))
b_u <- train_set %>%
left_join(b_i, by="movieId") %>%
group_by(userId) %>%
summarize(b_u = sum(rating - b_i - mu_hat)/(n()+l))
final_predicted_ratings <-
validation %>%
left_join(b_i, by = "movieId") %>%
left_join(b_u, by = "userId") %>%
mutate(pred = mu_hat + b_i + b_u) %>%
pull(pred)
final_rmse <-
RMSE(final_predicted_ratings, validation$rating)
paste("Penalized LSE Model with optimized lambda =",
as.character(lambda_best),
"gives an RMSE of",
as.character(final_rmse),
"on the validation set.",
sep=" ")