movielens/movielens-recommendation-sy...

# Calculate 10th percentile of movieId by rating
movieId_cutoff <-
  edx %>%
  group_by(movieId) %>%
  summarize(n = n()) %>%
  .$n %>%
  quantile(.10)

paste("Movies below the 10th percentile with fewer than",
      as.character(movieId_cutoff),
      "ratings will be ignored.",
      sep=" ")

#Calculate 10th percentile of userId by rating
userId_cutoff <-
  edx %>%
  group_by(userId) %>%
  summarize(n = n()) %>%
  .$n %>%
  quantile(.10)

paste("Users below the 10th percentile with fewer than",
      as.character(userId_cutoff),
      "ratings will be ignored.",
      sep=" ")

# Remove any movie below the 10th percentile in terms of ratins
edx2 <-
  edx %>%
  group_by(movieId) %>%
  filter(n()>=movieId_cutoff) %>%
  ungroup()

# Remove any user below the 10th percentile in terms of ratings
edx2 <-
  edx2 %>%
  group_by(userId) %>%
  filter(n()>=userId_cutoff) %>%
  ungroup()

# Create partition, reserving 20% of edx set for testing purposes
test_index <-
  createDataPartition(y = edx2$rating,
                      times = 1, p = 0.2, list = FALSE)
train_set <- edx2[-test_index,]
test_set <- edx2[test_index,]

# Define loss function, and throw away any NA values that result from
RMSE <-
  function(true_ratings, predicted_ratings){
    sqrt(mean((true_ratings - predicted_ratings)^2, na.rm=TRUE))}

# Calcualte the average rating for the training set.
mu_hat <- mean(train_set$rating)

paste("Penalized User and Movie Effect Least Squares approach with a sample mean rating of",
      as.character(mu_hat),
      ", optimizing lambda to the nearest integer.",
      sep=" ")

# Using Movie and User Effects with Penalization on both, optimize lambda to the nearest integer.

lambdas <- seq(1, 10, 1)

RMSEs <- sapply(lambdas, function(l){
  b_i <-
    train_set %>%
    group_by(movieId) %>%
    summarize(b_i = sum(rating - mu_hat)/(n()+l))

  b_u <-
    train_set %>%
    left_join(b_i, by="movieId") %>%
    group_by(userId) %>%
    summarize(b_u = sum(rating - b_i - mu_hat)/(n()+l))

  predicted_ratings <-
    test_set %>%
    left_join(b_i, by = "movieId") %>%
    left_join(b_u, by = "userId") %>%
    mutate(pred = mu_hat + b_i + b_u) %>%
    pull(pred)

  return(RMSE(predicted_ratings, test_set$rating))

})

lambda_best <- lambdas[which.min(RMSEs)]

paste("Penalized LSE Model with lambda =",
      as.character(lambda_best),
      "gives an RMSE of",
      as.character(min(RMSEs)),
      "on the test set.",
      sep=" ")

# Use this optimized lambda value to test against the validation set.
paste("Now testing this optimized lambda against the vaildation set.")

l <- lambda_best
b_i <- train_set %>%
  group_by(movieId) %>%
  summarize(b_i = sum(rating - mu_hat)/(n()+l))
b_u <- train_set %>%
  left_join(b_i, by="movieId") %>%
  group_by(userId) %>%
  summarize(b_u = sum(rating - b_i - mu_hat)/(n()+l))
final_predicted_ratings <-
  validation %>%
  left_join(b_i, by = "movieId") %>%
  left_join(b_u, by = "userId") %>%
  mutate(pred = mu_hat + b_i + b_u) %>%
  pull(pred)
final_rmse <-
  RMSE(final_predicted_ratings, validation$rating)

paste("Penalized LSE Model with optimized lambda =",
      as.character(lambda_best),
      "gives an RMSE of",
      as.character(final_rmse),
      "on the validation set.",
      sep=" ")