import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

# Movie Recommendations
Data from https://grouplens.org/datasets/movielens/
I chose the smallest dataset (100k ratings), since this project is about learning PyTorch rather than building the best possible model.
# !wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
# !unzip ml-latest-small.zip
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
movies.head()

|   | movieId | title | genres |
|---|---|---|---|
| 0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
| 1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
| 2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
| 3 | 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
| 4 | 5 | Father of the Bride Part II (1995) | Comedy |
users = ratings.userId.unique()
movie_ids = ratings.movieId.unique()  # new name, so the movies DataFrame above isn't overwritten
# userId runs 1-610 with no gaps; movieId is sparse with large gaps,
# so remap both to contiguous 0-based indices for the embedding layers
movies_map = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}
users_map = {user_id: idx for idx, user_id in enumerate(users)}
ratings["mappedMovie"] = ratings.movieId.map(movies_map)
ratings["userId"] = ratings.userId.map(users_map)
ratings.drop(columns=["movieId"], inplace=True)

X = ratings[["userId", "mappedMovie"]]
y = ratings["rating"]
# stratify by user so every user appears in both the train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=X["userId"])

class MovieRecNet(nn.Module):
    def __init__(self, n_movies, n_users, embed_dim, n_hidden1, n_hidden2):
        super().__init__()
        self.movie_embedding = nn.Embedding(n_movies, embed_dim)
        self.user_embedding = nn.Embedding(n_users, embed_dim)
        self.embedding_dim = embed_dim
        # "deep" path: an MLP over the concatenated user/movie embeddings
        self.deep_stack = nn.Sequential(
            nn.Linear(embed_dim * 2, n_hidden1), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(n_hidden1, n_hidden2), nn.ReLU(), nn.Dropout(0.3)
        )
        # final layer sees both the MLP features and the element-wise interaction
        self.output_layer = nn.Linear(n_hidden2 + embed_dim, 1)

    def forward(self, user_ids, movie_ids):
        user_vector = self.user_embedding(user_ids)
        movie_vector = self.movie_embedding(movie_ids)
        x = torch.cat([user_vector, movie_vector], dim=1)
        # "wide" path: element-wise product of the embeddings (dot-product-style interaction)
        dot = user_vector * movie_vector
        deep_output = self.deep_stack(x)
        last_input = torch.cat([deep_output, dot], dim=1)
        return self.output_layer(last_input)

device = "cuda" if torch.cuda.is_available() else "cpu"
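Before wiring up training, a quick smoke test (my addition; the sizes are arbitrary) confirms the forward pass returns one predicted rating per (user, movie) pair:

```python
# Smoke test with tiny arbitrary sizes: two (user, movie) pairs in -> shape (2, 1) out
tiny = MovieRecNet(n_movies=10, n_users=5, embed_dim=8, n_hidden1=16, n_hidden2=16)
out = tiny(torch.tensor([0, 1]), torch.tensor([2, 3]))
print(out.shape)  # torch.Size([2, 1])
```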
def train(model, optimizer, criterion, train_loader, n_epochs):
    model.train()
    for epoch in range(n_epochs):
        total_loss = 0.0
        for user_batch, movie_batch, y_batch in train_loader:
            user_batch = user_batch.to(device)
            movie_batch = movie_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(user_batch, movie_batch)
            loss = criterion(y_pred.squeeze(), y_batch.squeeze())
            total_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        mean_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch} : Loss {round(mean_loss, 4)}")

from torch.utils.data import TensorDataset, DataLoader
X_user = torch.tensor(X_train["userId"].values, dtype=torch.long)
X_movie = torch.tensor(X_train["mappedMovie"].values, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
train_dataset = TensorDataset(X_user, X_movie, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
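Pulling a single batch (an optional check, not in the original) confirms the loader yields the three tensors train() expects:

```python
# Peek at one batch: user ids, movie ids, and ratings, each of length batch_size
user_b, movie_b, y_b = next(iter(train_loader))
print(user_b.shape, movie_b.shape, y_b.shape)  # each torch.Size([64])
```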
n_movies = len(movie_ids)
n_users = len(users)
model = MovieRecNet(n_movies, n_users, 64, 128, 128).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
crit = nn.MSELoss()
train(model, optimizer, crit, train_loader, 10)

Epoch 0 : Loss 1.1966
Epoch 1 : Loss 1.0257
Epoch 2 : Loss 0.989
Epoch 3 : Loss 0.9669
Epoch 4 : Loss 0.9503
Epoch 5 : Loss 0.9391
Epoch 6 : Loss 0.9274
Epoch 7 : Loss 0.9196
Epoch 8 : Loss 0.9136
Epoch 9 : Loss 0.9054
X_user = torch.tensor(X_test["userId"].values, dtype=torch.long).to(device)
X_movie = torch.tensor(X_test["mappedMovie"].values, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)
model.eval()  # turn off dropout for evaluation
with torch.no_grad():
    pred = model(X_user, X_movie).squeeze()  # (N, 1) -> (N,) so the subtraction doesn't broadcast to (N, N)
torch.sqrt(torch.mean((y_test_tensor - pred)**2))

tensor(1.1405, device='cuda:0')
The model ends with an RMSE of about 1.14 on the test set (versus roughly √0.9054 ≈ 0.95 implied by the final train MSE). Not great if the goal is exact rating prediction, but it is probably still enough to separate movies a user would strongly dislike from ones they would enjoy, for example by surfacing movies with high (>4/5) predicted ratings.
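As a sketch of that idea (my code, not from the original run; the helper name and 4.0 threshold are arbitrary choices), score every movie for one user and keep the highest-predicted ones:

```python
# Sketch: top-N movies a given user is predicted to rate above a threshold
def recommend(model, user_idx, threshold=4.0, top_n=10):
    model.eval()
    movie_idx = torch.arange(n_movies, dtype=torch.long, device=device)
    user_col = torch.full_like(movie_idx, user_idx)  # repeat the user for every movie
    with torch.no_grad():
        scores = model(user_col, movie_idx).squeeze()
    keep = scores > threshold
    order = scores[keep].argsort(descending=True)[:top_n]
    return movie_idx[keep][order].cpu(), scores[keep][order].cpu()

# Map internal indices back to titles via the movies DataFrame loaded earlier
inv_movies_map = {idx: movie_id for movie_id, idx in movies_map.items()}
top_idx, top_scores = recommend(model, user_idx=0)
top_titles = movies.set_index("movieId").loc[[inv_movies_map[i.item()] for i in top_idx], "title"]
```

A fuller version would also filter out movies the user has already rated before recommending.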