Matrix Factorization impl 2
2022. 3. 25. 11:34ㆍAI/Big data
- 목차
반응형
data loading
import numpy as np
import pandas as pd
# Column names for the header-less, tab-separated ratings file.
# NOTE(review): 'u.data' is presumably the MovieLens 100K ratings file - confirm.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = (
    pd.read_csv('u.data', names=r_cols, sep='\t', encoding='latin-1')
    # Keep only the three columns used below; 'timestamp' is dropped.
    .loc[:, ['user_id', 'movie_id', 'rating']]
    .astype(int)
)
splitting data into train/test
from sklearn.utils import shuffle
# Fraction of the (shuffled) rows that goes into the training split.
TRAIN_SIZE = 0.75

# Shuffle with a fixed random_state so the same split is produced on every run.
ratings = shuffle(ratings, random_state=1)

# Rows before `cutoff` are the training split, the rest the test split.
cutoff = int(TRAIN_SIZE * len(ratings))
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]
처음 75%의 데이터는 training 용으로 사용하고,
나머지 25%의 데이터는 test 용으로 사용한다.
define Matrix Factorization class
값 초기화
class NEW_MF():
    """Matrix-factorization rating model that keeps id <-> index lookups."""

    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix, the id/index lookup tables and settings.

        Args:
            ratings: pandas DataFrame whose index holds user ids and whose
                columns hold item ids (a user x item rating table).
            K: number of latent dimensions.
            alpha: step size stored for the SGD updates.
            beta: regularization weight stored for the SGD updates.
            iterations: number of training passes to run.
            verbose: whether progress should be printed during training.
        """
        # Dense copy of the table; rows follow ratings.index and columns
        # follow ratings.columns, which is what the lookups below encode.
        self.R = np.array(ratings)

        # Item id <-> column position.
        self.item_id_index = {
            item_id: idx for idx, item_id in enumerate(ratings.columns)
        }
        self.index_item_id = {
            idx: item_id for idx, item_id in enumerate(ratings.columns)
        }

        # User id <-> row position. Read straight from the index instead of
        # transposing the whole frame just to iterate over its labels.
        self.user_id_index = {
            user_id: idx for idx, user_id in enumerate(ratings.index)
        }
        self.index_user_id = {
            idx: user_id for idx, user_id in enumerate(ratings.index)
        }

        self.num_users, self.num_items = np.shape(self.R)
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose
- user id to index와 index to user id를 만든다.
- movie id to index와 index to movie id역시 만든다.
- hyper parameter들인 K, alpha, beta, iteration 등을 저장한다.
RMSE 함수 정의
def rmse(self):
    """Return the root-mean-square error over the non-zero entries of self.R.

    Side effects: self.predictions and self.errors are replaced by arrays
    holding, for each non-zero entry, the predicted value and the residual
    (actual minus predicted).
    """
    # Reset both attributes up front; they are rebuilt from scratch below.
    self.predictions = []
    self.errors = []

    observed = list(zip(*self.R.nonzero()))
    predicted = [self.get_prediction(row, col) for row, col in observed]
    residuals = [
        self.R[row, col] - guess
        for (row, col), guess in zip(observed, predicted)
    ]

    self.predictions = np.array(predicted)
    self.errors = np.array(residuals)
    return np.sqrt(np.mean(self.errors ** 2))
- rating이 nonzero인것들만 모으고 이들의 movie와 user id로 P, Q를 통해 예측 값을 얻음
- 예측된 값과 실제 rating간의 오차를 errors에 저장
- 에러값들을 모두 제곱하여 평균을 낸 후 sqrt를 수행하여 RMSE 값을 생성
prediction 함수
def get_prediction(self, i, j):
    """Return the predicted rating for user row i and item column j.

    The value is the global bias plus the user and item biases plus the
    dot product of the user's and the item's latent vectors.
    """
    bias_part = self.b + self.b_u[i] + self.b_d[j]
    latent_part = np.dot(self.P[i, :], self.Q[j, :])
    return bias_part + latent_part
- 예측은 전체 bias + user bias + item bias에 user vector(P)와 item vector(Q)의 내적을 더하여 계산
SGD 함수
def sgd(self):
    """Run one stochastic-gradient pass over self.samples, updating in place.

    Each sample is (user row i, item column j, rating r). For every sample
    the prediction error is computed once and then used to update the user
    bias, the item bias, the user's latent vector and the item's latent
    vector, with step size self.alpha and regularization weight self.beta.
    """
    for i, j, r in self.samples:
        e = r - self.get_prediction(i, j)

        self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
        self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

        # Snapshot the user vector before touching it: the item update must
        # be computed from the same P[i] that produced `e`, not from the
        # value that has just been moved by this step.
        p_before = self.P[i, :].copy()
        self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_before)
        self.Q[j, :] += self.alpha * (e * p_before - self.beta * self.Q[j, :])
- 각 sample마다 예측 에러를 구한 후,
- beta(정규화 계수)를 적용한 에러 함수의 편미분 결과를 통해 bias와 P, Q의 user, item vector를 sample 단위로 갱신하며 학습
test 값 설정
def set_test(self, ratings_test):
    """Register held-out ratings and blank them out of the rating matrix.

    Args:
        ratings_test: table read by position - column 0 is the user id,
            column 1 the item id and column 2 the rating.

    Returns:
        A list of [user_index, item_index, rating] entries, also stored as
        self.test_set.

    Raises:
        KeyError: if a user id or item id is missing from the lookup tables.
    """
    # Pull each column out once instead of doing three scalar .iloc lookups
    # per row; positions (not labels) are used, exactly as before.
    user_ids = ratings_test.iloc[:, 0].to_numpy()
    item_ids = ratings_test.iloc[:, 1].to_numpy()
    values = ratings_test.iloc[:, 2].to_numpy()

    test_set = []
    for user_id, item_id, rating in zip(user_ids, item_ids, values):
        x = self.user_id_index[user_id]
        y = self.item_id_index[item_id]
        test_set.append([x, y, rating])
        # Zero means "no rating" to the training code, so clearing the cell
        # keeps this held-out entry out of training.
        self.R[x, y] = 0

    self.test_set = test_set
    return test_set
- test 할 항목들의 user index, movie index, rating 정보를 test_set에 저장
- rating matrix(R)에서 해당 user와 movie의 rating을 삭제 (하여 test 데이터를 통해 학습이 되지 않게 함)
test data를 가지고 RMSE를 수행
def test_rmse(self):
error = 0
for one_set in self.test_set: # [user_idx, movie_idx, rating]
predicted = self.get_prediction(one_set[0], one_set[1])
error += pow(one_set[2] - predicted, 2)
return np.sqrt(error/len(self.test_set))
학습 및 test 함수
def test(self):
# Initializing user-feature and item-feature matrix
self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
# Initializing the bias terms
self.b_u = np.zeros(self.num_users)
self.b_d = np.zeros(self.num_items)
self.b = np.mean(self.R[self.R.nonzero()])
# List of training samples
rows, columns = self.R.nonzero()
# all test items are now set as 0.
self.samples = [(i, j, self.R[i,j]) for i, j in zip(rows, columns)]
# Stochastic gradient descent for given number of iterations
training_process = []
for i in range(self.iterations):
np.random.shuffle(self.samples)
self.sgd()
learning_rmse = self.rmse()
test_rmse = self.test_rmse()
training_process.append((i + 1, learning_rmse, test_rmse))
if self.verbose:
if (i+1) % 10 == 0:
print("Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, learning_rmse, test_rmse))
return training_process
- 학습을 통해 얻은 RMSE와 test data를 통해 얻은 RMSE를 저장
prediction 함수
# Ratings for given user_id and item_id
def get_one_prediction(self, user_id, item_id):
    """Return the predicted rating for an original user id and item id.

    Both ids are translated to matrix positions through the lookup tables;
    an id missing from them raises KeyError.
    """
    user_idx = self.user_id_index[user_id]
    item_idx = self.item_id_index[item_id]
    return self.get_prediction(user_idx, item_idx)
모든 예측치를 계산
예측된 rating matrix를 계산
# Full user-movie rating matrix
def full_prediction(self):
    """Return the predicted rating for every user/item pair as one matrix.

    Rows follow the user order of P and columns the item order of Q.
    """
    user_bias_column = self.b_u[:, np.newaxis]  # broadcast down each row
    item_bias_row = self.b_d[np.newaxis, :]     # broadcast across each column
    latent_scores = np.dot(self.P, self.Q.T)
    return self.b + user_bias_column + item_bias_row + latent_scores
학습 실행
# Testing MF RMSE
# User x movie table built from all ratings; pairs without a rating become 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating')
R_temp = R_temp.fillna(0)

mf = NEW_MF(
    R_temp,
    K=30,
    alpha=0.001,
    beta=0.02,
    iterations=100,
    verbose=True,
)

# Register the held-out ratings (this also zeroes them in the model's matrix)
# before training starts.
test_set = mf.set_test(ratings_test)
result = mf.test()

# Printing predictions
print(mf.full_prediction())
print(mf.get_one_prediction(1, 2))
RMSE 추이
Iteration: 10 ; Train RMSE = 0.9659 ; Test RMSE = 0.9833
Iteration: 20 ; Train RMSE = 0.9409 ; Test RMSE = 0.9644
Iteration: 30 ; Train RMSE = 0.9297 ; Test RMSE = 0.9566
Iteration: 40 ; Train RMSE = 0.9230 ; Test RMSE = 0.9523
Iteration: 50 ; Train RMSE = 0.9182 ; Test RMSE = 0.9495
Iteration: 60 ; Train RMSE = 0.9142 ; Test RMSE = 0.9476
Iteration: 70 ; Train RMSE = 0.9104 ; Test RMSE = 0.9460
Iteration: 80 ; Train RMSE = 0.9061 ; Test RMSE = 0.9445
Iteration: 90 ; Train RMSE = 0.9008 ; Test RMSE = 0.9428
Iteration: 100 ; Train RMSE = 0.8939 ; Test RMSE = 0.9406
모든 rating matrix
[[3.73276363 3.40467801 3.09606198 ... 3.32256598 3.45924946 3.47222635] [3.86416818 3.50067622 3.17828652 ... 3.42326847 3.55193793 3.56823516] [3.31096979 2.86361587 2.52550968 ... 2.82067397 2.9452319 2.92575313] ... [4.21583289 3.76985707 3.41974008 ... 3.71000703 3.82613912 3.82795699] [4.3520684 3.89615659 3.54448999 ... 3.83211585 3.9435627 3.95265407] [3.74336291 3.38676218 3.04723229 ... 3.30290268 3.42757355 3.42599508]]
user 1번의 movie 2에 대한 예측 rating
3.4046780074044887
반응형
'AI > Big data' 카테고리의 다른 글
surprise knns.KNNWithMeans (0) | 2022.03.26 |
---|---|
Anaconda 사용법 (0) | 2022.03.26 |
Matrix Factorization impl. (0) | 2022.03.25 |
Matrix Factorization 2 (0) | 2022.03.23 |
CF 정확도 개선 (0) | 2022.03.22 |