Matrix Factorization impl 2

2022. 3. 25. 11:34AI/Big data

    목차
반응형

data loading

 

import numpy as np
import pandas as pd


# Column names for the tab-separated ratings file; they are supplied explicitly via `names`.
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
# Load the file, keep only the three columns used below, and cast them to int.
ratings = (
    pd.read_csv('u.data', sep='\t', names=r_cols, encoding='latin-1')
    [['user_id', 'movie_id', 'rating']]
    .astype(int)
)

 

splitting data into train/test

 

from sklearn.utils import shuffle


# Fraction of the shuffled ratings kept for training; the remainder becomes the test split.
TRAIN_SIZE = 0.75
# Shuffle with a fixed random_state so the same split is produced on every run.
ratings = shuffle(ratings, random_state=1)
cutoff = int(len(ratings) * TRAIN_SIZE)
ratings_train, ratings_test = ratings.iloc[:cutoff], ratings.iloc[cutoff:]

 

처음 75%의 데이터는 training 용으로 사용하고,
나머지 25%의 데이터는 test 용으로 사용한다.
 

define Matrix Factorization class

값 초기화

 

class NEW_MF():
    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        """Store the rating matrix, the id <-> index lookup tables and the hyper-parameters.

        Args:
            ratings: table of ratings. Iterating it must yield the item ids and
                iterating ``ratings.T`` must yield the user ids (this is what a
                pandas DataFrame with users as rows and items as columns does).
            K: number of latent factors.
            alpha: step size used by the SGD updates.
            beta: regularization weight used by the SGD updates.
            iterations: number of SGD passes to run during training.
            verbose: when true, training prints progress every 10 iterations.
        """
        # Hyper-parameters.
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        self.verbose = verbose

        # Dense copy of the ratings; rows are users, columns are items.
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)

        # item id <-> column position
        self.item_id_index = {item_id: pos for pos, item_id in enumerate(ratings)}
        self.index_item_id = {pos: item_id for pos, item_id in enumerate(ratings)}

        # user id <-> row position
        self.user_id_index = {user_id: pos for pos, user_id in enumerate(ratings.T)}
        self.index_user_id = {pos: user_id for pos, user_id in enumerate(ratings.T)}

 

  • user id to index와 index to user id를 만든다.
  • movie id to index와 index to movie id역시 만든다.
  • hyper parameter들인 K, alpha, beta, iteration 등을 저장한다.
     

RMSE 함수 정의

 

    def rmse(self):
        xs, ys = self.R.nonzero()
        self.predictions = []
        self.errors = []

        for x, y in zip(xs, ys):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)

        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))

 

  • rating이 nonzero인것들만 모으고 이들의 movie와 user id로 P, Q를 통해 예측 값을 얻음
  • 예측된 값과 실제 rating간의 오차를 errors에 저장
  • 에러값들을 모두 제곱하여 평균을 낸 후 sqrt를 수행하여 RMSE 값을 생성
     

prediction 함수

 

    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

 

  • 예측은 전체 bias + user bias + item bias에 user vector(P)와 item vector(Q)의 내적을 더하여 계산
     

SGD 함수

 

    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])

            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

 

  • 모든 sample들에 대해서 에러를 구한 후,
  • beta를 적용한 에러 함수의 편미분 결과를 통해 P와 Q의 user, item vector를 갱신하며 학습
     

test 값 설정

 

    def set_test(self, ratings_test):
        test_set = []
        for i in range(len(ratings_test)):
            x = self.user_id_index[ratings_test.iloc[i, 0]]  # [i, 0] is user_id
            y = self.item_id_index[ratings_test.iloc[i, 1]]  # [i, 1] is movie_id
            z = ratings_test.iloc[i, 2]                      # [i, 2] is rating

            test_set.append([x, y, z])  # [user_idx, movie_idx, rating]
            self.R[x, y] = 0            # set rating 0 cuz this x-y is used as test purpose

        self.test_set = test_set
        return test_set

 

  • test 할 항목들의 user index, movie index, rating 정보를 test_set에 저장
  • rating matrix(R)에서 해당 user와 movie의 rating을 삭제 (하여 test 데이터를 통해 학습이 되지 않게 함)
     

test data를 가지고 RMSE를 수행

 

    def test_rmse(self):
        error = 0
        for one_set in self.test_set:    # [user_idx, movie_idx, rating]
            predicted = self.get_prediction(one_set[0], one_set[1])
            error += pow(one_set[2] - predicted, 2)

        return np.sqrt(error/len(self.test_set))

 

학습 및 test 함수

 

    def test(self):
        # Initializing user-feature and item-feature matrix
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Initializing the bias terms
        self.b_u = np.zeros(self.num_users)
        self.b_d = np.zeros(self.num_items)
        self.b = np.mean(self.R[self.R.nonzero()])

        # List of training samples
        rows, columns = self.R.nonzero()

        # all test items are now set as 0.
        self.samples = [(i, j, self.R[i,j]) for i, j in zip(rows, columns)]

        # Stochastic gradient descent for given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()

            learning_rmse = self.rmse()
            test_rmse = self.test_rmse()
            training_process.append((i + 1, learning_rmse, test_rmse))

            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f ; Test RMSE = %.4f" % (i+1, learning_rmse, test_rmse))

        return training_process

 

  • 학습을 통해 얻은 RMSE와 test data를 통해 얻은 RMSE를 저장

prediction 함수

 

    # Ratings for given user_id and item_id
    def get_one_prediction(self, user_id, item_id):
        return self.get_prediction(self.user_id_index[user_id], self.item_id_index[item_id])

 

모든 예측치를 계산

예측된 rating matrix를 계산

 

    # Full user-movie rating matrix
    def full_prediction(self):
        return self.b + self.b_u[:,np.newaxis] + self.b_d[np.newaxis,:] + self.P.dot(self.Q.T)

 

학습 실행

 

# Testing MF RMSE
# Build the user x movie rating table; pairs without a rating become 0.
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating')
R_temp = R_temp.fillna(0)
mf = NEW_MF(R_temp, K=30, alpha=0.001, beta=0.02, iterations=100, verbose=True)
# Hold the test ratings out of the training matrix before training starts.
test_set = mf.set_test(ratings_test)
result = mf.test()

 

 

# Printing predictions: the full predicted matrix, then the estimate for user 1 / movie 2.
print(mf.full_prediction(), mf.get_one_prediction(1, 2), sep='\n')
  • RMSE 추이

    Iteration: 10 ; Train RMSE = 0.9659 ; Test RMSE = 0.9833
    Iteration: 20 ; Train RMSE = 0.9409 ; Test RMSE = 0.9644
    Iteration: 30 ; Train RMSE = 0.9297 ; Test RMSE = 0.9566
    Iteration: 40 ; Train RMSE = 0.9230 ; Test RMSE = 0.9523
    Iteration: 50 ; Train RMSE = 0.9182 ; Test RMSE = 0.9495
    Iteration: 60 ; Train RMSE = 0.9142 ; Test RMSE = 0.9476
    Iteration: 70 ; Train RMSE = 0.9104 ; Test RMSE = 0.9460
    Iteration: 80 ; Train RMSE = 0.9061 ; Test RMSE = 0.9445
    Iteration: 90 ; Train RMSE = 0.9008 ; Test RMSE = 0.9428
    Iteration: 100 ; Train RMSE = 0.8939 ; Test RMSE = 0.9406
  • 모든 rating matrix

    [[3.73276363 3.40467801 3.09606198 ... 3.32256598 3.45924946 3.47222635]
     [3.86416818 3.50067622 3.17828652 ... 3.42326847 3.55193793 3.56823516]
     [3.31096979 2.86361587 2.52550968 ... 2.82067397 2.9452319  2.92575313]
     ...
     [4.21583289 3.76985707 3.41974008 ... 3.71000703 3.82613912 3.82795699]
     [4.3520684  3.89615659 3.54448999 ... 3.83211585 3.9435627  3.95265407]
     [3.74336291 3.38676218 3.04723229 ... 3.30290268 3.42757355 3.42599508]]
  • user 1번의 movie 2에 대한 예측 rating

    3.4046780074044887
반응형

'AI > Big data' 카테고리의 다른 글

surprise knns.KNNWithMeans  (0) 2022.03.26
Anaconda 사용법  (0) 2022.03.26
Matrix Factorization impl.  (0) 2022.03.25
Matrix Factorization 2  (0) 2022.03.23
CF 정확도 개선  (0) 2022.03.22