Matrix Factorization impl.
The predicted rating for user $i$ and item $j$ is the dot product of their latent-factor vectors:

$$\hat{r}_{ij} = p_i^T q_j = \sum_{k=1}^{K} p_{ik} q_{kj}$$

The prediction error is $e_{ij} = r_{ij} - \hat{r}_{ij}$, and RMSE over the observed ratings serves as the training metric. The quantity minimized for each rating is the squared error:

$$e_{ij}^2 = (r_{ij} - \hat{r}_{ij})^2$$

Taking partial derivatives of the squared error with respect to $p_{ik}$ and $q_{kj}$ (absorbing constant factors into the learning rate $\alpha$) gives the updates for $p$ and $q$:

$$p_{ik} \leftarrow p_{ik} + \alpha \, e_{ij} \, q_{kj}, \qquad q_{kj} \leftarrow q_{kj} + \alpha \, e_{ij} \, p_{ik}$$

To prevent overfitting, a regularization term is added to the objective:

$$e_{ij}^2 + \beta \sum_{k} \left( p_{ik}^2 + q_{kj}^2 \right)$$

where $\beta$ is the regularization coefficient (again, an appropriate value is chosen from experience). Taking partial derivatives of this regularized error yields the update rules actually used for training:

$$p_{ik} \leftarrow p_{ik} + \alpha \left( e_{ij} \, q_{kj} - \beta \, p_{ik} \right), \qquad q_{kj} \leftarrow q_{kj} + \alpha \left( e_{ij} \, p_{ik} - \beta \, q_{kj} \right)$$

The implementation below additionally adds a global bias $b$, per-user biases $b_u$, and per-item biases $b_d$ to the prediction, with analogous regularized updates.
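As a sanity check, here is a minimal numpy sketch of a single regularized SGD step on one observed rating; the vectors and hyperparameter values are hypothetical, chosen only to make the update rule concrete (biases omitted for brevity):

import numpy as np

# Hypothetical latent vectors for one user i and one item j (K = 2)
p_i = np.array([0.1, 0.3])
q_j = np.array([0.2, 0.4])
alpha, beta = 0.01, 0.02      # learning rate, regularization coefficient
r_ij = 4.0                    # observed rating

e = r_ij - p_i.dot(q_j)       # prediction error e_ij
# Regularized updates, same form as in the MF class below
p_i = p_i + alpha * (e * q_j - beta * p_i)
q_j = q_j + alpha * (e * p_i - beta * q_j)  # uses the freshly updated p_i, as the class does
print(e, p_i, q_j)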
import numpy as np
import pandas as pd

# Load the ratings file (u.data, MovieLens 100k format)
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('u.data', names=r_cols, sep='\t', encoding='latin-1')
ratings = ratings[['user_id', 'movie_id', 'rating']].astype(int)  # drop the timestamp column
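A quick check of what was loaded (the expected shape assumes the MovieLens 100k u.data file):

print(ratings.shape)   # expected: (100000, 3)
print(ratings.head())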
class MF():
    def __init__(self, ratings, K, alpha, beta, iterations, verbose=True):
        self.R = np.array(ratings)
        self.num_users, self.num_items = np.shape(self.R)
        self.K = K                    # number of latent factors
        self.alpha = alpha            # learning rate
        self.beta = beta              # regularization coefficient
        self.iterations = iterations
        self.verbose = verbose
    # Compute Root Mean Squared Error (RMSE) over the observed (non-zero) ratings
    def rmse(self):
        xs, ys = self.R.nonzero()
        self.predictions = []
        self.errors = []
        for x, y in zip(xs, ys):
            prediction = self.get_prediction(x, y)
            self.predictions.append(prediction)
            self.errors.append(self.R[x, y] - prediction)
        self.predictions = np.array(self.predictions)
        self.errors = np.array(self.errors)
        return np.sqrt(np.mean(self.errors**2))
    def train(self):
        # Initialize the user-feature (P) and item-feature (Q) matrices
        # np.random.normal: loc (center) = 0.0, scale = standard deviation (default 1), size = output shape
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        # Initialize the bias terms
        self.b_u = np.zeros(self.num_users)   # one zero-initialized bias per user
        self.b_d = np.zeros(self.num_items)   # one zero-initialized bias per item
        self.b = np.mean(self.R[self.R.nonzero()])  # global mean of the observed (non-zero) ratings
        # Build the list of training samples from the observed ratings
        rows, columns = self.R.nonzero()      # row and column indices of the non-zero entries
        # (transposing these would give pairs of the form ((row, col), ...), but that is not used here)
        self.samples = [(i, j, self.R[i, j]) for i, j in zip(rows, columns)]
        # Stochastic gradient descent for the given number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)   # shuffle the training samples in place
            self.sgd()
            rmse = self.rmse()
            training_process.append((i+1, rmse))
            if self.verbose:
                if (i+1) % 10 == 0:
                    print("Iteration: %d ; Train RMSE = %.4f" % (i+1, rmse))
        return training_process
    # Rating prediction for user i and item j
    def get_prediction(self, i, j):
        prediction = self.b + self.b_u[i] + self.b_d[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    # Stochastic gradient descent to get optimized P and Q matrices
    def sgd(self):
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)
            # Regularized updates for the user and item biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_d[j] += self.alpha * (e - self.beta * self.b_d[j])
            # Update the P and Q vectors according to the derived update rules
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j, :])
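The class keeps everything needed to reconstruct the full prediction matrix. A small helper along these lines (not part of the original class; a sketch that mirrors get_prediction element-wise) recovers $\hat{R}$ from the learned biases and factors:

import numpy as np

def full_prediction_matrix(mf):
    # b + b_u as a column + b_d as a row + P Q^T, broadcast over all (user, item) pairs
    return mf.b + mf.b_u[:, np.newaxis] + mf.b_d[np.newaxis, :] + mf.P.dot(mf.Q.T)

After training, R_hat = full_prediction_matrix(mf) gives a predicted rating for every user-item pair, including the unobserved ones.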
# Build the full user-item rating matrix (unobserved ratings filled with 0)
R_temp = ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
mf = MF(R_temp, K=30, alpha=0.001, beta=0.02, iterations=100, verbose=True)
train_process = mf.train()
Iteration: 10 ; Train RMSE = 0.9585
Iteration: 20 ; Train RMSE = 0.9374
Iteration: 30 ; Train RMSE = 0.9281
Iteration: 40 ; Train RMSE = 0.9225
Iteration: 50 ; Train RMSE = 0.9184
Iteration: 60 ; Train RMSE = 0.9146
Iteration: 70 ; Train RMSE = 0.9102
Iteration: 80 ; Train RMSE = 0.9042
Iteration: 90 ; Train RMSE = 0.8956
Iteration: 100 ; Train RMSE = 0.8839
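To visualize convergence, the (iteration, RMSE) pairs returned by train() can be plotted; a minimal matplotlib sketch, assuming matplotlib is installed:

import matplotlib.pyplot as plt

iterations, rmses = zip(*train_process)
plt.plot(iterations, rmses)
plt.xlabel('Iteration')
plt.ylabel('Train RMSE')
plt.show()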