#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 16 16:20:58 2016

@author: matt-666
"""
import json
import time

import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, eye, diags
from scipy.sparse.linalg import svds, spsolve

from lightfm import LightFM
from lightfm.datasets import movielens
from lightfm.evaluation import precision_at_k


def bm25_weight(data, K1=100, B=0.8):
    """ Weighs each row of the sparse matrix data by BM25 weighting """
    # calculate idf per term (user)
    N = float(data.shape[0])
    idf = np.log(N / (1 + np.bincount(data.col)))

    # calculate length_norm per document (item)
    row_sums = np.squeeze(np.asarray(data.sum(1)))
    average_length = row_sums.sum() / N
    length_norm = (1.0 - B) + B * row_sums / average_length

    # weight matrix rows by bm25
    ret = coo_matrix(data)
    ret.data = ret.data * (K1 + 1.0) / (K1 * length_norm[ret.row] + ret.data) * idf[ret.col]
    return ret


# Alternating Least Squares
def alternating_least_squares(Cui, factors, regularization=0.01,
                              iterations=15, num_threads=0, dtype=np.float64):
    """ Factorizes the matrix Cui using an implicit alternating least
    squares algorithm.

    Args:
        Cui (csr_matrix): Confidence matrix
        factors (int): Number of factors to extract
        regularization (double): Regularization parameter to use
        iterations (int): Number of alternating least squares iterations to run
        num_threads (int): Number of threads to run least squares iterations.
            0 means to use all CPU cores.

    Returns:
        tuple: A tuple of (row, col) factors
    """
    users, items = Cui.shape

    X = np.random.rand(users, factors).astype(dtype) * 0.01
    Y = np.random.rand(items, factors).astype(dtype) * 0.01

    Cui, Ciu = Cui.tocsr(), Cui.T.tocsr()

    solver = least_squares
    for iteration in range(iterations):
        s = time.time()
        solver(Cui, X, Y, regularization, num_threads)
        solver(Ciu, Y, X, regularization, num_threads)
        print "finished iteration %i in %s" % (iteration, time.time() - s)

    return X, Y


def least_squares(Cui, X, Y, regularization, num_threads):
    """ For each user in Cui, calculate factors Xu for them using least
    squares on Y.

    Note: this pure-python solve is at least 10 times slower than a
    compiled (e.g. cython) implementation.
    """
    users, factors = X.shape
    YtY = Y.T.dot(Y)

    for u in range(users):
        # accumulate YtCuY + regularization * I in A
        A = YtY + regularization * np.eye(factors)

        # accumulate YtCuPu in b
        b = np.zeros(factors)

        for i, confidence in nonzeros(Cui, u):
            factor = Y[i]
            A += (confidence - 1) * np.outer(factor, factor)
            b += confidence * factor

        # Xu = (YtCuY + regularization * I)^-1 (YtCuPu)
        X[u] = np.linalg.solve(A, b)


def nonzeros(m, row):
    """ Returns the (index, value) pairs of the non-zeros of a row
    in a csr_matrix """
    for index in range(m.indptr[row], m.indptr[row + 1]):
        yield m.indices[index], m.data[index]


class TopRelated_useruser(object):
    def __init__(self, user_factors):
        # fully normalize user_factors, so we can compare with only the
        # dot product (cosine similarity)
        norms = np.linalg.norm(user_factors, axis=-1)
        self.factors = user_factors / norms[:, np.newaxis]

    def get_related(self, userid, N=10):
        scores = self.factors.dot(self.factors[userid])
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])


class TopRelated_itemitem(object):
    def __init__(self, movie_factors):
        # movie_factors is (n_factors, n_items): normalize each item column
        # so similarity reduces to a dot product (cosine similarity)
        norms = np.linalg.norm(movie_factors, axis=0)
        self.factors = movie_factors / norms[np.newaxis, :]

    def get_related(self, movieid, N=10):
        scores = self.factors.T.dot(self.factors[:, movieid])
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])


class ImplicitMF(object):

    def __init__(self, counts, num_factors=40, num_iterations=30,
                 reg_param=0.8):
        self.counts = counts
        self.num_users = counts.shape[0]
        self.num_items = counts.shape[1]
        self.num_factors = num_factors
        self.num_iterations = num_iterations
        self.reg_param = reg_param

    def train_model(self):
        self.user_vectors = np.random.normal(size=(self.num_users,
                                                   self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items,
                                                   self.num_factors))

        for i in xrange(self.num_iterations):
            t0 = time.time()
            print 'Solving for user vectors...'
            self.user_vectors = self.iteration(True, csr_matrix(self.item_vectors))
            print 'Solving for item vectors...'
            self.item_vectors = self.iteration(False, csr_matrix(self.user_vectors))
            t1 = time.time()
            print 'iteration %i finished in %f seconds' % (i + 1, t1 - t0)

    def iteration(self, user, fixed_vecs):
        num_solve = self.num_users if user else self.num_items
        num_fixed = fixed_vecs.shape[0]
        YTY = fixed_vecs.T.dot(fixed_vecs)
        eye1 = eye(num_fixed)
        lambda_eye = self.reg_param * eye(self.num_factors)
        solve_vecs = np.zeros((num_solve, self.num_factors))

        t = time.time()
        for i in xrange(num_solve):
            if user:
                counts_i = self.counts[i].toarray()
            else:
                counts_i = self.counts[:, i].T.toarray()
            # Cu - I, with confidence taken as 1 + count
            CuI = diags(counts_i, [0])
            # binarized preference vector pu
            pu = counts_i.copy()
            pu[np.where(pu != 0)] = 1.0
            YTCuIY = fixed_vecs.T.dot(CuI).dot(fixed_vecs)
            YTCupu = fixed_vecs.T.dot(CuI + eye1).dot(csr_matrix(pu).T)
            # xu = (YtY + Yt(Cu - I)Y + lambda * I)^-1 Yt Cu pu
            xu = spsolve(YTY + YTCuIY + lambda_eye, YTCupu)
            solve_vecs[i] = xu
            if i % 1000 == 0:
                print 'Solved %i vecs in %d seconds' % (i, time.time() - t)
                t = time.time()

        return solve_vecs
# ==============================================================================
# on beer data =================================================================
beer_data = pd.read_csv('beer_reviews/beer_reviews.csv')

# hold out one randomly-chosen review per user as a test set
test_data = beer_data.groupby('review_profilename', as_index=False) \
                     .apply(lambda x: x.loc[np.random.choice(x.index, 1, replace=False), :])
l1 = [x[1] for x in test_data.index.tolist()]
train_data = beer_data.drop(beer_data.index[l1]).dropna()

train_data['review_profilename'] = train_data['review_profilename'].astype("category")
train_data['beer_name'] = train_data['beer_name'].astype("category")

print "Unique users: %s" % (len(train_data['review_profilename'].unique()))
print "Unique beers: %s" % (len(train_data['beer_name'].unique()))

# create a sparse (beer x user) matrix of all the beer/user/rating triples
reviews = csc_matrix((train_data['review_overall'].astype(float),
                      (train_data['beer_name'].cat.codes,
                       train_data['review_profilename'].cat.codes)))

beerid2beername = dict(enumerate(train_data['beer_name'].cat.categories))
beername2beerid = {v: k for k, v in beerid2beername.items()}
userid2username = dict(enumerate(train_data['review_profilename'].cat.categories))
username2userid = {v: k for k, v in userid2username.items()}

# SVD ==========================================================================
denseVecSize = 25
beer_factors, s, userbeer_factors = svds(bm25_weight(reviews.tocoo()), denseVecSize)
Related_beers_ii_svd25 = TopRelated_itemitem(beer_factors.T)

denseVecSize = 50
beer_factors_50, s, userbeer_factors_50 = svds(bm25_weight(reviews.tocoo()), denseVecSize)
Related_beers_ii_svd50 = TopRelated_itemitem(beer_factors_50.T)

denseVecSize = 100
beer_factors_100, s, userbeer_factors_100 = svds(bm25_weight(reviews.tocoo()), denseVecSize)
Related_beers_ii_svd100 = TopRelated_itemitem(beer_factors_100.T)

# Implicit =====================================================================
impl = ImplicitMF(reviews.tocsr())
impl.train_model()
# reviews is (beer x user), so user_vectors here actually holds the beer factors
impl_ii = TopRelated_itemitem(impl.user_vectors.T)

# ALS ==========================================================================
# again, rows of `reviews` are beers, so the row factors are the beer factors
als_beer_factors, als_user_factors = alternating_least_squares(bm25_weight(reviews.tocoo()), 50)
als_ii = TopRelated_itemitem(als_beer_factors.T)
# Push to s3 - this won't work unless AWS credentials are configured on your
# machine (e.g. via `aws configure`)
s3 = boto3.resource('s3')
for i in range(beer_factors.shape[0]):
    beer_recs_ii_svd25 = [{"value": beerid2beername[rec[0]].replace('"', ''),
                           "users": rec[1]}
                          for rec in Related_beers_ii_svd25.get_related(i)]
    beer_recs_ii_svd50 = [{"value": beerid2beername[rec[0]].replace('"', ''),
                           "users": rec[1]}
                          for rec in Related_beers_ii_svd50.get_related(i)]
    beer_recs_ii_svd100 = [{"value": beerid2beername[rec[0]].replace('"', ''),
                            "users": rec[1]}
                           for rec in Related_beers_ii_svd100.get_related(i)]
    beer_recs_impl_ii = [{"value": beerid2beername[rec[0]].replace('"', ''),
                          "users": rec[1]}
                         for rec in impl_ii.get_related(i)]
    beer_recs_als_ii = [{"value": beerid2beername[rec[0]].replace('"', ''),
                         "users": rec[1]}
                        for rec in als_ii.get_related(i)]

    # strip spaces and quotes so the beer name can be used as an s3 key
    beername = beerid2beername[i].replace(' ', '_').replace('"', '')
    beer_recs = {"svd25-item": beer_recs_ii_svd25,
                 "svd50-item": beer_recs_ii_svd50,
                 "svd100-item": beer_recs_ii_svd100,
                 "implicit": beer_recs_impl_ii,
                 "als": beer_recs_als_ii}

    # serialize via json so we always get properly double-quoted output
    with open('temp.json', 'wb') as fp:
        json.dump(beer_recs, fp)
    try:
        s3.Object('beer-reco', beername + '.json').put(Body=open('temp.json', 'rb'),
                                                       ACL='public-read')
    except UnicodeDecodeError:
        print "can't upload: %s" % (beername)

# Get the most-reviewed beers (~10k of them have more than 15 reviews)
count_data = beer_data['beer_name'].value_counts()
count_data_top10k = count_data[count_data > 15]
beer_recs_all = []
for index, value in count_data_top10k.iteritems():
    beer_recs_all.append({"value": index, "users": int(value)})
s3.Object('beer-reco', 'top10k').put(Body=json.dumps(beer_recs_all),
                                     ACL='public-read',
                                     ContentType='application/json')

# if movies are your thing ====================================================
movie_data = movielens.fetch_movielens()
n_users, n_items = movie_data['train'].shape

model = LightFM(loss='warp')
model.fit(movie_data['train'], epochs=30, num_threads=2,
          user_features=None, item_features=None)

print "Train precision: %.2f" % precision_at_k(model, movie_data['train'], k=5).mean()
print "Test precision: %.2f" % precision_at_k(model, movie_data['test'], k=5).mean()

# precision@5 as a function of the number of training epochs; each fit
# restarts from scratch, so this is quadratic in epochs but simple
train_prec, test_prec = [], []
epochs = np.linspace(1, 30, 30)
for epoch in epochs:
    model.fit(movie_data['train'], epochs=int(epoch), num_threads=2)
    train_prec.append(precision_at_k(model, movie_data['train'], k=5).mean())
    test_prec.append(precision_at_k(model, movie_data['test'], k=5).mean())
    print epoch

plt.figure()
plt.plot(epochs, train_prec)
plt.plot(epochs, test_prec, 'g')

# Find the movies with the highest mean non-zero rating
# (take 20 of the top-ranked ones, skipping the two highest)
md = movie_data['train'].toarray().astype(float)
nonzero_ratings = np.ma.masked_array(md, md == 0).mean(axis=0)
top_movies = np.asarray(np.argsort(nonzero_ratings))[-22:-2]
print 'Top Movies: %s' % movie_data['item_labels'][top_movies]

# svd ==========================================================================
denseVecSize = 25
user_factors, s, movie_factors = svds(bm25_weight(movie_data['train']), denseVecSize)

# user-user: nearest users to a query user (the dataset has no user labels,
# so just print the ids)
Related_uu = TopRelated_useruser(user_factors)
usr = 1
similar_users = [rec[0] for rec in Related_uu.get_related(usr)]
print 'Users most similar to user %i: %s' % (usr, similar_users[1:])

# item-item: nearest movies to a query movie
Related_ii = TopRelated_itemitem(movie_factors)
mov = 1
print 'Movie pick: %s' % movie_data['item_labels'][mov]
recs = [rec[0] for rec in Related_ii.get_related(mov)]
print 'Recommendations: %s' % movie_data['item_labels'][recs[1:]]

# Implicit MF ==================================================================
gg = ImplicitMF(movie_data['train'].tocsr())
gg.train_model()
gg_ii = TopRelated_itemitem(gg.item_vectors.T)

print 'Movie pick: %s' % movie_data['item_labels'][mov]
recs_gg = [rec[0] for rec in gg_ii.get_related(mov)]
print 'Recommendations: %s' % movie_data['item_labels'][recs_gg[1:]]

# Alternating Least Squares ====================================================
als_user_factors, als_movie_factors = alternating_least_squares(bm25_weight(movie_data['train']), 50)
als_movie_ii = TopRelated_itemitem(als_movie_factors.T)

print 'Movie pick: %s' % movie_data['item_labels'][mov]
recs_als = [rec[0] for rec in als_movie_ii.get_related(mov)]
print 'Recommendations: %s' % movie_data['item_labels'][recs_als[1:]]
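# Convenience wrapper (an illustrative addition, not part of the original
# analysis): print the top-N titles for any movie id under any of the fitted
# item-item models above, e.g. show_related(Related_ii, mov) or
# show_related(gg_ii, mov).
def show_related(related_model, movieid, N=10):
    print 'Movie pick: %s' % movie_data['item_labels'][movieid]
    # drop the first hit, which is the query movie itself
    ids = [rec[0] for rec in related_model.get_related(movieid, N=N + 1)][1:]
    print 'Recommendations: %s' % movie_data['item_labels'][ids]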