1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
| import os
import pandas as pd import numpy as np
# Default location of the ratings CSV handed to load_data() by the script entry
# point (the path points at an "ml-latest-small" dataset folder).
DATA_PATH = "./data/ml-latest-small/ratings.csv"
# Directory where the pickled rating-matrix and user-similarity caches are kept.
CACHE_DIR = "./data/"
def load_data(data_path):
    """Load the user-item rating matrix, using an on-disk cache when present.

    On the first call the ratings CSV is read (first three columns only:
    ``userId``, ``movieId``, ``rating``), pivoted into a matrix with one row
    per user and one column per movie, and pickled to
    ``CACHE_DIR/ratings_matrix.cache``. Later calls read that pickle instead
    of re-parsing the CSV.

    :param data_path: path of the ratings CSV file
    :return: user-item rating matrix (rows are users, columns are movies,
        NaN where a user has not rated a movie)
    """
    cache_path = os.path.join(CACHE_DIR, "ratings_matrix.cache")

    print("开始加载数据集...")
    if os.path.exists(cache_path):
        print("加载缓存中...")
        # NOTE: unpickling executes arbitrary code; only load cache files that
        # this program wrote itself.
        ratings_matrix = pd.read_pickle(cache_path)
        print("从缓存加载数据集完毕")
    else:
        print("加载新数据中...")
        # Compact dtypes keep the memory footprint of the raw table small.
        dtype = {"userId": np.int32, "movieId": np.int32, "rating": np.float32}
        ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
        ratings_matrix = ratings.pivot_table(
            index=["userId"], columns=["movieId"], values="rating"
        )
        # The cache directory is independent of data_path, so make sure it
        # exists before writing; otherwise to_pickle raises FileNotFoundError.
        os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
        ratings_matrix.to_pickle(cache_path)
        print("数据集加载完毕")
    return ratings_matrix
def compute_pearson_similarity(ratings_matrix):
    """Return the user-to-user similarity matrix, cached on disk.

    The similarity is the pairwise correlation between the rows of
    ``ratings_matrix`` as computed by ``DataFrame.corr`` with its default
    settings (Pearson). The result is pickled to
    ``CACHE_DIR/user_similarity.cache`` and reused on later calls.

    :param ratings_matrix: user-item rating matrix (one row per user)
    :return: square similarity matrix indexed by user on both axes
    """
    cache_file = os.path.join(CACHE_DIR, "user_similarity.cache")

    if not os.path.exists(cache_file):
        # No cache yet: correlate users (columns of the transpose) and store it.
        print("开始计算用户相似度矩阵")
        result = ratings_matrix.T.corr()
        result.to_pickle(cache_file)
    else:
        print("正从缓存加载用户相似度矩阵")
        result = pd.read_pickle(cache_file)

    print("相似度矩阵计算/加载完毕")
    return result
def predict(uid, iid, ratings_matrix, user_similar):
    """Predict the rating that user ``uid`` would give to item ``iid``.

    The prediction is the similarity-weighted mean of the ratings given to
    ``iid`` by the user's nearest neighbours: the (at most) 25 most similar
    other users, restricted to those with a strictly positive similarity and
    an existing rating for ``iid``.

    :param uid: user ID (a column label of ``user_similar``)
    :param iid: item ID (a column label of ``ratings_matrix``)
    :param ratings_matrix: user-item rating matrix (rows users, columns items)
    :param user_similar: pairwise user similarity matrix
    :return: predicted rating rounded to two decimals
    :raises Exception: if the user has no positively correlated neighbour, or
        none of those neighbours has rated the item
    """
    print("开始预测用户<%d>对电影<%d>的评分..."%(uid, iid))

    # Top-25 most similar users, excluding the user themself and unknown (NaN)
    # similarities. head() makes the positional cut explicit.
    neighbours = (
        user_similar[uid]
        .drop([uid])
        .dropna()
        .sort_values(ascending=False)
        .head(25)
    )
    # Only positively correlated users contribute to the prediction.
    neighbours = neighbours[neighbours > 0]
    if neighbours.empty:
        raise Exception("用户<%d>没有相似的用户" % uid)

    # Keep the neighbours that actually rated the item.
    item_ratings = ratings_matrix[iid].dropna()
    raters = neighbours.index.intersection(item_ratings.index)
    if raters.empty:
        raise Exception("用户<%d>相似的用户没有对<%d>电影的评分" % (uid, iid))

    # Label-based selection with .loc replaces the removed .ix indexer; the
    # weighted mean is computed in one vectorised step instead of iterating
    # with the removed Series.iteritems().
    weights = neighbours.loc[raters]
    neighbour_ratings = item_ratings.loc[raters]
    predict_rating = (weights * neighbour_ratings).sum() / weights.sum()

    print("预测出用户<%d>对电影<%d>的评分:%0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)
if __name__ == '__main__':
    # Script entry point: build (or load) the rating matrix and preview it.
    ratings_matrix = load_data(DATA_PATH)
    print(ratings_matrix.head())

    # Build (or load) the user-user similarity matrix and preview it.
    user_similar = compute_pearson_similarity(ratings_matrix)
    print(user_similar.head())

    # Demo: predict user 1's rating for movies 1 and 2, in that order.
    for movie_id in (1, 2):
        predict(1, movie_id, ratings_matrix, user_similar)
|