01 - Getting Started
004 [Activity] Install Anaconda, course materials, and create movie recommendations!
005 Course Roadmap
006 What Is a Recommender System
007 Types of Recommenders
008 Understanding You through Implicit and Explicit Ratings
009 Top-N Recommender Architecture
010 [Quiz] Review the basics of recommender systems
03 - Evaluating Recommender Systems
001 Train/Test and Cross Validation
002 Accuracy Metrics (RMSE, MAE)
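For reference (standard definitions, not from the course slides), with \hat{r}_{ui} the predicted rating, r_{ui} the actual rating, and n the number of test ratings:
RMSE = \sqrt{ \frac{1}{n} \sum_{(u,i)} ( \hat{r}_{ui} - r_{ui} )^2 }
MAE = \frac{1}{n} \sum_{(u,i)} | \hat{r}_{ui} - r_{ui} |
Both are error measures, so lower is better; RMSE penalizes large misses more heavily than MAE.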
003 Top-N Hit Rate - Many Ways
004 Coverage, Diversity, and Novelty
005 Churn, Responsiveness, and AB Tests
006 [Quiz] Review ways to measure your recommender
007 [Activity] Walkthrough of RecommenderMetrics.py
import itertools
# conda install -c conda-forge scikit-surprise
from surprise import accuracy
from collections import defaultdict
# Recommender metrics helper class
class RecommenderMetrics:
def MAE(predictions):
return accuracy.mae(predictions, verbose=False)
def RMSE(predictions):
return accuracy.rmse(predictions, verbose=False)
def GetTopN(predictions, n=10, minimumRating=4.0):
topN = defaultdict(list)
for userID, movieID, actualRating, estimatedRating, _ in predictions:
            # If the estimated rating clears the minimum-rating threshold, add it to this user's top-N list
if (estimatedRating >= minimumRating):
topN[int(userID)].append((int(movieID), estimatedRating))
        # topN: {userID: [(movieID, estimatedRating), ...]}
for userID, ratings in topN.items():
            # Sort this user's candidates by estimated rating, highest first
ratings.sort(key=lambda x: x[1], reverse=True)
topN[int(userID)] = ratings[:n]
return topN
    # Leave-one-out hit rate
def HitRate(topNPredicted, leftOutPredictions):
hits = 0
total = 0
for leftOut in leftOutPredictions:
userID = leftOut[0]
leftOutMovieID = leftOut[1]
            # Is this left-out movie in the user's predicted top-N?
hit = False
for movieID, predictedRating in topNPredicted[int(userID)]:
if (int(leftOutMovieID) == int(movieID)):
hit = True
break
if (hit):
hits += 1
total += 1
        # Compute overall hit rate
return hits / total
    # Cumulative hit rate: only count left-out items the user rated at or above ratingCutoff
def CumulativeHitRate(topNPredicted, leftOutPredictions, ratingCutoff=0):
hits = 0
total = 0
# For each Left-out rating
for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
            # Only look at the ability to recommend things the users actually liked...
if (actualRating >= ratingCutoff):
# Is it in the predicted top 10 for this user?
hit = False
for movieID, predictedRating in topNPredicted[int(userID)]:
if (int(leftOutMovieID) == int(movieID)):
hit = True
break
if (hit):
hits += 1
total += 1
        # Compute overall hit rate
return hits / total
def RatingHitRate(topNPredicted, leftOutPredictions):
hits = defaultdict(float)
total = defaultdict(float)
# For each Left-out rating
for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
# Is it in the predicted top 10 for this user?
hit = False
for movieID, predictedRating in topNPredicted[int(userID)]:
if (int(leftOutMovieID) == int(movieID)):
hit = True
break
if (hit):
hits[actualRating] += 1
total[actualRating] += 1
        # Compute hit rate for each rating value
result = {}
for rating in sorted(hits.keys()):
# print(rating, hits[rating] / total[rating])
result[rating] = hits[rating] / total[rating]
return result
def AverageReciprocalHitRank(topNPredicted, leftOutPredictions):
summation = 0
total = 0
# For each left-out rating
for userID, leftOutMovieID, actualRating, estimatedRating, _ in leftOutPredictions:
# Is it in the predicted top N for this user?
hitRank = 0
rank = 0
for movieID, predictedRating in topNPredicted[int(userID)]:
rank = rank + 1
                # If the left-out movie shows up, record the rank at which it appeared
if (int(leftOutMovieID) == movieID):
hitRank = rank
break
if (hitRank > 0):
summation += 1.0 / hitRank
total += 1
return summation / total
    # User coverage: what percentage of users have at least one "good" recommendation
def UserCoverage(topNPredicted, numUsers, ratingThreshold=0):
hits = 0
for userID in topNPredicted.keys():
hit = False
for movieID, predictedRating in topNPredicted[userID]:
if (predictedRating >= ratingThreshold):
hit = True
break
if (hit):
hits += 1
return hits / numUsers
    # Diversity: how dissimilar the items within each user's top-N list are to each other
def Diversity(topNPredicted, simsAlgo):
n = 0
total = 0
simsMatrix = simsAlgo.compute_similarities()
for userID in topNPredicted.keys():
            # itertools.combinations yields every length-2 combination of the user's
            # top-N items, in input order and without repeats
pairs = itertools.combinations(topNPredicted[userID], 2)
for pair in pairs:
movie1 = pair[0][0]
movie2 = pair[1][0]
innerID1 = simsAlgo.trainset.to_inner_iid(str(movie1))
innerID2 = simsAlgo.trainset.to_inner_iid(str(movie2))
                # Look up how similar these two recommended items are
similarity = simsMatrix[innerID1][innerID2]
total += similarity
n += 1
S = total / n
return (1 - S)
    # Novelty: average popularity rank of the items we recommend (higher rank = more obscure)
def Novelty(topNPredicted, rankings):
n = 0
total = 0
for userID in topNPredicted.keys():
for rating in topNPredicted[userID]:
movieID = rating[0]
rank = rankings[movieID]
total += rank
n += 1
return total / n
008 [Activity] Walkthrough of TestMetrics.py
# MovieLens.py
import os
import csv
import sys
import re
from surprise import Dataset
from surprise import Reader
from collections import defaultdict
import numpy as np
class MovieLens:
movieID_to_name = {}
name_to_movieID = {}
ratingsPath = '../ml-latest-small/ratings.csv'
moviesPath = '../ml-latest-small/movies.csv'
def loadMovieLensLatestSmall(self):
'''
        Load the MovieLens ratings data and build the movieID <-> movie name lookup dictionaries.
        :return: a surprise Dataset of ratings
'''
# Look for files relative to the directory we are running from
os.chdir(os.path.dirname(sys.argv[0]))
ratingsDataset = 0
self.movieID_to_name = {}
self.name_to_movieID = {}
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)
with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
movieReader = csv.reader(csvfile)
next(movieReader) # Skip header Line
for row in movieReader:
movieID = int(row[0])
movieName = row[1]
self.movieID_to_name[movieID] = movieName
self.name_to_movieID[movieName] = movieID
return ratingsDataset
def getUserRatings(self, user):
userRatings = []
hitUser = False
with open(self.ratingsPath, newline='') as csvfile:
ratingReader = csv.reader(csvfile)
next(ratingReader)
for row in ratingReader:
userID = int(row[0])
if (user == userID):
movieID = int(row[1])
rating = float(row[2])
userRatings.append((movieID, rating))
hitUser = True
if (hitUser and (user != userID)):
break
return userRatings
def getPopularityRanks(self):
ratings = defaultdict(int)
rankings = defaultdict(int)
with open(self.ratingsPath, newline='') as csvfile:
ratingReader = csv.reader(csvfile)
next(ratingReader)
for row in ratingReader:
movieID = int(row[1])
ratings[movieID] += 1
rank = 1
        # Sort movies by how many ratings they received, most-rated first, and assign popularity ranks
for movieID, ratingCount in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
            rankings[movieID] = rank  # this movie's popularity rank
            rank += 1
return rankings
def getGenres(self):
'''
        Build genre vectors: map each movieID to a bitfield over all genres seen in the data.
        :return: dict of movieID -> genre bitfield
'''
genres = defaultdict(list)
        genreIDs = {}  # genre name -> integer genre ID
        maxGenreID = 0  # next unused genre ID
with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
movieReader = csv.reader(csvfile)
next(movieReader) # skip header Line
for row in movieReader:
movieID = int(row[0])
genreList = row[2].split('|')
genreIDList = []
                # Convert this movie's list of genre names into a list of integer genre IDs
for genre in genreList:
                    # Reuse the ID if we've seen this genre before
                    if genre in genreIDs:
                        genreID = genreIDs[genre]
                    else:
                        # Otherwise assign the next unused ID to this genre
genreID = maxGenreID
genreIDs[genre] = genreID
maxGenreID += 1
genreIDList.append(genreID)
genres[movieID] = genreIDList
# Convert integer-encoded genre lists to bitfields that we can treat as vectors
for (movieID, genreIDList) in genres.items():
bitfield = [0] * maxGenreID
for genreID in genreIDList:
bitfield[genreID] = 1
genres[movieID] = bitfield
return genres
def getYears(self):
'''
        Build a movieID -> release year dictionary, parsing the year out of each movie title.
        :return: dict of movieID -> year
'''
p = re.compile(r"(?:\((\d{4})\))?\s*$")
years = defaultdict(int)
with open(self.moviesPath, newline='', encoding='ISO-8859-1') as csvfile:
movieReader = csv.reader(csvfile)
next(movieReader)
for row in movieReader:
movieID = int(row[0])
title = row[1] # like: Toy Story (1995)
                m = p.search(title)  # pull the year out of the title with the regex
year = m.group(1)
if year:
years[movieID] = int(year)
return years
def getMiseEnScene(self):
'''
        Load the mise-en-scene visual features from LLVisualFeatures13K_Log.csv.
        :return: dict of movieID -> list of visual feature values
'''
mes = defaultdict(list)
with open("LLVisualFeatures13K_Log.csv", newline='') as csvfile:
mesReader = csv.reader(csvfile)
next(mesReader)
for row in mesReader:
movieID = int(row[0])
avgShotLength = float(row[1])
meanColorVariance = float(row[2])
stddevColorVariance = float(row[3])
meanMotion = float(row[4])
stddevMotion = float(row[5])
meanLightingKey = float(row[6])
numShots = float(row[7])
mes[movieID] = [avgShotLength, meanColorVariance, stddevColorVariance,
meanMotion, stddevMotion,
meanLightingKey, numShots]
return mes
def getMovieName(self, movieID):
if movieID in self.movieID_to_name:
return self.movieID_to_name[movieID]
else:
return ""
def getMovieID(self, movieName):
if movieName in self.name_to_movieID:
return self.name_to_movieID[movieName]
else:
return 0
# TestMetrics.py
from MovieLens import MovieLens
from surprise import SVD
from surprise import KNNBaseline
from surprise.model_selection import train_test_split, LeaveOneOut
from RecommenderMetrics import RecommenderMetrics
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall() # return surprise.Dataset
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
print("\nComputing item similarities so we can measure diversity later...")
fullTrainSet = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=sim_options)  # KNN baseline model, used only to compute item similarities
simsAlgo.fit(fullTrainSet)
print("\nBuilding recommendation model...")
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)
algo = SVD(random_state=10)  # SVD factorizes the rating matrix into lower-dimensional latent factors
algo.fit(trainSet)
print("\nComputing recommendations...")
predictions = algo.test(testSet)
print("\nEvaluating accuracy of model...")
print("RMSE:", RecommenderMetrics.RMSE(predictions))
print("MAE:", RecommenderMetrics.MAE(predictions))
print("\nEvaluating top-10 recommendations...")
# Set aside one rating per user for testing
LOOCV = LeaveOneOut(n_splits=1, random_state=1)  # cross-validation iterator that leaves one rating out per user
for trainSet, testSet in LOOCV.split(data):
print("Computing recommendations with leave-one-out...")
# Train model without Left-out ratings
algo.fit(trainSet)
# Predicts ratings for Left-out ratings only
print("Predict ratings for left-out set...")
leftOutPredictions = algo.test(testSet)
# Build predictions for all ratings not in the training set
print("Predict all missing ratings...")
# Return a list of ratings that can be used as a testset in the
# :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
# method.
bigTestSet = trainSet.build_anti_testset()
allPredictions = algo.test(bigTestSet)
# Compute top 10 recs for each user
print("Compute top 10 recs per user...")
topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n=10)
# See how often we recommended a movie the user actually rated
print("\nHit Rate:", RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions))
# Break down hit rate by rating value
print("\nrHR (Hit Rate by Rating value):",
RecommenderMetrics.RatingHitRate(topNPredicted, leftOutPredictions))
# See how often we recommended a movie the user actually Liked
print("\ncHR (Cumulative Hit Rate,rating >= 4):",
RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions, 4.0))
# Compute ARHR
print("\nARHR (Average Reciprocal Hit Rank):",
RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions))
print("\nComputing complete recommendations,no hold outs...")
algo.fit(fullTrainSet)
bigTestSet = fullTrainSet.build_anti_testset()
allPredictions = algo.test(bigTestSet)
topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n=10)
# Print user coverage with a minimum predicted rating of 4.0:
print("\nUser coverage:",
RecommenderMetrics.UserCoverage(topNPredicted, fullTrainSet.n_users, ratingThreshold=4.0))
# Measure diversity of recommendations:
print("\nDiversity:", RecommenderMetrics.Diversity(topNPredicted, simsAlgo))
# Measure novelty (average popularity rank of recommendations):
print("\nNovelty (average popularity rank):", RecommenderMetrics.Novelty(topNPredicted, rankings))
009 [Activity] Measure the Performance of SVD Recommendations
04 - A Recommender Engine Framework
001 Our Recommender Engine Architecture
002 [Activity] Recommender Engine Walkthrough, Part 1
# Evaluator.py
from EvaluationData import EvaluationData
from EvaluatedAlgorithm import EvaluatedAlgorithm
class Evaluator:
algorithms = []
def __init__(self, dataset, rankings):
ed = EvaluationData(dataset, rankings)
self.dataset = ed
def AddAlgorithm(self, algorithm, name):
alg = EvaluatedAlgorithm(algorithm, name)
self.algorithms.append(alg)
def Evaluate(self, doTopN):
results = {}
for algorithm in self.algorithms:
print("Evaluating ", algorithm.GetName(), "...")
results[algorithm.GetName()] = algorithm.Evaluate(self.dataset, doTopN)
# Print results
print("\n")
if (doTopN):
print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<10}".format(
"Algorithm", "RMSE", "MAE", "HR", "CHR", "ARHR", "Coverage", "Diversity", "Novelty"))
for (name, metrics) in results.items():
print("{:<10} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f} {:<10.4f}".format(
name, metrics["RMSE"], metrics["MAE"], metrics["HR"], metrics["CHR"], metrics["ARHR"],
metrics["Coverage"], metrics["Diversity"], metrics["Novelty"]))
else:
print("{:<10} {:<10} {:<10}".format("Algorithm", "RMSE", "MAE"))
for (name, metrics) in results.items():
print("{:<10} {:<10.4f} {:<10.4f}".format(name, metrics["RMSE"], metrics["MAE"]))
print("\nLegend:\n")
print("RMSE: Root Mean Squared Error.Lower values mean better accuracy.")
print("MAE: Mean Absolute Error.Lower values mean better accuracy.")
if (doTopN):
print("HR: Hit Rate;how often we are able to recommend a left-out rating.Higher is better.")
print(
"cHR: Cumulative Hit Rate;hit rate,confined to ratings above a certain threshold.Higher is better.")
print(
"ARHR: Average Reciprocal Hit Rank-Hit rate that takes the ranking into account.Higher is better.")
print("Coverage:Ratio of users for whom recommendations above a certain threshold exist.Higher is better.")
print(
"Diversity:1-S,where s is the average similarity score between every possible pair of recommendations")
print(" for a given user.Higher means more diverse.")
print("Novelty: Average popularity rank of recommended items.Higher means more novel.")
def SampleTopNRecs(self, ml, testSubject=85, k=10):
for algo in self.algorithms:
print("\nUsing recommender ", algo.GetName())
print("\nBuilding recommendation model...")
trainSet = self.dataset.GetFullTrainSet()
algo.GetAlgorithm().fit(trainSet)
print("Computing recommendations...")
            # Build the anti-test set: every movie this user has NOT rated yet
testSet = self.dataset.GetAntiTestSetForUser(testSubject)
            # Predict ratings for all of those unseen movies
predictions = algo.GetAlgorithm().test(testSet)
recommendations = []
print("\nWe recommend:")
for userID, movieID, actualRating, estimatedRating, _ in predictions:
intMovieID = int(movieID)
recommendations.append((intMovieID, estimatedRating))
recommendations.sort(key=lambda x: x[1], reverse=True)
for ratings in recommendations[:10]:
print(ml.getMovieName(ratings[0]), ratings[1])
# RecsBakeOff.py
# -*- coding: utf-8 -*-
from MovieLens import MovieLens
from surprise import SVD
from surprise import NormalPredictor
from Evaluator import Evaluator
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(evaluationData, rankings) = LoadMovieLensData()
# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)
# Throw in an SVD recommender
SVDAlgorithm = SVD(random_state=10)
evaluator.AddAlgorithm(SVDAlgorithm, "SVD")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
# Fight!
evaluator.Evaluate(True)
003 [Activity] Recommender Engine Walkthrough, Part 2
# EvaluatedAlgorithm.py
from RecommenderMetrics import RecommenderMetrics
from EvaluationData import EvaluationData
class EvaluatedAlgorithm:
def __init__(self, algorithm, name):
self.algorithm = algorithm
self.name = name
def Evaluate(self, evaluationData, doTopN, n=10, verbose=True):
metrics = {}
# Compute accuracy
if (verbose):
print("Evaluating accuracy...")
self.algorithm.fit(evaluationData.GetTrainSet())
predictions = self.algorithm.test(evaluationData.GetTestSet())
metrics["RMSE"] = RecommenderMetrics.RMSE(predictions)
metrics["MAE"] = RecommenderMetrics.MAE(predictions)
if (doTopN):
# Evaluate top-10 with Leave One Out testing
if (verbose):
print("Evaluating top-N with leave-one-out...")
self.algorithm.fit(evaluationData.GetLOOCVTrainSet())
leftOutPredictions = self.algorithm.test(evaluationData.GetLOOCVTestSet())
# Build predictions for all ratings not in the training set
allPredictions = self.algorithm.test(evaluationData.GetLOOCVAntiTestSet())
# Compute top 10 recs for each user
topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
if (verbose):
print("Computing hit-rate and rank metrics...")
# See how often we recommended a movie the user actually rated
metrics["HR"] = RecommenderMetrics.HitRate(topNPredicted, leftOutPredictions)
# See how often we recommended a movie the user actually Liked
metrics["CHR"] = RecommenderMetrics.CumulativeHitRate(topNPredicted, leftOutPredictions)
# Compute ARHR
metrics["ARHR"] = RecommenderMetrics.AverageReciprocalHitRank(topNPredicted, leftOutPredictions)
# Evaluate properties of recommendations on full training set
if (verbose):
print("Computing recommendations with full data set...")
self.algorithm.fit(evaluationData.GetFullTrainSet())
allPredictions = self.algorithm.test(evaluationData.GetFullAntiTestSet())
topNPredicted = RecommenderMetrics.GetTopN(allPredictions, n)
if (verbose):
print("Analyzing coverage,diversity,and novelty...")
# Print user coverage with a minimum predicted rating of 4.0:
metrics["Coverage"] = RecommenderMetrics.UserCoverage(topNPredicted,
evaluationData.GetFullTrainSet().n_users,
ratingThreshold=4.0)
# Measure diversity of recommendations:
metrics["Diversity"] = RecommenderMetrics.Diversity(topNPredicted, evaluationData.GetSimilarities())
# Measure novelty (average popularity rank of recommendations):
metrics["Novelty"] = RecommenderMetrics.Novelty(topNPredicted,
evaluationData.GetPopularityRankings())
if (verbose):
print("Analysis complete.")
return metrics
def GetName(self):
return self.name
def GetAlgorithm(self):
return self.algorithm
# EvaluationData.py
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from surprise import KNNBaseline
class EvaluationData:
def __init__(self, data, popularityRankings):
self.rankings = popularityRankings
# Build a full training set for evaluating overall properties
self.fullTrainSet = data.build_full_trainset()
self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
# Build a 75/25 train/test split for measuring accuracy
self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)
# Build a "Leave one out"train/test split for evaluating top-N recommenders
# And build an anti-test-set for building predictions
LOOCV = LeaveOneOut(n_splits=1, random_state=1)
for train, test in LOOCV.split(data):
self.LOOCVTrain = train
self.LOOCVTest = test
self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
        # Compute a similarity matrix between items so we can measure diversity
sim_options = {'name': 'cosine', 'user_based': False}
        # KNN baseline model used to build the item similarity matrix
self.simsAlgo = KNNBaseline(sim_options=sim_options)
self.simsAlgo.fit(self.fullTrainSet)
def GetFullTrainSet(self):
return self.fullTrainSet
def GetFullAntiTestset(self):
return self.fullAntiTestSet
def GetAntiTestSetForUser(self, testSubject):
trainset = self.fullTrainSet
fill = trainset.global_mean
anti_testset = []
u = trainset.to_inner_uid(str(testSubject))
user_items = set([j for (j, _) in trainset.ur[u]])
anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
i in trainset.all_items() if
i not in user_items]
return anti_testset
def GetTrainSet(self):
return self.trainSet
def GetTestSet(self):
return self.testSet
def GetLOOCVTrainSet(self):
return self.LOOCVTrain
def GetLOOCVTestSet(self):
return self.LOOCVTest
def GetLOOCVAntiTestSet(self):
return self.LOOCVAntiTestSet
def GetFullAntiTestSet(self):
return self.fullAntiTestSet
def GetSimilarities(self):
        return self.simsAlgo
def GetPopularityRankings(self):
return self.rankings
004 [Activity] Review the Results of our Algorithm Evaluation
05 - Content-Based Filtering
001 Content-Based Recommendations, and the Cosine Similarity Metric
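At its core this is just the cosine similarity between two attribute vectors. A tiny sketch of the formula (my own illustration, not course code; the same math reappears below in computeGenreSimilarity, applied to genre bitfields):
import math
def cosine_similarity(x, y):
    sumxx = sum(a * a for a in x)
    sumyy = sum(b * b for b in y)
    sumxy = sum(a * b for a, b in zip(x, y))
    return sumxy / math.sqrt(sumxx * sumyy)
# Hypothetical genre bitfields: each movie shares one of its two genres with the other
movie_a = [1, 1, 0, 0]
movie_b = [1, 0, 1, 0]
print(cosine_similarity(movie_a, movie_b))  # 0.5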
002 K-Nearest-Neighbors and Content Recs
003 [Activity] Producing and Evaluating Content-Based Movie Recommendations
# ContentKNNAlgorithm.py
from surprise import AlgoBase
from surprise import PredictionImpossible
from MovieLens import MovieLens
import math
import numpy as np
import heapq
class ContentKNNAlgorithm(AlgoBase):
def __init__(self, k=40, sim_options={}):
AlgoBase.__init__(self)
self.k = k
def fit(self, trainset):
AlgoBase.fit(self, trainset)
# Compute item similarity matrix based on content attributes
# Load up genre vectors for every movie
ml = MovieLens()
genres = ml.getGenres()
years = ml.getYears()
mes = ml.getMiseEnScene()
print("Computing content-based similarity matrix...")
        # Compute a content-based similarity score for every pair of movies, stored in an n_items x n_items matrix
self.similarities = np.zeros((self.trainset.n_items, self.trainset.n_items))
for thisRating in range(self.trainset.n_items):
if (thisRating % 100 == 0):
print(thisRating, " of ", self.trainset.n_items)
for otherRating in range(thisRating + 1, self.trainset.n_items):
thisMovieID = int(self.trainset.to_raw_iid(thisRating))
otherMovieID = int(self.trainset.to_raw_iid(otherRating))
genreSimilarity = self.computeGenreSimilarity(thisMovieID, otherMovieID, genres)
yearSimilarity = self.computeYearSimilarity(thisMovieID, otherMovieID, years)
                # mesSimilarity = self.computeMiseEnSceneSimilarity(thisMovieID, otherMovieID, mes)
self.similarities[thisRating, otherRating] = genreSimilarity * yearSimilarity
self.similarities[otherRating, thisRating] = self.similarities[thisRating, otherRating]
print("...done.")
return self
    # Cosine similarity between two movies' genre bitfields
    def computeGenreSimilarity(self, movie1, movie2, genres):
        genres1 = genres[movie1]
genres2 = genres[movie2]
sumxx, sumxy, sumyy = 0, 0, 0
for i in range(len(genres1)):
x = genres1[i]
y = genres2[i]
sumxx += x * x
sumyy += y * y
sumxy += x * y
return sumxy / math.sqrt(sumxx * sumyy)
    # Year similarity: decays exponentially as the gap between release years grows
def computeYearSimilarity(self, movie1, movie2, years):
diff = abs(years[movie1] - years[movie2])
sim = math.exp(-diff / 10.0)
return sim
def computeMiseEnSceneSimilarity(self, movie1, movie2, mes):
mes1 = mes[movie1]
mes2 = mes[movie2]
if (mes1 and mes2):
shotLengthDiff = math.fabs(mes1[0] - mes2[0])
colorVarianceDiff = math.fabs(mes1[1] - mes2[1])
motionDiff = math.fabs(mes1[3] - mes2[3])
lightingDiff = math.fabs(mes1[5] - mes2[5])
numShotsDiff = math.fabs(mes1[6] - mes2[6])
return shotLengthDiff * colorVarianceDiff * motionDiff * lightingDiff * numShotsDiff
else:
return 0
def estimate(self, u, i):
if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
# Build up similarity scores between this item and everything the user rated
neighbors = []
for rating in self.trainset.ur[u]:
genreSimilarity = self.similarities[i, rating[0]]
neighbors.append((genreSimilarity, rating[1]))
        # Extract the top-K most-similar ratings
# nlargest: Find the n largest elements in a dataset.
# Equivalent to: sorted(iterable, key=key, reverse=True)[:n]
k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])
# Compute average sim score of K neighbors weighted by user ratings
simTotal = weightedSum = 0
for (simScore, rating) in k_neighbors:
if (simScore > 0):
simTotal += simScore
weightedSum += simScore * rating
if (simTotal == 0):
raise PredictionImpossible('No neighbors')
predictedRating = weightedSum / simTotal
return predictedRating
# ContentRecs.py
from MovieLens import MovieLens
from ContentKNNAlgorithm import ContentKNNAlgorithm
from Evaluator import Evaluator
from surprise import NormalPredictor
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()
# Construct an Evaluator to,you know,evaluate them
evaluator = Evaluator(evaluationData, rankings)
contentKNN = ContentKNNAlgorithm()
evaluator.AddAlgorithm(contentKNN, "ContentKNN")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
evaluator.Evaluate(False)
evaluator.SampleTopNRecs(ml)
004 A Note on Using Implicit Ratings
005 [Activity] Bleeding Edge Alert! Mise en Scene Recommendations
006 [Exercise] Dive Deeper into Content-Based Recommendations
06 - Neighborhood-Based Collaborative Filtering
001 Measuring Similarity, and Sparsity
Collaborative filtering: recommend items based on other people's ratings and behavior.
002 Similarity Metrics
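A small sketch (mine, not course code) contrasting two of the metrics covered here on made-up rating vectors: plain cosine similarity versus Pearson correlation, which is just cosine similarity after mean-centering each user's ratings.
import numpy as np
def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
def pearson(a, b):
    # Pearson correlation = cosine of the mean-centered vectors
    return cosine(a - a.mean(), b - b.mean())
alice = np.array([5.0, 4.0, 1.0])
bob = np.array([4.0, 5.0, 2.0])
print("cosine :", cosine(alice, bob))
print("pearson:", pearson(alice, bob))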
003 User-based Collaborative Filtering
004 [Activity] User-based Collaborative Filtering, Hands-On
# -*- coding: utf-8 -*-
"""
Created on Wed May 9 10:10:04 2018
@author: Frank
"""
# SimpleUserCF.py
from MovieLens import MovieLens
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter
testSubject = '85'
k = 10
# Load our data set and compute the user similarity matrix
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()
trainSet = data.build_full_trainset()
sim_options = {'name': 'cosine',
'user_based': True
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()
# Get top N similar users to our test subject
# (Alternate approach would be to select users up to some similarity threshold - try it!)
testUserInnerID = trainSet.to_inner_uid(testSubject)
similarityRow = simsMatrix[testUserInnerID]  # similarity of user 85 to every other user
similarUsers = []  # (innerID, similarity) pairs for every other user
for innerID, score in enumerate(similarityRow):
if (innerID != testUserInnerID):
similarUsers.append((innerID, score))
# Keep the k users most similar to our test subject
kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates = defaultdict(float)
for similarUser in kNeighbors:
    innerID = similarUser[0]  # similar user's inner ID
    userSimilarityScore = similarUser[1]  # how similar they are to user 85
    theirRatings = trainSet.ur[innerID]  # everything this similar user rated
for rating in theirRatings:
        # Weight each of their rated items by (rating / 5) * user similarity
candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
# Build a dictionary of stuff the user has already seen
watched = {}
for itemID, rating in trainSet.ur[testUserInnerID]:
    watched[itemID] = 1  # mark movies the test subject has already rated
# Get top-rated items from similar users:
pos = 0
# Walk the candidates in descending order of aggregated score
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    # Skip anything the test subject has already seen
if not itemID in watched:
movieID = trainSet.to_raw_iid(itemID)
print(ml.getMovieName(int(movieID)), ratingSum)
pos += 1
if (pos > 10):
break
005 Item-based Collaborative Filtering
Item-based collaborative filtering (good for new users: they can get sensible recommendations right away, whereas user-based CF needs the new user to build up ratings or purchase history first).
006 [Activity] Item-based Collaborative Filtering, Hands-On
# -*- coding: utf-8 -*-
"""
Created on Wed May 9 10:10:04 2018
@author: Frank
"""
# SimpleItemCF.py - item-based collaborative filtering
from MovieLens import MovieLens
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter
testSubject = '85'
k = 10
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()
trainSet = data.build_full_trainset()
sim_options = {'name': 'cosine',
'user_based': False
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()
testUserInnerID = trainSet.to_inner_uid(testSubject)
# Get the top K items we rated
testUserRatings = trainSet.ur[testUserInnerID]  # the test subject's own ratings
# Keep the k items they rated the highest
kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])
# Get similar items to stuff we liked (weighted by rating)
candidates = defaultdict(float)
for itemID, rating in kNeighbors:
    similarityRow = simsMatrix[itemID]  # similarity of every other item to this liked item
for innerID, score in enumerate(similarityRow):
        # Accumulate each candidate item's score, weighted by the user's rating
candidates[innerID] += score * (rating / 5.0)
# Build a dictionary of stuff the user has already seen
watched = {}  # movies the test subject has already rated
for itemID, rating in trainSet.ur[testUserInnerID]:
watched[itemID] = 1
# Get top-rated items from similar users:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
if not itemID in watched:
movieID = trainSet.to_raw_iid(itemID)
print(ml.getMovieName(int(movieID)), ratingSum)
pos += 1
if (pos > 10):
break
007 [Exercise] Tuning Collaborative Filtering Algorithms
008 [Activity] Evaluating Collaborative Filtering Systems Offline
# -*- coding: utf-8 -*-
"""
Created on Wed May 9 10:10:04 2018
@author: Frank
"""
# EvaluateUserCF.py
from MovieLens import MovieLens
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter
from surprise.model_selection import LeaveOneOut
from RecommenderMetrics import RecommenderMetrics
from EvaluationData import EvaluationData
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
ml, data, rankings = LoadMovieLensData()
evalData = EvaluationData(data, rankings)
# Train on leave-One-Out train set
trainSet = evalData.GetLOOCVTrainSet()
sim_options = {'name': 'cosine',
'user_based': True}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()
leftOutTestSet = evalData.GetLOOCVTestSet()
# Build up a dict mapping each user to a list of (int(movieID), predicted rating) pairs
topN = defaultdict(list)
k = 10
for uiid in range(trainSet.n_users):
# Get top N similar users to this one
similarityRow = simsMatrix[uiid]
similarUsers = []
for innerID, score in enumerate(similarityRow):
if (innerID != uiid):
similarUsers.append((innerID, score))
kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates = defaultdict(float)
for similarUser in kNeighbors:
innerID = similarUser[0]
userSimilarityScore = similarUser[1]
theirRatings = trainSet.ur[innerID]
for rating in theirRatings:
candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore
# Build a dictionary of stuff the user has already seen
watched = {}
for itemID, rating in trainSet.ur[uiid]:
watched[itemID] = 1
# Get top-rated items from similar users:
pos = 0
for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
if not itemID in watched:
movieID = trainSet.to_raw_iid(itemID)
topN[int(trainSet.to_raw_uid(uiid))].append((int(movieID), 0.0))
pos += 1
if (pos > 40):
break
# Measure
print("HR", RecommenderMetrics.HitRate(topN, leftOutTestSet))
009 [Exercise] Measure the Hit Rate of Item-Based Collaborative Filtering
010 KNN Recommenders
011 [Activity] Running User and Item-Based KNN on MovieLens
# -*- coding: utf-8 -*-
"""
Created on Thu May 3 11:11:13 2018
@author: Frank
"""
# KNNBakeOff.py
from MovieLens import MovieLens
from surprise import KNNBasic
from surprise import NormalPredictor
from Evaluator import Evaluator
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()
# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)
# User-based KNN
UserKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN, "User KNN")
# Item-based KNN
ItemKNN = KNNBasic(sim_options = {'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN, "Item KNN")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
# Fight!
evaluator.Evaluate(False)
evaluator.SampleTopNRecs(ml)
012 [Exercise] Experiment with different KNN parameters
KNN often performs poorly here: ratings are not really continuous data, but KNN assumes they are, and it is sensitive to sparse data. KNN is better suited to classification problems than to rating prediction.
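One way to attack the exercise above (a sketch of my own, not the course's official solution): reuse Surprise's GridSearchCV, which accepts a nested grid for sim_options, to try different k values and similarity metrics for KNNBasic.
# KNNTuning sketch - grid-search KNNBasic hyperparameters
from MovieLens import MovieLens
from surprise import KNNBasic
from surprise.model_selection import GridSearchCV
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()
param_grid = {'k': [20, 40, 60],
              'sim_options': {'name': ['cosine', 'pearson'],
                              'user_based': [True, False]}}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
print("Best RMSE score attained: ", gs.best_score['rmse'])
print(gs.best_params['rmse'])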
013 Bleeding Edge Alert! Translation-Based Recommendations
07 - Matrix Factorization Methods
001 Principal Component Analysis (PCA)
Collaborative filtering has limited scalability; large data sets can require a huge amount of computation.
002 Singular Value Decomposition
Various techniques for learning the best way to fill in the missing values of the sparse rating matrix.
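A toy illustration of the low-rank idea (my own, not course code): factor a small, fully-known rating matrix with numpy's SVD, keep only the top-k singular values, and rebuild an approximation. Real recommenders learn the factors from the observed ratings only (e.g. with SGD, as Surprise's SVD does), but the reconstruction intuition is the same.
import numpy as np
# A tiny, fully-known user x item rating matrix (made up)
R = np.array([[5.0, 4.0, 1.0, 1.0],
              [4.0, 5.0, 1.0, 2.0],
              [1.0, 1.0, 5.0, 4.0]])
U, sigma, Vt = np.linalg.svd(R, full_matrices=False)
k = 2  # keep only the top-2 latent factors
R_approx = U[:, :k] @ np.diag(sigma[:k]) @ Vt[:k, :]
print(np.round(R_approx, 2))  # close to R, but expressed through k latent factors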
003 [Activity] Running SVD and SVD++ on MovieLens
# -*- coding: utf-8 -*-
"""
Created on Thu May 3 11:11:13 2018
@author: Frank
"""
# SVDBakeOff.py
from MovieLens import MovieLens
from surprise import SVD, SVDpp
from surprise import NormalPredictor
from Evaluator import Evaluator
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()
# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)
# SVD
SVD = SVD()
evaluator.AddAlgorithm(SVD, "SVD")
# SVD++
SVDPlusPlus = SVDpp()
evaluator.AddAlgorithm(SVDPlusPlus, "SVD++")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
# Fight!
evaluator.Evaluate(False)
evaluator.SampleTopNRecs(ml)
004 Improving on SVD
005 [Exercise] Tune the hyperparameters on SVD
Use a grid search to find the best hyperparameters.
# -*- coding: utf-8 -*-
"""
Created on Thu May 3 11:11:13 2018
@author: Frank
"""
# SVDTuning.py
from MovieLens import MovieLens
from surprise import SVD
from surprise import NormalPredictor
from Evaluator import Evaluator
from surprise.model_selection import GridSearchCV
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()
print("Searching for best parameters...")
param_grid = {'n_epochs': [20, 30], 'lr_all': [0.005, 0.010],
'n_factors': [50, 100]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(evaluationData)
# best RMSE score
print("Best RMSE score attained: ", gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)
params = gs.best_params['rmse']
SVDtuned = SVD(n_epochs = params['n_epochs'], lr_all = params['lr_all'], n_factors = params['n_factors'])
evaluator.AddAlgorithm(SVDtuned, "SVD - Tuned")
SVDUntuned = SVD()
evaluator.AddAlgorithm(SVDUntuned, "SVD - Untuned")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
# Fight!
evaluator.Evaluate(False)
evaluator.SampleTopNRecs(ml)
006 Bleeding Edge Alert! Sparse Linear Methods (SLIM)
09 - Deep Learning for Recommender Systems
001 Intro to Deep Learning for Recommenders
002 Restricted Boltzmann Machines (RBM’s)
003 [Activity] Recommendations with RBM’s, part 1
# RBM.py
import numpy as np
import tensorflow as tf  # requires TensorFlow 1.x (e.g. 1.15 on Python 3.6)
from tensorflow.python.framework import ops
class RBM(object):
def __init__(self, visibleDimensions, epochs=20, hiddenDimensions=50, ratingValues=10, learningRate=0.001,
batchSize=100):
        # Number of visible (input) nodes
self.visibleDimensions = visibleDimensions
self.epochs = epochs
        # Number of hidden nodes (neurons in the hidden layer)
self.hiddenDimensions = hiddenDimensions
self.ratingValues = ratingValues
self.learningRate = learningRate
self.batchSize = batchSize
def Train(self, X):
ops.reset_default_graph()
self.MakeGraph()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
for epoch in range(self.epochs):
np.random.shuffle(X)
trX = np.array(X)
            # Step through the training data in mini-batches of size batchSize
for i in range(0, trX.shape[0], self.batchSize):
self.sess.run(self.update, feed_dict={self.X: trX[i:i + self.batchSize]})
print("Trained epoch ", epoch)
def GetRecommendations(self, inputUser):
hidden = tf.nn.sigmoid(tf.matmul(self.X, self.weights) + self.hiddenBias)
visible = tf.nn.sigmoid(tf.matmul(hidden, tf.transpose(self.weights)) + self.visibleBias)
feed = self.sess.run(hidden, feed_dict={self.X: inputUser})
rec = self.sess.run(visible, feed_dict={hidden: feed})
return rec[0]
def MakeGraph(self):
tf.set_random_seed(0)
# Create variables for the graph, weights and biases
self.X = tf.placeholder(tf.float32, [None, self.visibleDimensions], name="X")
# Initialize weights randomly
maxWeight = -4.0 * np.sqrt(6.0 / (self.hiddenDimensions + self.visibleDimensions))
self.weights = tf.Variable(
tf.random_uniform([self.visibleDimensions, self.hiddenDimensions], minval=-maxWeight, maxval=maxWeight),
tf.float32, name="weights")
self.hiddenBias = tf.Variable(tf.zeros([self.hiddenDimensions], tf.float32, name="hiddenBias"))
self.visibleBias = tf.Variable(tf.zeros([self.visibleDimensions], tf.float32, name="visibleBias"))
# Perform Gibbs Sampling for Contrastive Divergence, per the paper we assume k=1 instead of iterating over the
# forward pass multiple times since it seems to work just fine
# Forward pass
# Sample hidden layer given visible...
# Get tensor of hidden probabilities
hProb0 = tf.nn.sigmoid(tf.matmul(self.X, self.weights) + self.hiddenBias)
# Sample from all of the distributions
hSample = tf.nn.relu(tf.sign(hProb0 - tf.random_uniform(tf.shape(hProb0))))
# Stitch it together
        forward = tf.matmul(tf.transpose(self.X), hSample)  # matrix product of input and hidden sample (positive phase)
# Backward pass
# Reconstruct visible layer given hidden layer sample
v = tf.matmul(hSample, tf.transpose(self.weights)) + self.visibleBias
# Build up our mask for missing ratings
vMask = tf.sign(self.X) # Make sure everything is 0 or 1
vMask3D = tf.reshape(vMask,
[tf.shape(v)[0], -1, self.ratingValues]) # Reshape into arrays of individual ratings
        vMask3D = tf.reduce_max(vMask3D, axis=[2], keepdims=True)  # reduce_max gives 1 for ratings that exist and 0 for missing ratings
# Extract rating vectors for each individual set of 10 rating binary values
v = tf.reshape(v, [tf.shape(v)[0], -1, self.ratingValues])
vProb = tf.nn.softmax(v * vMask3D) # Apply softmax activation function
vProb = tf.reshape(vProb, [tf.shape(v)[0],
-1]) # And shove them back into the flattened state. Reconstruction is done now.
# Stitch it together to define the backward pass and updated hidden biases
hProb1 = tf.nn.sigmoid(tf.matmul(vProb, self.weights) + self.hiddenBias)
backward = tf.matmul(tf.transpose(vProb), hProb1)
# Now define what each epoch will do...
# Run the forward and backward passes, and update the weights
weightUpdate = self.weights.assign_add(self.learningRate * (forward - backward))
# Update hidden bias, minimizing the divergence in the hidden nodes
hiddenBiasUpdate = self.hiddenBias.assign_add(self.learningRate * tf.reduce_mean(hProb0 - hProb1, 0))
        # Update the visible bias, minimizing divergence in the visible results
visibleBiasUpdate = self.visibleBias.assign_add(self.learningRate * tf.reduce_mean(self.X - vProb, 0))
self.update = [weightUpdate, hiddenBiasUpdate, visibleBiasUpdate]
004 [Activity] Recommendations with RBM’s, part 2
# -*- coding: utf-8 -*-
"""
Created on Fri May 4 13:08:25 2018
@author: Frank
"""
# RBMAlgorithm.py
from surprise import AlgoBase
from surprise import PredictionImpossible
import numpy as np
from RBM import RBM
class RBMAlgorithm(AlgoBase):
def __init__(self, epochs=20, hiddenDim=100, learningRate=0.001, batchSize=100, sim_options={}):
AlgoBase.__init__(self)
self.epochs = epochs
self.hiddenDim = hiddenDim
self.learningRate = learningRate
self.batchSize = batchSize
def softmax(self, x):
return np.exp(x) / np.sum(np.exp(x), axis=0)
def fit(self, trainset):
AlgoBase.fit(self, trainset)
numUsers = trainset.n_users
numItems = trainset.n_items
trainingMatrix = np.zeros([numUsers, numItems, 10], dtype=np.float32)
for (uid, iid, rating) in trainset.all_ratings():
adjustedRating = int(float(rating) * 2.0) - 1
trainingMatrix[int(uid), int(iid), adjustedRating] = 1
# Flatten to a 2D array, with nodes for each possible rating type on each possible item, for every user.
trainingMatrix = np.reshape(trainingMatrix, [trainingMatrix.shape[0], -1])
# Create an RBM with (num items * rating values) visible nodes
rbm = RBM(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate,
batchSize=self.batchSize, epochs=self.epochs)
rbm.Train(trainingMatrix)
self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
for uiid in range(trainset.n_users):
if (uiid % 50 == 0):
print("Processing user ", uiid)
recs = rbm.GetRecommendations([trainingMatrix[uiid]])
recs = np.reshape(recs, [numItems, 10])
for itemID, rec in enumerate(recs):
# The obvious thing would be to just take the rating with the highest score:
# rating = rec.argmax()
# ... but this just leads to a huge multi-way tie for 5-star predictions.
# The paper suggests performing normalization over K values to get probabilities
# and take the expectation as your prediction, so we'll do that instead:
normalized = self.softmax(rec)
rating = np.average(np.arange(10), weights=normalized)
self.predictedRatings[uiid, itemID] = (rating + 1) * 0.5
return self
def estimate(self, u, i):
if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
rating = self.predictedRatings[u, i]
if (rating < 0.001):
raise PredictionImpossible('No valid prediction exists.')
return rating
005 [Activity] Evaluating the RBM Recommender
# -*- coding: utf-8 -*-
"""
Created on Thu May 3 11:11:13 2018
@author: Frank
"""
# RBMBakeOff.py
from MovieLens import MovieLens
from RBMAlgorithm import RBMAlgorithm
from surprise import NormalPredictor
from Evaluator import Evaluator
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()
# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)
#RBM
RBM = RBMAlgorithm(epochs=20)
evaluator.AddAlgorithm(RBM, "RBM")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
# Fight!
evaluator.Evaluate(True)
evaluator.SampleTopNRecs(ml)
006 [Exercise] Tuning Restricted Boltzmann Machines
007 Exercise Results Tuning a RBM Recommender
# -*- coding: utf-8 -*-
"""
Created on Thu May 3 11:11:13 2018
@author: Frank
"""
# RBMTuning.py
from MovieLens import MovieLens
from RBMAlgorithm import RBMAlgorithm
from surprise import NormalPredictor
from Evaluator import Evaluator
from surprise.model_selection import GridSearchCV
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()
print("Searching for best parameters...")
param_grid = {'hiddenDim': [20, 10], 'learningRate': [0.1, 0.01]}
gs = GridSearchCV(RBMAlgorithm, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(evaluationData)
# best RMSE score
print("Best RMSE score attained: ", gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)
params = gs.best_params['rmse']
RBMtuned = RBMAlgorithm(hiddenDim = params['hiddenDim'], learningRate = params['learningRate'])
evaluator.AddAlgorithm(RBMtuned, "RBM - Tuned")
RBMUntuned = RBMAlgorithm()
evaluator.AddAlgorithm(RBMUntuned, "RBM - Untuned")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
# Fight!
evaluator.Evaluate(False)
evaluator.SampleTopNRecs(ml)
008 Auto-Encoders for Recommendations Deep Learning for Recs
009 [Activity] Recommendations with Deep Neural Networks
If the data is too sparse (not enough rating samples), there are too many missing or zero values and the neural network doesn't perform well.
# AutoRec.py
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
class AutoRec(object):
def __init__(self, visibleDimensions, epochs=200, hiddenDimensions=50, learningRate=0.1, batchSize=100):
self.visibleDimensions = visibleDimensions
self.epochs = epochs
self.hiddenDimensions = hiddenDimensions
self.learningRate = learningRate
self.batchSize = batchSize
def Train(self, X):
ops.reset_default_graph()
self.MakeGraph()
init = tf.global_variables_initializer()
self.sess = tf.Session()
self.sess.run(init)
npX = np.array(X)
for epoch in range(self.epochs):
for i in range(0, npX.shape[0], self.batchSize):
epochX = npX[i:i + self.batchSize]
self.sess.run(self.update, feed_dict={self.inputLayer: epochX})
print("Trained epoch ", epoch)
def GetRecommendations(self, inputUser):
# Feed through a single user and return predictions from the output layer.
rec = self.sess.run(self.outputLayer, feed_dict={self.inputLayer: inputUser})
return rec[0]
def MakeGraph(self):
tf.set_random_seed(0)
        # Create variables for weights for the encoding (visible->hidden) and decoding (hidden->output) stages, randomly initialized
self.encoderWeights = {
'weights': tf.Variable(tf.random_normal([self.visibleDimensions, self.hiddenDimensions]))}
self.decoderWeights = {
'weights': tf.Variable(tf.random_normal([self.hiddenDimensions, self.visibleDimensions]))}
# Create biases
self.encoderBiases = {'biases': tf.Variable(tf.random_normal([self.hiddenDimensions]))}
self.decoderBiases = {'biases': tf.Variable(tf.random_normal([self.visibleDimensions]))}
# Create the input layer
self.inputLayer = tf.placeholder('float', [None, self.visibleDimensions])
# hidden layer
hidden = tf.nn.sigmoid(
tf.add(tf.matmul(self.inputLayer, self.encoderWeights['weights']), self.encoderBiases['biases']))
# output layer for our predictions.
self.outputLayer = tf.nn.sigmoid(
tf.add(tf.matmul(hidden, self.decoderWeights['weights']), self.decoderBiases['biases']))
# Our "true" labels for training are copied from the input layer.
self.labels = self.inputLayer
# loss function and optimizer. Try other optimizers, like Adam!
loss = tf.losses.mean_squared_error(self.labels, self.outputLayer)
optimizer = tf.train.RMSPropOptimizer(self.learningRate).minimize(loss)
# What we evaluate each batch.
self.update = [optimizer, loss]
# -*- coding: utf-8 -*-
"""
Created on Fri May 4 13:08:25 2018
@author: Frank
"""
# AutoRecAlgorithm.py
from surprise import AlgoBase
from surprise import PredictionImpossible
import numpy as np
from AutoRec import AutoRec
class AutoRecAlgorithm(AlgoBase):
def __init__(self, epochs=100, hiddenDim=100, learningRate=0.01, batchSize=100, sim_options={}):
AlgoBase.__init__(self)
self.epochs = epochs
self.hiddenDim = hiddenDim
self.learningRate = learningRate
self.batchSize = batchSize
def fit(self, trainset):
AlgoBase.fit(self, trainset)
numUsers = trainset.n_users
numItems = trainset.n_items
trainingMatrix = np.zeros([numUsers, numItems], dtype=np.float32)
for (uid, iid, rating) in trainset.all_ratings():
trainingMatrix[int(uid), int(iid)] = rating / 5.0
        # Create an AutoRec autoencoder with one visible node per item
autoRec = AutoRec(trainingMatrix.shape[1], hiddenDimensions=self.hiddenDim, learningRate=self.learningRate,
batchSize=self.batchSize, epochs=self.epochs)
autoRec.Train(trainingMatrix)
self.predictedRatings = np.zeros([numUsers, numItems], dtype=np.float32)
for uiid in range(trainset.n_users):
if (uiid % 50 == 0):
print("Processing user ", uiid)
recs = autoRec.GetRecommendations([trainingMatrix[uiid]])
for itemID, rec in enumerate(recs):
self.predictedRatings[uiid, itemID] = rec * 5.0
return self
def estimate(self, u, i):
if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
rating = self.predictedRatings[u, i]
if (rating < 0.001):
raise PredictionImpossible('No valid prediction exists.')
return rating
# -*- coding: utf-8 -*-
"""
Created on Thu May 3 11:11:13 2018
@author: Frank
"""
# AutoRecBakeOff.py
from MovieLens import MovieLens
from AutoRecAlgorithm import AutoRecAlgorithm
from surprise import NormalPredictor
from Evaluator import Evaluator
import random
import numpy as np
def LoadMovieLensData():
ml = MovieLens()
print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()
return (ml, data, rankings)
np.random.seed(0)
random.seed(0)
# Load up common data set for the recommender algorithms
(ml, evaluationData, rankings) = LoadMovieLensData()
# Construct an Evaluator to, you know, evaluate them
evaluator = Evaluator(evaluationData, rankings)
#Autoencoder
AutoRec = AutoRecAlgorithm()
evaluator.AddAlgorithm(AutoRec, "AutoRec")
# Just make random recommendations
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")
# Fight!
evaluator.Evaluate(True)
evaluator.SampleTopNRecs(ml)
010 Clickstream Recommendations with RNN’s
Session-based recommendations: use an RNN over the user's clickstream to predict what they will click next.
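A bare-bones sketch of that session-based idea (my own illustration, not GRU4Rec itself; the catalogue size, sequence length, and data are made up): embed item IDs, run a GRU over the click sequence, and predict a softmax distribution over the next item.
import numpy as np
import tensorflow as tf
num_items = 1000  # hypothetical catalogue size
seq_len = 10      # clicks per (padded/truncated) training session
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_items, 32),                # item-ID embeddings
    tf.keras.layers.GRU(64),                                 # summarize the session so far
    tf.keras.layers.Dense(num_items, activation='softmax')   # distribution over the next item
])
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
# Dummy data: each row is a session of item IDs, the label is the next click
sessions = np.random.randint(0, num_items, size=(500, seq_len))
next_click = np.random.randint(0, num_items, size=(500,))
model.fit(sessions, next_click, epochs=1, batch_size=64)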
011 [Exercise] Get GRU4Rec Working on your Desktop
012 Exercise Results GRU4Rec in Action
… skip
013 Bleeding Edge Alert! Generative Adversarial Networks for Recommendations
014 Tensorflow Recommenders (TFRS) Intro, and Building a Retrieval Stage
pip install -q tensorflow-recommenders
pip install -q --upgrade tensorflow-datasets
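A condensed retrieval-stage sketch along the lines of the official TFRS tutorial (treat the dataset names and exact API details as assumptions to verify against the TFRS docs for your installed version): a two-tower model with a user tower and a movie tower, trained with the retrieval task, then served through a brute-force top-K index.
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
ratings = tfds.load("movielens/100k-ratings", split="train").map(
    lambda x: {"user_id": x["user_id"], "movie_title": x["movie_title"]})
movies = tfds.load("movielens/100k-movies", split="train").map(lambda x: x["movie_title"])
# Vocabularies so string IDs can be turned into embedding indices
user_lookup = tf.keras.layers.StringLookup(mask_token=None)
user_lookup.adapt(ratings.map(lambda x: x["user_id"]).batch(1024))
title_lookup = tf.keras.layers.StringLookup(mask_token=None)
title_lookup.adapt(movies.batch(1024))
# Two towers: map users and movies into the same 32-dimensional embedding space
user_model = tf.keras.Sequential([user_lookup,
                                  tf.keras.layers.Embedding(user_lookup.vocabulary_size(), 32)])
movie_model = tf.keras.Sequential([title_lookup,
                                   tf.keras.layers.Embedding(title_lookup.vocabulary_size(), 32)])
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidates=movies.batch(128).map(movie_model)))
class RetrievalModel(tfrs.Model):
    def __init__(self, user_model, movie_model, task):
        super().__init__()
        self.user_model = user_model
        self.movie_model = movie_model
        self.task = task
    def compute_loss(self, features, training=False):
        return self.task(self.user_model(features["user_id"]),
                         self.movie_model(features["movie_title"]))
model = RetrievalModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))
model.fit(ratings.batch(4096), epochs=3)
# Serve top-K recommendations with a brute-force index over all candidates
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model))))
_, titles = index(tf.constant(["85"]))
print(titles[0, :10].numpy())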
#%% md
# Introducing Keras
Be sure to be using tensorflow 1.9 or newer!
Keras is a higher-level API within TensorFlow that makes things a lot easier. Not only is it easier to use, it's easier to tune.
Let's set up the same deep neural network we set up with TensorFlow to learn from the MNIST data set.
First we'll import all the stuff we need, which will initialize Keras as a side effect:
#%%
from tensorflow import keras
from tensorflow.keras.datasets import mnist
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
#%% md
We'll load up the MNIST data set. In Keras, it's a little bit different - there are 60K training samples and 10K test samples. No "validation" samples.
#%%
(mnist_train_images, mnist_train_labels), (mnist_test_images, mnist_test_labels) = mnist.load_data()
#%% md
We need to explicitly convert the data into the format Keras / TensorFlow expects. We divide the image data by 255 in order to normalize it into 0-1 range, after converting it into floating point values.
#%%
train_images = mnist_train_images.reshape(60000, 784)
test_images = mnist_test_images.reshape(10000, 784)
train_images = train_images.astype('float32')
test_images = test_images.astype('float32')
train_images /= 255
test_images /= 255
#%% md
Now we'll convert the 0-9 labels into "one-hot" format, as we did for TensorFlow.
#%%
train_labels = keras.utils.to_categorical(mnist_train_labels, 10)
test_labels = keras.utils.to_categorical(mnist_test_labels, 10)
#%% md
Let's take a peek at one of the training images just to make sure it looks OK:
#%%
%matplotlib inline
import matplotlib.pyplot as plt
def display_sample(num):
#Print the one-hot array of this sample's label
print(train_labels[num])
#Print the label converted back to a number
label = train_labels[num].argmax(axis=0)
    #Reshape the 784 values to a 28x28 image
image = train_images[num].reshape([28,28])
plt.title('Sample: %d Label: %d' % (num, label))
plt.imshow(image, cmap=plt.get_cmap('gray_r'))
plt.show()
display_sample(1234)
#%% md
Here's where things get exciting. All that code we wrote in Tensorflow creating placeholders, variables, and defining a bunch of linear algebra for each layer in our neural network? None of that is necessary with Keras!
We can set up the same layers like this. The input layer of 784 features feeds into a ReLU layer of 512 nodes, which then goes into 10 nodes with softmax applied. Couldn't be simpler:
#%%
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(784,)))
model.add(Dense(10, activation='softmax'))
#%% md
We can even get a nice description of the resulting model:
#%%
model.summary()
#%% md
Setting up our optimizer and loss function is just as simple. We will use the RMSProp optimizer here. Other choices include Adagrad, SGD, Adam, Adamax, and Nadam. See https://keras.io/optimizers/
#%%
model.compile(loss='categorical_crossentropy',
optimizer=RMSprop(),
metrics=['accuracy'])
#%% md
Training our model is also just one line of code with Keras. Here we'll do 10 epochs with a batch size of 100. Keras is slower, and if we're not running on top of a GPU-accelerated Tensorflow this can take a fair amount of time (that's why I've limited it to just 10 epochs.)
#%%
history = model.fit(train_images, train_labels,
batch_size=100,
epochs=10,
verbose=2,
validation_data=(test_images, test_labels))
#%% md
But, even with just 10 epochs, we've outperformed our Tensorflow version considerably!
#%%
score = model.evaluate(test_images, test_labels, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
#%% md
As before let's visualize the ones it got wrong. As this model is much better, we'll have to search deeper to find mistakes to look at.
#%%
for x in range(1000):
test_image = test_images[x,:].reshape(1,784)
predicted_cat = model.predict(test_image).argmax()
label = test_labels[x].argmax()
if (predicted_cat != label):
plt.title('Prediction: %d Label: %d' % (predicted_cat, label))
plt.imshow(test_image.reshape([28,28]), cmap=plt.get_cmap('gray_r'))
plt.show()
#%% md
You can see most of the ones it's having trouble with, are images a human would have trouble with as well!
## Exercise
As before, see if you can improve on the results! Does running more epochs help considerably? How about trying different optimizers?
You can also take advantage of Keras's ease of use to try different topologies quickly. Keras includes a MNIST example, where they add an additional layer, and use Dropout at each step to prevent overfitting, like this:
`
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(784,)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(10, activation='softmax'))
`
Try adapting that to our code above and see if it makes a difference or not.
#%%
10 - Scaling it Up
004 [Activity] Movie Recommendations with Spark, Matrix Factorization, and ALS
# -*- coding: utf-8 -*-
"""
Created on Mon May 28 11:09:55 2018
@author: Frank
"""
# SparkALS.py
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from MovieLens import MovieLens
# Workaround for: org.apache.spark.SparkException: Python worker failed to connect back.
import os
os.environ['PYSPARK_PYTHON'] = "python"
if __name__ == "__main__":
spark = SparkSession \
.builder \
.appName("ALSExample") \
.getOrCreate()
lines = spark.read.option("header", "true").csv("../ml-latest-small/ratings.csv").rdd
ratingsRDD = lines.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
coldStartStrategy="drop")
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
userRecs = model.recommendForAllUsers(10)
user85Recs = userRecs.filter(userRecs['userId'] == 85).collect()
spark.stop()
ml = MovieLens()
ml.loadMovieLensLatestSmall()
for row in user85Recs:
for rec in row.recommendations:
print(ml.getMovieName(rec.movieId))