Spark机器学习

准备环境

git clone https://github.com/mikiobraun/jblas.git
cd jblas
mvn install

运行环境

cd /Users/erichan/Garden/spark-1.5.1-bin-cdh4

bin/spark-shell --name my_mlib --packages org.jblas:jblas:1.2.4-SNAPSHOT --driver-memory 4G --executor-memory 4G --driver-cores 2

推荐引擎

1 提取有效特征

// Root directory that contains the ml-100k data set.
val PATH = "/Users/erichan/sourcecode/book/Spark机器学习"
// Raw rating records, one text line per rating.
val rawData = sc.textFile(s"$PATH/ml-100k/u.data")
rawData.first()

res1: String = 196 242 3 881250949

import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
// Split each line on tabs and keep only the first three fields; anything after them is dropped.
val rawRatings = rawData.map(line => line.split("\t").take(3))
// Convert the three string fields into an MLlib Rating(user, product, rating).
val ratings = rawRatings.map { fields =>
  val Array(u, m, r) = fields
  Rating(u.toInt, m.toInt, r.toDouble)
}
ratings.first()

res2: org.apache.spark.mllib.recommendation.Rating = Rating(196,242,3.0)

2 训练推荐模型

// Train a matrix-factorization model with ALS on the parsed ratings.
val model = ALS.train(ratings, 50, 10, 0.01) //rank=50, iterations=10, lambda=0.01
// Number of users for which the model holds a factor vector.
model.userFeatures.count

res3: Long = 943

model.productFeatures.count

res4: Long = 1682

3 使用模型

3.1.1 用户推荐
// Predicted rating of user 789 for movie 123.
val predictedRating = model.predict(789, 123)

// Ask the model for the K best recommendations for one user and print them one per line.
val userId = 789
val K = 10
val topKRecs = model.recommendProducts(userId, K)
println(topKRecs.mkString("\n"))

Rating(789,176,5.732688958436494)
Rating(789,201,5.682340265545152)
Rating(789,182,5.5902224300291214)
Rating(789,183,5.5877871075408585)
Rating(789,96,5.4425266495153455)
Rating(789,76,5.39730369058763)
Rating(789,195,5.356822356978749)
Rating(789,589,5.1464233861748925)
Rating(789,134,5.109287533257644)
Rating(789,518,5.106161562126567)

3.1.2 校验推荐
// Movie metadata, one '|'-separated record per line.
val movies = sc.textFile(s"$PATH/ml-100k/u.item")
// Local map from movie id (first field) to title (second field).
val titles = movies.map { line =>
  val fields = line.split("\\|")
  (fields(0).toInt, fields(1))
}.collectAsMap()
titles(123)
// Every rating given by user 789, looked up by keying the ratings on the user id.
val moviesForUser = ratings.keyBy(rating => rating.user).lookup(789)
println(moviesForUser.size)

33

// The ten movies this user rated highest, printed as (title, rating).
moviesForUser.sortBy(r => -r.rating).take(10).map { r => titles(r.product) -> r.rating }.foreach(println)

(Godfather, The (1972),5.0)
(Trainspotting (1996),5.0)
(Dead Man Walking (1995),5.0)
(Star Wars (1977),5.0)
(Swingers (1996),5.0)
(Leaving Las Vegas (1995),5.0)
(Bound (1996),5.0)
(Fargo (1996),5.0)
(Last Supper, The (1995),5.0)
(Private Parts (1997),4.0)

// The recommended movies, printed as (title, predicted rating).
topKRecs.map { rec => titles(rec.product) -> rec.rating }.foreach(println)

(Aliens (1986),5.732688958436494)
(Evil Dead II (1987),5.682340265545152)
(GoodFellas (1990),5.5902224300291214)
(Alien (1979),5.5877871075408585)
(Terminator 2: Judgment Day (1991),5.4425266495153455)
(Carlito's Way (1993),5.39730369058763)
(Terminator, The (1984),5.356822356978749)
(Wild Bunch, The (1969),5.1464233861748925)
(Citizen Kane (1941),5.109287533257644)
(Miller's Crossing (1990),5.106161562126567)

3.2.1 物品推荐
import org.jblas.DoubleMatrix
val aMatrix = new DoubleMatrix(Array(1.0, 2.0, 3.0))
/**
 * Cosine similarity of two vectors: their dot product divided by the
 * product of their Euclidean (L2) norms.
 *
 * @param vec1 first vector
 * @param vec2 second vector
 * @return the dot product over the product of norms; no guard is applied
 *         when either norm is zero
 */
def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
  val dotProduct = vec1.dot(vec2)
  val normProduct = vec1.norm2() * vec2.norm2()
  dotProduct / normProduct
}
// Pick a reference item and wrap its factor array from the model as a jblas vector.
val itemId = 567
val itemFactor = model.productFeatures.lookup(itemId).head
val itemVector = new DoubleMatrix(itemFactor)
// Sanity check: compare the vector with itself.
cosineSimilarity(itemVector, itemVector)

res10: Double = 1.0

// Score every item by the cosine similarity between its factor vector and the reference item's vector.
val sims = model.productFeatures.map { case (id, factor) =>
  (id, cosineSimilarity(new DoubleMatrix(factor), itemVector))
}
// The K (id, similarity) pairs with the largest similarity.
val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double](_._2))
println(sortedSims.mkString("\n"))

(567,1.0)
(413,0.7309050775072655)
(895,0.6992030886048359)
(853,0.6960095521899471)
(219,0.6806270119940826)
(302,0.6757242121714326)
(257,0.6721490667554395)
(160,0.6672080746572076)
(563,0.6621573120106216)
(1019,0.6591520069387037)

3.2.2 校验推荐
println(titles(itemId))

Wes Craven's New Nightmare (1994)

// Fetch one extra result because the most similar item is expected to be the reference item itself.
val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double] { case (id, similarity) => similarity })
// Skip that first entry and keep the next K; the upper bound follows K instead of a hard-coded 11.
sortedSims2.slice(1, K + 1).map{ case (id, sim) => (titles(id), sim) }.mkString("\n")

res13: String =
(Tales from the Crypt Presents: Bordello of Blood (1996),0.7309050775072655)
(Scream 2 (1997),0.6992030886048359)
(Braindead (1992),0.6960095521899471)
(Nightmare on Elm Street, A (1984),0.6806270119940826)
(L.A. Confidential (1997),0.6757242121714326)
(Men in Black (1997),0.6721490667554395)
(Glengarry Glen Ross (1992),0.6672080746572076)
(Stephen King's The Langoliers (1995),0.6621573120106216)
(Die xue shuang xiong (Killer, The) (1989),0.6591520069387037)
(Evil Dead II (1987),0.655134288821937)

4 模型效果评估

4.1 均方误差(Mean Squared Error,MSE)
// Squared error for a single (user, movie) pair: the first rating of user 789.
val actualRating = moviesForUser(0)
val predictedRating = model.predict(789, actualRating.product)
val squaredError = math.pow(predictedRating - actualRating.rating, 2.0)
// Predict a rating for every (user, product) pair that appears in the ratings.
val usersProducts = ratings.map(r => (r.user, r.product))
val predictions = model.predict(usersProducts).map(p => ((p.user, p.product), p.rating))
// Join on (user, product); each value is the pair (actual rating, predicted rating).
val ratingsAndPredictions = ratings.map(r => ((r.user, r.product), r.rating)).join(predictions)
// Mean of the squared differences over all joined pairs.
val MSE = ratingsAndPredictions.map { case (_, (actual, predicted)) =>
  math.pow(actual - predicted, 2)
}.reduce(_ + _) / ratingsAndPredictions.count
println("Mean Squared Error = " + MSE)

Mean Squared Error = 0.08527363423596633

// Root mean squared error: the square root of MSE.
val RMSE = math.sqrt(MSE)
println("Root Mean Squared Error = " + RMSE)

Root Mean Squared Error = 0.2920164965134099

4.2 K值平均准确率(MAPK)
/**
 * Average precision at K for one ranked list of predictions.
 *
 * Walks the first `k` predicted ids in order; every id that also occurs in
 * `actual` counts as a hit and contributes (hits so far) / (1-based position)
 * to the score. The score is then divided by min(actual.size, k).
 *
 * @param actual    ids that are considered relevant
 * @param predicted ids in ranked order, best first
 * @param k         how many leading predictions to evaluate
 * @return 1.0 when `actual` is empty, otherwise the normalised score
 */
def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
  if (actual.isEmpty) {
    1.0
  } else {
    // A Set gives constant-time membership tests instead of a linear Seq scan per prediction.
    val relevant = actual.toSet
    val (score, _) = predicted.take(k).zipWithIndex.foldLeft((0.0, 0.0)) {
      case ((scoreSoFar, hitsSoFar), (item, index)) =>
        if (relevant.contains(item)) {
          val hits = hitsSoFar + 1.0
          (scoreSoFar + hits / (index.toDouble + 1.0), hits)
        } else {
          (scoreSoFar, hitsSoFar)
        }
    }
    // Normalise by actual.size (duplicates included), exactly as before.
    score / scala.math.min(actual.size, k).toDouble
  }
}
// Average precision at 10 for the single user examined earlier.
val actualMovies = moviesForUser.map(_.product)
val predictedMovies = topKRecs.map(_.product)
val apk10 = avgPrecisionK(actualMovies, predictedMovies, 10)
// Collect the item factors ordered by item id. productFeatures itself comes back in no
// particular order, while the recommendation step decodes "row index + 1" as the movie id,
// so the rows must be sorted for that mapping to be correct.
// NOTE: this still assumes the ids are contiguous and start at 1 (1682 items, ids 1..1682 here).
val itemFactors = model.productFeatures.sortByKey().map { case (id, factor) => factor }.collect()
// One row per item, one column per latent factor.
val itemMatrix = new DoubleMatrix(itemFactors)
println(itemMatrix.rows, itemMatrix.columns)

(1682,50)

// Make the item factor matrix available to every task.
val imBroadcast = sc.broadcast(itemMatrix)
// For each user: score all items with one matrix-vector product, rank them by descending
// score, and turn each row index into an id by adding 1.
val allRecs = model.userFeatures.map { case (userId, factors) =>
  val scores = imBroadcast.value.mmul(new DoubleMatrix(factors))
  val ranked = scores.data.zipWithIndex.sortBy { case (score, _) => -score }
  (userId, ranked.map { case (_, rowIndex) => rowIndex + 1 }.toSeq)
}
// The (user, product) pairs of all ratings, grouped per user.
val userMovies = ratings.map(r => (r.user, r.product)).groupBy(_._1)
val K = 10
// Sum of the per-user average precision at K, divided by the number of users with recommendations.
val MAPK = allRecs.join(userMovies).map { case (_, (predicted, actualWithIds)) =>
  avgPrecisionK(actualWithIds.map(_._2).toSeq, predicted, K)
}.reduce(_ + _) / allRecs.count
println("Mean Average Precision at K = " + MAPK)

Mean Average Precision at K = 0.030001472840815356

4.3 MLlib内置评估函数·RMSE和MSE
import org.apache.spark.mllib.evaluation.RegressionMetrics
// RegressionMetrics expects (prediction, observation) pairs, so the joined
// (actual, predicted) values are swapped here. MSE and RMSE are symmetric and unaffected,
// but metrics that distinguish the two sides would be wrong with the original order.
val predictedAndTrue = ratingsAndPredictions.map { case ((user, product), (actual, predicted)) => (predicted, actual) }
val regressionMetrics = new RegressionMetrics(predictedAndTrue)
println("Mean Squared Error = " + regressionMetrics.meanSquaredError)

Mean Squared Error = 0.08527363423596633

println("Root Mean Squared Error = " + regressionMetrics.rootMeanSquaredError)

Root Mean Squared Error = 0.2920164965134099

4.4 MLlib内置评估函数·MAP(平均准确率)
import org.apache.spark.mllib.evaluation.RankingMetrics
// Pair each user's ranked recommendations with the ids that user actually rated, both as arrays.
val predictedAndTrueForRanking = allRecs.join(userMovies).map { case (_, (predicted, actualWithIds)) =>
  (predicted.toArray, actualWithIds.map(_._2).toArray)
}
val rankingMetrics = new RankingMetrics(predictedAndTrueForRanking)
println("Mean Average Precision = " + rankingMetrics.meanAveragePrecision)

Mean Average Precision = 0.07208991526855565

// Same computation as the hand-written MAPK, but with k = 2000, which is larger than the
// 1682 items, so the whole ranking of every user is evaluated.
val MAPK2000 = allRecs.join(userMovies).map { case (_, (predicted, actualWithIds)) =>
  avgPrecisionK(actualWithIds.map(_._2).toSeq, predicted, 2000)
}.reduce(_ + _) / allRecs.count
println("Mean Average Precision = " + MAPK2000)

Mean Average Precision = 0.07208991526855561

Spark机器学习3·推荐引擎(spark-shell)的更多相关文章

  1. Spark机器学习之推荐引擎

    一. 最小二乘法建立模型 关于最小二乘法矩阵分解,我们可以参阅: 一.矩阵分解模型. 用户对物品的打分行为可以表示成一个评分矩阵A(m*n),表示m个用户对n个物品的打分情况.如下图所示: 其中,A( ...

  2. 基于Spark ALS构建商品推荐引擎

    基于Spark ALS构建商品推荐引擎   一般来讲,推荐引擎试图对用户与某类物品之间的联系建模,其想法是预测人们可能喜好的物品并通过探索物品之间的联系来辅助这个过程,让用户能更快速.更准确的获得所需 ...

  3. 基于Azure构建PredictionIO和Spark的推荐引擎服务

    基于Azure构建PredictionIO和Spark的推荐引擎服务 1. 在Azure构建Ubuntu 16.04虚拟机 假设前提条件您已有 Azure 帐号,登陆 Azure https://po ...

  4. Azure构建PredictionIO和Spark的推荐引擎服务

    Azure构建PredictionIO和Spark的推荐引擎服务 1. 在Azure构建Ubuntu 16.04虚拟机 假设前提条件您已有 Azure 帐号,登陆 Azure https://port ...

  5. Spark高级数据分析· 3推荐引擎

    推荐算法流程 推荐算法 预备 wget http://www.iro.umontreal.ca/~lisa/datasets/profiledata_06-May-2005.tar.gz cd /Us ...

  6. 数据算法 --hadoop/spark数据处理技巧 --(7.共同好友 8. 使用MR实现推荐引擎)

    七,共同好友. 在所有用户对中找出“共同好友”. eg: a    b,c,d,g b    a,c,d,e map()->  <a,b>,<b,c,d,g> ;< ...

  7. Spark入门实战系列--8.Spark MLlib(下)--机器学习库SparkMLlib实战

    [注]该系列文章以及使用到安装包/测试数据 可以在<倾情大奉送--Spark入门实战系列>获取 .MLlib实例 1.1 聚类实例 1.1.1 算法说明 聚类(Cluster analys ...

  8. 【转载】协同过滤 & Spark机器学习实战

    因为协同过滤内容比较多,就新开一篇文章啦~~ 聚类和线性回归的实战,可以看:http://www.cnblogs.com/charlesblc/p/6159187.html 协同过滤实战,仍然参考:h ...

  9. 大规模数据分析统一引擎Spark最新版本3.3.0入门实战

    @ 目录 概述 定义 Hadoop与Spark的关系与区别 特点与关键特性 组件 集群概述 集群术语 部署 概述 环境准备 Local模式 Standalone部署 Standalone模式 配置历史 ...

随机推荐

  1. 【BZOJ】1665: [Usaco2006 Open]The Climbing Wall 攀岩(spfa)

    http://www.lydsy.com/JudgeOnline/problem.php?id=1665 这题只要注意到“所有的落脚点至少相距300”就可以大胆的暴力了. 对于每个点,我们枚举比他的x ...

  2. [转]NBehave行为驱动测试关于story和scenarios

    原文: Behavior-Driven Development with NBehave 这里模拟了一个"银行账户"的类 一个余额属性,一个存款方法,一个撤销账户的方法,一个转账的 ...

  3. SQLAllocStmt与SQLFreeStmt

    1.申请语句句柄 SQLAllocStmt函数为应用程序分配语句句柄,其格式为:RETCODE SQLAllocStmt(HDBC hdbc, HSTMT FAR * phstmt) 其中, hdbc ...

  4. Kotlin——初级篇(八):关于字符串(String)常用操作汇总

    在前面讲解Kotlin数据类型的时候,提到了字符串类型,当然关于其定义在前面的章节中已经讲解过了.对Kotlin中的数据类型不清楚的同学.请参考Kotlin--初级篇(三):数据类型详解这篇文章. 在 ...

  5. java.lang.IllegalArgumentException: Failed to decrypt.

    加密失败. 附加信息: org.springframework.transaction.CannotCreateTransactionException: Could not open JDBC Co ...

  6. VMware虚拟机Bridged(桥接模式)

    转载于:https://www.linuxidc.com/Linux/2016-09/135521.htm   vmware为我们提供了三种网络工作模式,它们分别是:Bridged(桥接模式).NAT ...

  7. 使用ShardingJdbc分表

    项目中做个统一订单的基础服务(只记录订单的基本的公共信息),1.便与后续各种其他业务的接入~ 2.同时APP端提供统一订单信息的查询入口,后续其他业务不用升级 由于统一的订单服务,所以订单量会很大,所 ...

  8. vs2008 怎么在Release下调试代码

    vs2008 怎么在Release下调试代码 (适用VS2005/VS2008) 在当前工程点击右键选择properties,选择 All Configurations C++>General- ...

  9. Less-css预处理Node and VS扩展编译

    node编译 第一步:https://nodejs.org/en/  到node官网下载最新的node 第二步:和普通软件一样把node安装好 第三步:运行-cmd,准备安装less 全局安装(整个电 ...

  10. sql---如何把sql查询出来的结果当做另一个sql的条件查询,1、语句2、with as

    '; -- table2 的 name 作为 table1的条件 select * from table1 where name in (select name from table2) --如果有多 ...