Spark高级数据分析· 3推荐引擎

预备

wget http://www.iro.umontreal.ca/~lisa/datasets/profiledata_06-May-2005.tar.gz

cd /Users/erichan/garden/spark-1.6.0-bin-hadoop2.6/bin

./spark-shell --master local --driver-memory 6g

1 准备数据

// Directory that holds the unpacked profiledata_06-May-2005 files.
val data ="/Users/erichan/AliDrive/ml_spark/data/profiledata_06-May-2005"

// Load the user/artist play-count file as an RDD of lines; 10 is the minimum number of partitions requested.
val rawUserArtistData = sc.textFile(data+"/user_artist_data.txt",10)

// ALS requires the IDs to be numeric.

// Peek at the first line to see the record layout.
rawUserArtistData.first

//res3: String = 1092764  1000311

// (Disabled) summary statistics of the first field, which is parsed as the user ID further below.
//rawUserArtistData.map(_.split(' ')(0).toDouble).stats()

//res10: org.apache.spark.util.StatCounter = (count: 24296858, mean: 1947573.265353, stdev: 496000.544975, max: 2443548.000000, min: 90.000000)

// (Disabled) summary statistics of the second field, which is parsed as the artist ID further below.
//rawUserArtistData.map(_.split(' ')(1).toDouble).stats()

//res11: org.apache.spark.util.StatCounter = (count: 24296858, mean: 1718704.093757, stdev: 2539389.040171, max: 10794401.000000, min: 1.000000)

// Load the artist ID -> artist name file as an RDD of lines.
val rawArtistData = sc.textFile(data+"/artist_data.txt")

// (Disabled) peek at the first artist line, followed by the output recorded when it was run.
//rawArtistData.first

//res12: String = 1134999   06Crazy Life

// Parse every "id<TAB>name" line into an (id, name) pair.
// Lines without anything after the ID, or whose ID is not an integer, are skipped.
val artistByID = rawArtistData.flatMap { line =>
  line.span(_ != '\t') match {
    // Nothing follows the ID part: there is no name to keep.
    case (_, rest) if rest.isEmpty => None
    case (idText, rest) =>
      // `rest` still starts with the tab separator; trim removes it.
      try {
        Some((idText.toInt, rest.trim))
      } catch {
        case _: NumberFormatException => None
      }
  }
}

// Load the alias file; each line is expected to be "aliasID<TAB>canonicalID".
val rawArtistAlias = sc.textFile(data+"/artist_alias.txt")

// Build an aliasID -> canonicalID map on the driver.
// Lines are skipped (not failed on) when the alias ID is empty, when the second field is
// missing (String.split drops trailing empty fields, so "123<TAB>" yields a single token),
// or when either field is not an integer.
val artistAlias = rawArtistAlias.flatMap { line =>
  line.split('\t') match {
    case Array(alias, canonical, _*) if alias.nonEmpty =>
      try {
        Some((alias.toInt, canonical.toInt))
      } catch {
        case _: NumberFormatException => None
      }
    case _ => None
  }
}.collectAsMap()

//artistByID.lookup(1000010).head

//res14: String = Aerosmith

2 建模

import org.apache.spark.mllib.recommendation._

// Broadcast the alias map; tasks read it through bArtistAlias.value.
val bArtistAlias = sc.broadcast(artistAlias)

// Parse each "user artist count" line into three Ints, replace an aliased artist ID with
// the ID it maps to, and wrap the result in a Rating. The resulting RDD is cached.
val trainData = rawUserArtistData.map(_.split(' ').map(_.toInt)).map {
  case Array(user, artist, plays) =>
    Rating(user, bArtistAlias.value.getOrElse(artist, artist), plays)
}.cache()

// Implicit-feedback ALS; positional arguments are rank = 10, iterations = 5,
// lambda = 0.01 and alpha = 1.0.
val model = ALS.trainImplicit(trainData, 10, 5, 0.01, 1.0)

3 检验

// Split every line into its space-separated fields and keep the rows of user 2093760.
val rawArtistsForUser = rawUserArtistData.map(line => line.split(' ')).filter {
  case Array(userField, _, _) => userField.toInt == 2093760
}

// Artist IDs (second field) found in those rows, gathered on the driver as a Set.
val existingProducts = rawArtistsForUser.map {
  case Array(_, artistField, _) => artistField.toInt
}.collect().toSet

// Print the names of the artists this user already has.
artistByID.filter(entry => existingProducts.contains(entry._1)).values.collect().foreach(println)

// Ask the model for 5 recommendations for the same user and print the raw results.
val recommendations = model.recommendProducts(2093760, 5)
recommendations.foreach(println)

// Print the names of the recommended artists.
val recommendedProductIDs = recommendations.map(_.product).toSet
artistByID.filter(entry => recommendedProductIDs.contains(entry._1)).values.collect().foreach(println)

4 评价

:load /Users/erichan/sourcecode/book/aas/ch03-recommender/src/main/scala/RunAUC.scala

// Rebuild the alias broadcast with the RunAUC helper loaded above (rebinds the earlier bArtistAlias).
val bArtistAlias = sc.broadcast(RunAUC.buildArtistAlias(rawArtistAlias))

// Parse all play-count lines into Ratings with aliases resolved.
val allData = RunAUC.buildRatings(rawUserArtistData, bArtistAlias)

// Random 90% / 10% split into training data and held-out cross-validation data
// (rebinds the earlier trainData).
val Array(trainData, cvData) = allData.randomSplit(Array(0.9, 0.1))

// Both splits are cached before the evaluation below.
trainData.cache()

cvData.cache()

// Every distinct product (artist) ID, collected to the driver ...
val allItemIDs = allData.map(_.product).distinct().collect()

// ... and broadcast; areaUnderCurve samples negative items from this array.
val bAllItemIDs = sc.broadcast(allItemIDs)

// AUC of the "most listened" baseline: cvData supplies the positive examples and each
// product is scored by its summed rating in trainData.
val mostListenedAUC = RunAUC.areaUnderCurve(cvData, bAllItemIDs, RunAUC.predictMostListened(sc, trainData))

println(mostListenedAUC)

// Value printed when this was run:
//0.9395286660878177

// Release the cached splits once the evaluation is finished.
trainData.unpersist()

cvData.unpersist()

5 推荐

// Up to 100 distinct user IDs taken from the ratings.
val someUsers = allData.map(_.user).distinct().take(100)

// Five recommendations per user, in the same order as someUsers.
val someRecommendations = someUsers.map(userID => model.recommendProducts(userID, 5))

// Print "user -> product, product, ...". The user ID comes from someUsers rather than
// from recs.head, so an empty recommendation array prints an empty list instead of
// throwing NoSuchElementException.
someUsers.zip(someRecommendations).map {
  case (userID, recs) => s"$userID -> ${recs.map(_.product).mkString(", ")}"
}.foreach(println)

附录

RunAUC.scala

import org.apache.spark.SparkContext

import org.apache.spark.SparkContext._

import org.apache.spark.broadcast.Broadcast

import org.apache.spark.mllib.recommendation._

import org.apache.spark.rdd.RDD

import scala.collection.Map

import scala.collection.mutable.ArrayBuffer

import scala.util.Random

/**

  * Created by erichan

  * on 16/1/26.

  */

// Helpers for evaluating a recommender with a per-user ("mean") AUC metric, plus the
// parsing routines that turn the raw text files into alias maps and Rating records.
object RunAUC {

  /**
    * Computes the mean, over users, of a per-user AUC.
    *
    * For every user the held-out ("positive") items are compared against randomly sampled
    * "negative" items; the per-user AUC is the fraction of (positive, negative) pairs in
    * which the positive item received the strictly higher score.
    *
    * @param positiveData    held-out ratings treated as positive examples
    * @param bAllItemIDs     broadcast array of all item IDs from which negatives are sampled
    * @param predictFunction scores (user, product) pairs and returns them as Ratings
    * @return the mean of the per-user AUC values
    */
  def areaUnderCurve(
      positiveData: RDD[Rating],
      bAllItemIDs: Broadcast[Array[Int]],
      predictFunction: (RDD[(Int,Int)] => RDD[Rating])): Double = {
    // Take held-out data as the "positive", and map to (user, product) tuples.
    val positiveUserProducts = positiveData.map(r => (r.user, r.product))
    // Score each positive pair and gather the scored ratings by user.
    val positivePredictions = predictFunction(positiveUserProducts).groupBy(_.user)

    // BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
    // small AUC problems, and it would be inefficient, when a direct computation is available.

    // Create a set of "negative" products for each user. These are randomly chosen
    // from among all of the other items, excluding those that are "positive" for the user.
    val negativeUserProducts = positiveUserProducts.groupByKey().mapPartitions {
      // mapPartitions operates on many (user, positive-items) pairs at once.
      userIDAndPosItemIDs => {
        // Create the RNG and read the broadcast item IDs once per partition.
        val random = new Random()
        val allItemIDs = bAllItemIDs.value
        userIDAndPosItemIDs.map { case (userID, posItemIDs) =>
          val posItemIDSet = posItemIDs.toSet
          val negative = new ArrayBuffer[Int]()
          var i = 0
          // Keep about as many negative examples per user as positive; duplicates are OK.
          // The number of draws is capped at allItemIDs.length so the loop always ends,
          // even when (almost) every item is positive for this user.
          while (i < allItemIDs.length && negative.size < posItemIDSet.size) {
            val itemID = allItemIDs(random.nextInt(allItemIDs.length))
            if (!posItemIDSet.contains(itemID)) {
              negative += itemID
            }
            i += 1
          }
          // Result is a collection of (user, negative-item) tuples.
          negative.map(itemID => (userID, itemID))
        }
      }
    }.flatMap(t => t) // flatten the per-user collections into one RDD of tuples

    // Score the negative pairs and gather them by user as well.
    val negativePredictions = predictFunction(negativeUserProducts).groupBy(_.user)

    // Join positive and negative by user; only users present on both sides remain,
    // so both groups are non-empty and `total` below is never zero.
    positivePredictions.join(negativePredictions).values.map {
      case (positiveRatings, negativeRatings) =>
        // AUC may be viewed as the probability that a random positive item scores
        // higher than a random negative one. Here the proportion of all positive-negative
        // pairs that are correctly ranked is computed. The result is equal to the AUC metric.
        val correct = positiveRatings.iterator.map { positive =>
          negativeRatings.count(negative => positive.rating > negative.rating).toLong
        }.sum
        val total = positiveRatings.size.toLong * negativeRatings.size
        // Fraction of pairs ranked correctly.
        correct.toDouble / total
    }.mean() // mean AUC over users
  }

  /**
    * Baseline "recommender": scores every (user, product) pair with the product's summed
    * rating in `train`, ignoring the user. Products absent from `train` score 0.0.
    *
    * @param sc      context used to broadcast the per-product totals
    * @param train   ratings from which the per-product totals are computed
    * @param allData (user, product) pairs to score
    * @return one Rating per input pair, carrying the product's total as its rating
    */
  def predictMostListened(sc: SparkContext, train: RDD[Rating])(allData: RDD[(Int,Int)]): RDD[Rating] = {
    val bListenCount =
      sc.broadcast(train.map(r => (r.product, r.rating)).reduceByKey(_ + _).collectAsMap())
    allData.map { case (user, product) =>
      Rating(user, product, bListenCount.value.getOrElse(product, 0.0))
    }
  }

  /**
    * Parses "aliasID<TAB>canonicalID" lines into a map collected on the driver.
    *
    * Lines are skipped when the alias ID is empty, when the second field is missing
    * (String.split drops trailing empty fields, so "123<TAB>" yields a single token),
    * or when either field is not an integer.
    *
    * @param rawArtistAlias lines of the alias file
    * @return map from alias ID to the ID it stands for
    */
  def buildArtistAlias(rawArtistAlias: RDD[String]): Map[Int,Int] =
    rawArtistAlias.flatMap { line =>
      line.split('\t') match {
        case Array(alias, canonical, _*) if alias.nonEmpty =>
          try {
            Some((alias.toInt, canonical.toInt))
          } catch {
            case _: NumberFormatException => None
          }
        case _ => None
      }
    }.collectAsMap()

  /**
    * Parses "user artist count" lines into Ratings, replacing an aliased artist ID with
    * the ID it maps to in `bArtistAlias`.
    *
    * A line that does not consist of exactly three space-separated integers makes the
    * task fail (MatchError or NumberFormatException).
    *
    * @param rawUserArtistData lines of the play-count file
    * @param bArtistAlias      broadcast alias map, as built by [[buildArtistAlias]]
    * @return one Rating(user, artist, count) per input line
    */
  def buildRatings(
      rawUserArtistData: RDD[String],
      bArtistAlias: Broadcast[Map[Int,Int]]): RDD[Rating] = {
    rawUserArtistData.map { line =>
      val Array(userID, artistID, count) = line.split(' ').map(_.toInt)
      val finalArtistID = bArtistAlias.value.getOrElse(artistID, artistID)
      Rating(userID, finalArtistID, count)
    }
  }

}

Spark高级数据分析· 3推荐引擎的更多相关文章

基于Spark ALS构建商品推荐引擎
基于Spark ALS构建商品推荐引擎一般来讲,推荐引擎试图对用户与某类物品之间的联系建模,其想法是预测人们可能喜好的物品并通过探索物品之间的联系来辅助这个过程,让用户能更快速.更准确的获得所需 ...
Spark高级数据分析——纽约出租车轨迹的空间和时间数据分析
Spark高级数据分析--纽约出租车轨迹的空间和时间数据分析一.地理空间分析: 二.pom.xml 原文地址:https://www.jianshu.com/p/eb6f3e0c09b5 作者:II ...
Spark高级数据分析-第2章用Scala和Spark进行数据分析
2.4 小试牛刀:Spark shell和SparkContext 本章使用的资料来自加州大学欧文分校机器学习资料库(UC Irvine Machine Learning Repository),这个 ...
Spark高级数据分析中文版-读者交流
第二章: 备注:1.本书第二章样例数据由于采用的是短链接,国内的用户可能无法下载.我把数据集拷贝到百度网盘上.大家可以从这个地方下载:http://pan.baidu.com/s/1pJvjHA7 谢 ...
Spark高级数据分析· 6LSA
潜在语义分析 wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz ...
Spark高级数据分析· 2数据分析
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip 数据清洗 cd /Users/eri ...
0-Spark高级数据分析-读书笔记
学完了<Spark快速大数据分析>,对Spark有了一些了解,计划更进一步,开始学习<Spark高级数据分析>.这本书是用Scala写的,在学习的过程中想把其中的代码转换成Ja ...
Spark 实践——用 Scala 和 Spark 进行数据分析
本文基于<Spark 高级数据分析>第2章用Scala和Spark进行数据分析. 完整代码见 https://github.com/libaoquan95/aasPractice/tre ...
基于Azure构建PredictionIO和Spark的推荐引擎服务
基于Azure构建PredictionIO和Spark的推荐引擎服务 1. 在Azure构建Ubuntu 16.04虚拟机假设前提条件您已有 Azure 帐号,登陆 Azure https://po ...

随机推荐

使用js里面的迭代器filter实现数组去重
实现数组去重的方法很多,最原始的方法是一个值一个值的去遍历,写到空数组里面: let r=[],arr = ['a', 'b', 'c', 'a']; for(var i=0,len=arr.leng ...
【BZOJ4832】[Lydsy2017年4月月赛]抵制克苏恩概率与期望
[BZOJ4832][Lydsy2017年4月月赛]抵制克苏恩 Description 小Q同学现在沉迷炉石传说不能自拔.他发现一张名为克苏恩的牌很不公平.如果你不玩炉石传说,不必担心,小Q同学会告诉 ...
探讨Java I/O类和接口
(输出)Output:程序---->数据源(如某个文件) (输入)Input:数据源---->程序 Java.io定义的I/O类如下表所示: BufferedInputStream Buf ...
160329(二)、web.xml配置详解
1.启动一个WEB项目的时候,WEB容器会去读取它的配置文件web.xml,读取<listener>和<context-param>两个结点. 2.紧接着,容器创建一个Servl ...
FZU 2144 Shooting Game （贪心区域划分）
Problem 2144 Shooting Game Accept: 370 Submit: 1902 Time Limit: 1000 mSec Memory Limit : 32768 KB Pr ...
jquery筛选数组方法——$.grep(),$.map()
function greptest() { var arr = "1,2,3,'',one,two,three".split(','); var newarr = $.grep(a ...
git 学习(2)--恢复版本
查看修改历史记录 $ git log commit fba77877d316436c1b774b8933380ebcac668040 Author: keith <ustbfxx@163.com ...
如何将计算机加入域分类： AD域 Windows服务 2015-06-10 11:04 63人阅读评论(0) 收藏
在上一篇博客中我已经实现了windows server 2008 R2域中的DC部署,那么如何将计算机加入到我们部署的域环境中呢? (初级教程,step by step,不足之处欢迎批评指正!) 将计 ...
多线程入门-第五章-线程的调度与控制之yield
yield与sleep类似,只是不能指定暂停多长时间,并且只能让同优先级的线程有执行的机会,让位时间不固定. /* yield使用 */ public class ThreadTest04 { pub ...
监控之snmpd 服务
监控离不开数据采集,经常使用的Mrtg ,Cacti,Zabbix,等等监控软件都是通过snmp 协议进行数据采集的! 1 什么是snmp 协议? 简单网络管理协议(SNMP,Simple Netwo ...

Spark高级数据分析· 3推荐引擎

推荐算法流程

预备

1 准备数据

2 建模

3 检验

4 评价

5 推荐

附录

Spark高级数据分析· 3推荐引擎的更多相关文章

随机推荐

热门专题