Spark高级数据分析· 6LSA

潜在语义分析

wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2

1 获取数据

/**
 * Loads the input at `path` through `XmlInputFormat`, using `<page>` and `</page>`
 * as the record start and end tags, and returns each record's value as a string.
 *
 * @param path location handed to `SparkContext.newAPIHadoopFile`
 * @param sc   Spark context used to create the RDD
 * @return one string per record value
 */
def readFile(path: String, sc: SparkContext): RDD[String] = {
  val hadoopConf = new Configuration()
  hadoopConf.set(XmlInputFormat.START_TAG_KEY, "<page>")
  hadoopConf.set(XmlInputFormat.END_TAG_KEY, "</page>")

  sc.newAPIHadoopFile(
      path,
      classOf[XmlInputFormat],
      classOf[LongWritable],
      classOf[Text],
      hadoopConf)
    .map(record => record._2.toString)
}

/**
 * Parses one page of XML and returns its (title, content) pair.
 *
 * Yields `None` when the parsed page is empty, is not an article, is a redirect,
 * or has a title containing "(disambiguation)".
 *
 * @param pageXml XML text of a single page
 * @return `Some((title, content))` for a page that passes all checks, otherwise `None`
 */
def wikiXmlToPlainText(pageXml: String): Option[(String, String)] = {
  val parsed = new EnglishWikipediaPage()
  WikipediaPage.readPage(parsed, pageXml)

  // Checks are evaluated in the same order as the rejection conditions they negate.
  val isUsableArticle =
    !parsed.isEmpty &&
    parsed.isArticle &&
    !parsed.isRedirect &&
    !parsed.getTitle.contains("(disambiguation)")

  if (isUsableArticle) Some((parsed.getTitle, parsed.getContent)) else None
}

// Read the raw page XML and draw a fixed-seed sample without replacement.
val pages = readFile("hdfs:///user/ds/Wikipedia/", sc)
  .sample(withReplacement = false, fraction = sampleSize, seed = 11L)

// Drop null records, then keep only the pages that convert to a (title, content) pair.
val plainText = pages
  .filter(xml => xml != null)
  .flatMap(wikiXmlToPlainText)

2 词形归并

/**
 * Builds a `StanfordCoreNLP` pipeline whose `annotators` property is set to
 * "tokenize, ssplit, pos, lemma".
 *
 * @return a newly constructed pipeline
 */
def createNLPPipeline(): StanfordCoreNLP = {
  val annotatorSettings = new Properties()
  annotatorSettings.setProperty("annotators", "tokenize, ssplit, pos, lemma")
  new StanfordCoreNLP(annotatorSettings)
}

/**
 * Tells whether every character of `str` satisfies `Character.isLetter`.
 * The empty string yields `true`.
 *
 * @param str text to inspect
 * @return `true` when no non-letter character is present
 */
def isOnlyLetters(str: String): Boolean = {
  // A plain while loop is kept (rather than a collection method) for performance.
  val len = str.length
  var pos = 0
  while (pos < len && Character.isLetter(str.charAt(pos))) {
    pos += 1
  }
  // The scan stops early at the first non-letter, so reaching the end means all passed.
  pos == len
}

/**
 * Annotates `text` with `pipeline` and returns the lower-cased lemma of every token
 * that passes the filters, in sentence and token order.
 *
 * A lemma is kept when it is longer than two characters, consists only of letters,
 * and is not a stop word. The stop-word check is made against both the lemma as
 * produced and its lower-cased form, so a capitalised lemma such as "The" can no
 * longer bypass a lower-case stop-word list and then be emitted as "the".
 *
 * @param text      plain text of one document
 * @param stopWords words to exclude from the result
 * @param pipeline  pipeline used to annotate the text
 * @return the retained lemmas, lower-cased
 */
def plainTextToLemmas(text: String, stopWords: Set[String], pipeline: StanfordCoreNLP)
  : Seq[String] = {
  val doc = new Annotation(text)
  pipeline.annotate(doc)

  val lemmas = new ArrayBuffer[String]()
  val sentences = doc.get(classOf[SentencesAnnotation])
  for (sentence <- sentences.asScala;
       token <- sentence.get(classOf[TokensAnnotation]).asScala) {
    val lemma = token.get(classOf[LemmaAnnotation])
    // The emitted form is what later stages see, so it must be the one checked too.
    val normalized = lemma.toLowerCase
    val isStopWord = stopWords.contains(lemma) || stopWords.contains(normalized)
    if (lemma.length > 2 && !isStopWord && isOnlyLetters(lemma)) {
      lemmas += normalized
    }
  }
  lemmas
}

// Load the stop words once on the driver; `stopWords` stays a plain collection for
// any other code that uses it directly.
val stopWords = loadStopWords("stopwords.txt")

// Keep the Broadcast handle itself. Calling `.value` on the driver and closing over
// the result would put the whole set into the task closure and bypass the broadcast.
val bStopWords = sc.broadcast(stopWords)

val lemmatized = plainText.mapPartitions(iter => {
  // One pipeline per partition, shared by every document in that partition.
  val pipeline = createNLPPipeline()
  // Resolve the broadcast inside the task, once per partition.
  val partitionStopWords = bStopWords.value
  iter.map { case (title, contents) =>
    (title, plainTextToLemmas(contents, partitionStopWords, pipeline))
  }
})

3 TF-IDF

/**
 * Builds TF-IDF vectors for a collection of documents.
 *
 * Steps, as performed below:
 *  1. count term occurrences per document (the result is cached);
 *  2. assign each document key an id with `zipWithUniqueId` and collect the
 *     id -> key map to the driver;
 *  3. obtain document frequencies from `documentFrequenciesDistributed`, print how
 *     many terms were returned and pass them to `saveDocFreqs("docfreqs.tsv", ...)`;
 *  4. derive IDF weights and give every retained term a vector index;
 *  5. score each retained term of a document as
 *     `idf(term) * countInDoc / totalTermsInDoc` and build a sparse vector.
 *
 * The IDF and term-index maps are shipped to executors as broadcast variables and
 * are dereferenced inside the task. Calling `.value` on the driver would instead
 * capture the plain maps in the closure and defeat the broadcast.
 *
 * @param docs      (document key, terms) pairs
 * @param stopWords not used by this function; kept for signature compatibility
 * @param numTerms  number of terms requested from `documentFrequenciesDistributed`
 * @param sc        Spark context used to create the broadcast variables
 * @return (TF-IDF vectors, vector index -> term, document id -> document key,
 *         term -> IDF weight)
 */
def documentTermMatrix(docs: RDD[(String, Seq[String])], stopWords: Set[String], numTerms: Int,
    sc: SparkContext): (RDD[Vector], Map[Int, String], Map[Long, String], Map[String, Double]) = {
  val docTermFreqs = docs.mapValues(terms => {
    val termFreqsInDoc = terms.foldLeft(new HashMap[String, Int]()) {
      (map, term) => map += term -> (map.getOrElse(term, 0) + 1)
    }
    termFreqsInDoc
  })

  // Reused below for the ids, the document frequencies and the final vectors.
  docTermFreqs.cache()
  val docIds = docTermFreqs.map(_._1).zipWithUniqueId().map(_.swap).collectAsMap()

  val docFreqs = documentFrequenciesDistributed(docTermFreqs.map(_._2), numTerms)
  println("Number of terms: " + docFreqs.size)
  saveDocFreqs("docfreqs.tsv", docFreqs)

  val numDocs = docIds.size

  val idfs = inverseDocumentFrequencies(docFreqs, numDocs)

  // Maps terms to their indices in the vector
  val idTerms = idfs.keys.zipWithIndex.toMap
  // Inverse lookup: vector index -> term
  val termIds = idTerms.map(_.swap)

  // Hold on to the Broadcast handles; `.value` is only called inside the task below.
  val bIdfs = sc.broadcast(idfs)
  val bIdTerms = sc.broadcast(idTerms)

  val vecs = docTermFreqs.map(_._2).map(termFreqs => {
    val termToIdf = bIdfs.value
    val termToIndex = bIdTerms.value
    val docTotalTerms = termFreqs.values.sum
    val termScores = termFreqs.filter {
      // Terms outside the retained vocabulary have no vector index.
      case (term, freq) => termToIndex.contains(term)
    }.map {
      case (term, freq) => (termToIndex(term), termToIdf(term) * freq / docTotalTerms)
    }.toSeq
    Vectors.sparse(termToIndex.size, termScores)
  })
  (vecs, termIds, docIds, idfs)
}

 /**
  * Counts, for every term, the number of documents whose term-frequency map
  * contains it, using a single `aggregate` over the RDD.
  *
  * @param docTermFreqs one term -> count map per document
  * @return term -> number of documents containing the term
  */
 def documentFrequencies(docTermFreqs: RDD[HashMap[String, Int]]): HashMap[String, Int] = {
   // Adds one document to the running counts: +1 for each distinct term it contains.
   def addDocument(counts: HashMap[String, Int], docTerms: HashMap[String, Int])
     : HashMap[String, Int] = {
     for (term <- docTerms.keySet) {
       counts += term -> (counts.getOrElse(term, 0) + 1)
     }
     counts
   }

   // Folds the second partial result into the first by summing per-term counts.
   def mergeCounts(left: HashMap[String, Int], right: HashMap[String, Int])
     : HashMap[String, Int] = {
     right.foreach { case (term, count) =>
       left += term -> (left.getOrElse(term, 0) + count)
     }
     left
   }

   docTermFreqs.aggregate(new HashMap[String, Int]())(addDocument, mergeCounts)
 }

 /**
  * Computes document frequencies with a distributed `reduceByKey` and returns the
  * `numTerms` entries with the highest counts.
  *
  * Each document contributes 1 for every distinct term in its map. The result is
  * ordered by count only.
  *
  * @param docTermFreqs  one term -> count map per document
  * @param numTerms      how many (term, document frequency) pairs to return
  * @param numPartitions partition count passed to `reduceByKey`; defaults to 15,
  *                      the value that was previously hard-coded
  * @return the `numTerms` pairs with the largest document frequency
  */
 def documentFrequenciesDistributed(docTermFreqs: RDD[HashMap[String, Int]], numTerms: Int,
     numPartitions: Int = 15): Array[(String, Int)] = {
   val docFreqs = docTermFreqs.flatMap(_.keySet).map((_, 1)).reduceByKey(_ + _, numPartitions)
   val ordering = Ordering.by[(String, Int), Int](_._2)
   docFreqs.top(numTerms)(ordering)
 }

 /**
  * Drops the least frequent entries, keeping at most `numToKeep` entries with the
  * highest counts.
  *
  * The entries are sorted by descending count before truncation. An ascending sort
  * would retain the least frequent entries, the opposite of what the name and
  * `numToKeep` describe.
  *
  * @param freqs     term -> count
  * @param numToKeep maximum number of entries to retain; values at or below zero
  *                  give an empty map
  * @return the retained entries
  */
 def trimLeastFrequent(freqs: Map[String, Int], numToKeep: Int): Map[String, Int] = {
   val byDescendingCount = Ordering.by[(String, Int), Int](_._2).reverse
   // `take` already clamps to the collection size, so no explicit `min` is needed.
   freqs.toArray.sorted(byDescendingCount).take(numToKeep).toMap
 }

 /**
  * Converts document frequencies to inverse document frequencies:
  * `idf(term) = ln(numDocs / documentFrequency(term))`.
  *
  * @param docFreqs (term, number of documents containing the term) pairs
  * @param numDocs  total number of documents
  * @return term -> IDF weight
  */
 def inverseDocumentFrequencies(docFreqs: Array[(String, Int)], numDocs: Int)
   : Map[String, Double] = {
   // Convert once so the division below is done in floating point.
   val totalDocs = numDocs.toDouble
   val termIdfPairs = docFreqs.map {
     case (term, docCount) => term -> math.log(totalDocs / docCount)
   }
   termIdfPairs.toMap
 }

4 奇异值分解

// Cache the input rows before they are handed to the decomposition.
termDocMatrix.cache()

// Wrap the rows in a RowMatrix and run the SVD with k, asking for U as well.
val mat = new RowMatrix(termDocMatrix)
val svd = mat.computeSVD(k, computeU = true)

/**
 * For each of the first `numConcepts` columns of V, returns the `numTerms` entries
 * with the largest weights, each paired with the term looked up in `termIds`.
 *
 * @param svd         decomposition whose V factor is inspected
 * @param numConcepts number of leading columns of V to report
 * @param numTerms    number of terms to return per column
 * @param termIds     row index in V -> term
 * @return one sequence of (term, weight) per concept, highest weight first
 */
def topTermsInTopConcepts(svd: SingularValueDecomposition[RowMatrix, Matrix], numConcepts: Int,
    numTerms: Int, termIds: Map[Int, String]): Seq[Seq[(String, Double)]] = {
  val vMatrix = svd.V
  val rowsPerColumn = vMatrix.numRows
  val flatV = vMatrix.toArray

  (0 until numConcepts).map { concept =>
    // Each concept occupies one run of `numRows` values in the flattened array.
    val start = concept * rowsPerColumn
    val weightsWithIndex = flatV.slice(start, start + rowsPerColumn).zipWithIndex
    val best: Seq[(String, Double)] = weightsWithIndex
      .sortBy { case (weight, _) => -weight }
      .take(numTerms)
      .map { case (weight, termIndex) => (termIds(termIndex), weight) }
    best
  }
}

/**
 * For each of the first `numConcepts` columns of U, returns the `numDocs` rows with
 * the largest weights, each paired with the document looked up in `docIds`.
 *
 * Row ids come from `zipWithUniqueId` on the rows of U, and `docIds` is indexed by
 * those ids.
 *
 * @param svd         decomposition whose U factor is inspected
 * @param numConcepts number of leading columns of U to report
 * @param numDocs     number of documents to return per column
 * @param docIds      row id -> document name
 * @return one sequence of (document, weight) per concept, highest weight first
 */
def topDocsInTopConcepts(svd: SingularValueDecomposition[RowMatrix, Matrix], numConcepts: Int,
    numDocs: Int, docIds: Map[Long, String]): Seq[Seq[(String, Double)]] = {
  val docRows = svd.U.rows

  (0 until numConcepts).map { concept =>
    // Pair each row's weight in this column with the row's unique id.
    val weightsWithId = docRows.map(docRow => docRow.toArray(concept)).zipWithUniqueId()
    val best: Seq[(String, Double)] = weightsWithId
      .top(numDocs)
      .map { case (weight, docId) => (docIds(docId), weight) }
    best
  }
}

// Top 10 terms and top 10 documents for each of the first 10 concepts.
val topConceptTerms = topTermsInTopConcepts(svd, 10, 10, termIds)
val topConceptDocs = topDocsInTopConcepts(svd, 10, 10, docIds)

// Print each concept's term names and document names, followed by a blank line.
topConceptTerms.zip(topConceptDocs).foreach { case (terms, docs) =>
  val termNames = terms.map(_._1)
  val docNames = docs.map(_._1)
  println("Concept terms: " + termNames.mkString(", "))
  println("Concept docs: " + docNames.mkString(", "))
  println()
}

5 相关度

import breeze.linalg.{DenseMatrix => BDenseMatrix, DenseVector => BDenseVector,

SparseVector => BSparseVector}

/**
 * Scores every term against the given term and returns the ten best matches.
 *
 * @param normalizedVS matrix with one row per term
 * @param termId       row index of the query term
 * @return up to ten (score, term index) pairs, highest score first
 */
def topTermsForTerm(normalizedVS: BDenseMatrix[Double], termId: Int): Seq[(Double, Int)] = {
  // Extract the query term's row and wrap it as a Breeze vector.
  val queryRow = row(normalizedVS, termId).toArray
  val queryVec = new BDenseVector[Double](queryRow)

  // Matrix-vector product: one score per row, paired with that row's index.
  val scoresWithIndex = (normalizedVS * queryVec).toArray.zipWithIndex

  // Sort by descending score and keep the first ten.
  scoresWithIndex.sortBy { case (score, _) => -score }.take(10)
}

/**
 * Scores every document against the given document and returns the ten best matches.
 *
 * @param normalizedUS matrix with one row per document
 * @param docId        id of the query document's row
 * @return up to ten (score, row id) pairs, highest score first
 */
def topDocsForDoc(normalizedUS: RowMatrix, docId: Long): Seq[(Double, Long)] = {
  // Fetch the query document's row and turn it into a single-column matrix.
  val queryRow = row(normalizedUS, docId)
  val queryColumn = Matrices.dense(queryRow.length, 1, queryRow)

  // Multiply every row by the query column, then tag each score with its row id.
  val scoresWithId = normalizedUS
    .multiply(queryColumn)
    .rows
    .map(scoreRow => scoreRow.toArray(0))
    .zipWithUniqueId()

  // Docs can end up with NaN score if their row in U is all zeros.  Filter these out.
  scoresWithId.filter { case (score, _) => !score.isNaN }.top(10)
}

/**
 * Scores every document against the given term and returns the ten best matches.
 *
 * @param US     matrix with one row per document
 * @param V      matrix with one row per term
 * @param termId row index of the query term in `V`
 * @return up to ten (score, row id) pairs, highest score first
 */
def topDocsForTerm(US: RowMatrix, V: Matrix, termId: Int): Seq[(Double, Long)] = {
  // Take the term's row of V and reshape it into a single-column matrix.
  val termRow = row(V, termId).toArray
  val termColumn = Matrices.dense(termRow.length, 1, termRow)

  // Multiply every document row by that column, tag scores with row ids, keep the top ten.
  US.multiply(termColumn)
    .rows
    .map(scoreRow => scoreRow.toArray(0))
    .zipWithUniqueId()
    .top(10)
}

多词项查询

/**
 * Builds a sparse query vector holding the IDF weight of each query term at that
 * term's vector index.
 *
 * The (index, weight) pairs are merged per index and sorted by index before the
 * vector is constructed. Query terms arrive in arbitrary order and may repeat, so
 * passing them straight through could hand the sparse vector unsorted or duplicated
 * indices. A repeated term contributes the sum of its weights.
 * NOTE(review): this assumes Breeze's SparseVector expects strictly increasing
 * indices — confirm against the Breeze version in use.
 *
 * @param terms   query terms; each must be a key of both `idTerms` and `idfs`,
 *                otherwise the map lookup throws as before
 * @param idTerms term -> vector index
 * @param idfs    term -> IDF weight
 * @return sparse vector of length `idTerms.size`
 */
def termsToQueryVector(terms: Seq[String], idTerms: Map[String, Int], idfs: Map[String, Double])
  : BSparseVector[Double] = {
  val weightByIndex = terms
    .map(term => (idTerms(term), idfs(term)))
    .groupBy(_._1)
    .map { case (index, entries) => (index, entries.map(_._2).sum) }
    .toSeq
    .sortBy(_._1)

  val indices = weightByIndex.map(_._1).toArray
  val values = weightByIndex.map(_._2).toArray
  new BSparseVector[Double](indices, values, idTerms.size)
}

/**
 * Scores every document against a multi-term query vector and returns the ten best
 * matches.
 *
 * @param US    matrix with one row per document
 * @param V     matrix with one row per term
 * @param query sparse query vector over the term indices
 * @return up to ten (score, row id) pairs, highest score first
 */
def topDocsForTermQuery(US: RowMatrix, V: Matrix, query: BSparseVector[Double])
  : Seq[(Double, Long)] = {
  // Copy V into a Breeze matrix so its transpose can be multiplied by the query.
  val vDense = new BDenseMatrix[Double](V.numRows, V.numCols, V.toArray)
  val projected = (vDense.t * query).toArray
  val projectedColumn = Matrices.dense(projected.length, 1, projected)

  // Multiply every document row by that column, tag scores with row ids, keep the top ten.
  US.multiply(projectedColumn)
    .rows
    .map(scoreRow => scoreRow.toArray(0))
    .zipWithUniqueId()
    .top(10)
}

Spark高级数据分析· 6LSA的更多相关文章

Spark高级数据分析——纽约出租车轨迹的空间和时间数据分析
Spark高级数据分析--纽约出租车轨迹的空间和时间数据分析一.地理空间分析: 二.pom.xml 原文地址:https://www.jianshu.com/p/eb6f3e0c09b5 作者:II ...
Spark高级数据分析· 3推荐引擎
推荐算法流程推荐算法预备 wget http://www.iro.umontreal.ca/~lisa/datasets/profiledata_06-May-2005.tar.gz cd /Us ...
Spark高级数据分析-第2章用Scala和Spark进行数据分析
2.4 小试牛刀:Spark shell和SparkContext 本章使用的资料来自加州大学欧文分校机器学习资料库(UC Irvine Machine Learning Repository),这个 ...
Spark高级数据分析中文版-读者交流
第二章: 备注:1.本书第二章样例数据由于采用的是短链接,国内的用户可能无法下载.我把数据集拷贝到百度网盘上.大家可以从这个地方下载:http://pan.baidu.com/s/1pJvjHA7 谢 ...
Spark高级数据分析· 2数据分析
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip 数据清洗 cd /Users/eri ...
0-Spark高级数据分析-读书笔记
学完了《Spark快速大数据分析》,对Spark有了一些了解,计划更进一步,开始学习《Spark高级数据分析》.这本书是用Scala写的,在学习的过程中想把其中的代码转换成Ja ...
Spark 实践——用 Scala 和 Spark 进行数据分析
本文基于<Spark 高级数据分析>第2章用Scala和Spark进行数据分析. 完整代码见 https://github.com/libaoquan95/aasPractice/tre ...
Apache Spark大数据分析入门（一）
摘要:Apache Spark的出现让普通人也具备了大数据及实时数据分析能力.鉴于此,本文通过动手实战操作演示带领大家快速地入门学习Spark.本文是Apache Spark入门系列教程(共四部分)的 ...
大数据学习：Spark是什么，如何用Spark进行数据分析
给大家分享一下Spark是什么?如何用Spark进行数据分析,对大数据感兴趣的小伙伴就随着小编一起来了解一下吧. 大数据在线学习什么是Apache Spark? Apache Spark是一 ...

随机推荐

Spring学习笔记--使用Spring基于Java的配置
我们需要使用@Component注解来定义一个配置类,在配置类中我们定义Bean: package com.moonlit.myspring; import org.springframework.c ...
Android遍历SqlLite cursor对象：
//1. Cursor c =...; for(c.moveToFirst(); ! c.isAfterLast(); c.moveToNext()){ //c… } //2. Cursor curs ...
oracle导入sql文件
oracle导入sql文件: 1.进入到sql文件目录下,登录需要导入文件的用户打开cmd,输入以下命令,进入oracle, sqlplus username/password username:需 ...
[报错]编译报错：clang: error: linker command failed with exit code 1及duplicate symbol xxxx in错误解决方法之一
今天添加了一个新类(包括m,h,xib文件),还没有调用,—编译遇到如下错误,根据错误提示, duplicate symbol param1 in: /Users/xxxx/Library/Devel ...
记录一次gitlab->github企业版的迁移
cd到你想要存放新的工程的文件夹内, 1.使用git clone --mirror命令制作旧git的镜像 $ git clone --mirror git@git.aaaa.com:mario/my- ...
mysql数据库基本知识，简单框架
https://www.cnblogs.com/geaozhang/p/7347950.html
【转】Spring Boot 日志配置(超详细)
更新日志: 20170810 更新通过 application.yml传递参数到 logback 中. [toc] 简书不支持目录,截图一张. image.png 默认日志 Logback: 默认情况 ...
MapReduce的核心编程思想
1.MapReduce的核心编程思想 2.yarn集群工作机制 3.maptask并行度与决定机制 4.maptask工作机制 5.MapReduce整体流程 6.shuffle机制 7.yarn架构
关于sails 初学者常见问题汇总
http://sailsdoc.swift.ren/ 这里有 sails中文文档一.安装时: 先装nodejs,成功标志 node -v 安装sails 全局安装 node install sail ...
Kafka笔记整理（一）
Kafka简介消息队列(Message Queue) 消息 Message 网络中的两台计算机或者两个通讯设备之间传递的数据.例如说:文本.音乐.视频等内容. 队列 Queue 一种特殊的线性表(数 ...

Spark高级数据分析· 6LSA

1 获取数据

2 词形归并

3 TF-IDF

4 奇异值分解

5 相关度

多词项查询

Spark高级数据分析· 6LSA的更多相关文章

随机推荐

热门专题