Spark高级数据分析· 6LSA

wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2
1 获取数据
/**
 * Reads XML input through the Hadoop new-API `XmlInputFormat` and returns the text of
 * each record as a string.
 *
 * @param path location handed to `newAPIHadoopFile`
 * @param sc   Spark context used to create the RDD
 * @return one string per record value
 */
def readFile(path: String, sc: SparkContext): RDD[String] = {
  // Records are delimited by the <page> ... </page> tags configured here.
  val hadoopConf = new Configuration()
  hadoopConf.set(XmlInputFormat.START_TAG_KEY, "<page>")
  hadoopConf.set(XmlInputFormat.END_TAG_KEY, "</page>")

  sc.newAPIHadoopFile(
      path,
      classOf[XmlInputFormat],
      classOf[LongWritable],
      classOf[Text],
      hadoopConf)
    .map(record => record._2.toString)
}
/**
 * Parses the XML of a single Wikipedia page.
 *
 * @param pageXml XML text of one page
 * @return `Some((title, content))`, or `None` when the page is empty, is not an article,
 *         is a redirect, or has "(disambiguation)" in its title
 */
def wikiXmlToPlainText(pageXml: String): Option[(String, String)] = {
  val page = new EnglishWikipediaPage()
  WikipediaPage.readPage(page, pageXml)

  // Same checks, in the same short-circuit order, expressed as the positive condition.
  val isUsableArticle =
    !page.isEmpty && page.isArticle && !page.isRedirect &&
      !page.getTitle.contains("(disambiguation)")

  if (isUsableArticle) Some((page.getTitle, page.getContent)) else None
}
// Read the dump from HDFS and keep a sample drawn without replacement using a fixed seed;
// sampleSize is passed as the sampling fraction.
val pages = readFile("hdfs:///user/ds/Wikipedia/", sc)
  .sample(withReplacement = false, fraction = sampleSize, seed = 11L)
// Drop null records, then keep only the pages that convert to a (title, content) pair.
val plainText = pages
  .filter(xml => xml != null)
  .flatMap(xml => wikiXmlToPlainText(xml))
2 词形归并
/** Creates a CoreNLP pipeline configured with the tokenize, ssplit, pos and lemma annotators. */
def createNLPPipeline(): StanfordCoreNLP = {
  val settings = new Properties()
  settings.setProperty("annotators", "tokenize, ssplit, pos, lemma")
  val pipeline = new StanfordCoreNLP(settings)
  pipeline
}
/**
 * Tells whether every character of `str` is a letter according to `Character.isLetter`.
 * The empty string yields `true`.
 */
def isOnlyLetters(str: String): Boolean = {
  // Deliberately an index-based scan (no allocation); it stops at the first non-letter.
  val len = str.length
  var pos = 0
  while (pos < len && Character.isLetter(str.charAt(pos))) {
    pos += 1
  }
  pos == len
}
/**
 * Annotates `text` with the given CoreNLP pipeline and returns the lower-cased lemmas of
 * its tokens, in the order the tokens appear.
 *
 * A lemma is kept only when it is longer than two characters, consists solely of letters,
 * and is not a stop word. The stop-word test is applied both to the lemma as produced by
 * the pipeline and to its lower-cased form, so a capitalised lemma cannot slip past a
 * lower-case stop-word list and then be emitted in lower case.
 *
 * @param text      plain text of one document
 * @param stopWords words to drop from the result
 * @param pipeline  pipeline providing sentence, token and lemma annotations
 * @return the lemmas that passed all filters, lower-cased
 */
def plainTextToLemmas(text: String, stopWords: Set[String], pipeline: StanfordCoreNLP)
  : Seq[String] = {
  val doc = new Annotation(text)
  pipeline.annotate(doc)
  val lemmas = new ArrayBuffer[String]()
  val sentences = doc.get(classOf[SentencesAnnotation])
  for (sentence <- sentences.asScala;
       token <- sentence.get(classOf[TokensAnnotation]).asScala) {
    val lemma = token.get(classOf[LemmaAnnotation])
    // The emitted form; also checked against the stop words (see the doc comment).
    val lowered = lemma.toLowerCase
    val isStopWord = stopWords.contains(lemma) || stopWords.contains(lowered)
    if (lemma.length > 2 && !isStopWord && isOnlyLetters(lemma)) {
      lemmas += lowered
    }
  }
  lemmas
}
// Keep the Broadcast handle. Calling .value on the driver and capturing the resulting Set
// in the closure would ship the whole set with every task instead of using the broadcast.
val bStopWords = sc.broadcast(loadStopWords("stopwords.txt"))
// Driver-side view of the same set, kept under the original name for code that expects a Set.
val stopWords = bStopWords.value
val lemmatized = plainText.mapPartitions { iter =>
  // One pipeline per partition rather than one per document.
  val pipeline = createNLPPipeline()
  // Resolved on the executor, once per partition.
  val partitionStopWords = bStopWords.value
  iter.map { case (title, contents) =>
    (title, plainTextToLemmas(contents, partitionStopWords, pipeline))
  }
}
3 TF-IDF
/**
 * Builds a TF-IDF weighted document-term matrix from lemmatized documents.
 *
 * Side effects: caches the per-document term-frequency RDD, prints the number of retained
 * terms, and calls `saveDocFreqs("docfreqs.tsv", ...)` with the document frequencies.
 *
 * @param docs      (title, terms) pairs, one per document
 * @param stopWords not read by this function; kept so existing callers keep compiling
 * @param numTerms  number of terms requested from `documentFrequenciesDistributed`
 * @param sc        Spark context used to broadcast the lookup tables
 * @return a 4-tuple of: one sparse TF-IDF vector per document; term index -> term;
 *         document id (from `zipWithUniqueId`) -> title; term -> inverse document frequency
 */
def documentTermMatrix(docs: RDD[(String, Seq[String])], stopWords: Set[String], numTerms: Int,
    sc: SparkContext): (RDD[Vector], Map[Int, String], Map[Long, String], Map[String, Double]) = {
  // Count how often each term occurs within each document.
  val docTermFreqs = docs.mapValues(terms => {
    val termFreqsInDoc = terms.foldLeft(new HashMap[String, Int]()) {
      (map, term) => map += term -> (map.getOrElse(term, 0) + 1)
    }
    termFreqsInDoc
  })
  // Reused below for the ids, the document frequencies and the vectors.
  docTermFreqs.cache()
  val docIds = docTermFreqs.map(_._1).zipWithUniqueId().map(_.swap).collectAsMap()
  val docFreqs = documentFrequenciesDistributed(docTermFreqs.map(_._2), numTerms)
  println("Number of terms: " + docFreqs.size)
  saveDocFreqs("docfreqs.tsv", docFreqs)
  val numDocs = docIds.size
  val idfs = inverseDocumentFrequencies(docFreqs, numDocs)
  // Despite the names: idTerms maps term -> vector index, termIds maps vector index -> term.
  val idTerms = idfs.keys.zipWithIndex.toMap
  val termIds = idTerms.map(_.swap)
  // Keep the Broadcast handles and dereference them inside the task. Taking .value here on
  // the driver would capture the plain maps in the closure and bypass the broadcast.
  val bIdfs = sc.broadcast(idfs)
  val bIdTerms = sc.broadcast(idTerms)
  val vecs = docTermFreqs.map(_._2).map(termFreqs => {
    val localIdfs = bIdfs.value
    val localIdTerms = bIdTerms.value
    val docTotalTerms = termFreqs.values.sum
    // Terms that were not retained have no index and are skipped.
    val termScores = termFreqs.filter {
      case (term, freq) => localIdTerms.contains(term)
    }.map {
      case (term, freq) => (localIdTerms(term), localIdfs(term) * freq / docTotalTerms)
    }.toSeq
    Vectors.sparse(localIdTerms.size, termScores)
  })
  (vecs, termIds, docIds, idfs)
}
/**
 * Counts, for every term, how many of the per-document term-frequency maps contain it,
 * using `RDD.aggregate` with mutable maps as accumulators.
 *
 * @param docTermFreqs one term -> frequency map per document
 * @return term -> number of documents containing the term
 */
def documentFrequencies(docTermFreqs: RDD[HashMap[String, Int]]): HashMap[String, Int] = {
  // Adds one to the count of every distinct term of a single document.
  def addDocument(counts: HashMap[String, Int], doc: HashMap[String, Int])
    : HashMap[String, Int] = {
    for (term <- doc.keySet) {
      counts += term -> (counts.getOrElse(term, 0) + 1)
    }
    counts
  }

  // Folds the counts of `right` into `left` and returns `left`.
  def mergeCounts(left: HashMap[String, Int], right: HashMap[String, Int])
    : HashMap[String, Int] = {
    right.foreach { case (term, count) =>
      left += term -> (left.getOrElse(term, 0) + count)
    }
    left
  }

  val empty = new HashMap[String, Int]()
  docTermFreqs.aggregate(empty)(addDocument, mergeCounts)
}
/**
 * Computes document frequencies with a distributed `reduceByKey` (15 partitions) and
 * returns the `numTerms` terms with the highest counts.
 *
 * @param docTermFreqs one term -> frequency map per document
 * @param numTerms     number of (term, count) pairs to return
 * @return the top `numTerms` (term, document count) pairs as ordered by count
 */
def documentFrequenciesDistributed(docTermFreqs: RDD[HashMap[String, Int]], numTerms: Int)
  : Array[(String, Int)] = {
  // Rank pairs by their count only.
  val byCount = Ordering.by[(String, Int), Int] { case (_, count) => count }
  docTermFreqs
    .flatMap(termFreqs => termFreqs.keySet)
    .map(term => (term, 1))
    .reduceByKey((a, b) => a + b, 15)
    .top(numTerms)(byCount)
}
/**
 * Drops the least frequent entries, keeping the `numToKeep` entries with the highest counts.
 *
 * The sort is descending by count: an ascending sort followed by `take` would keep the
 * least frequent terms, which is the opposite of what the name and `numToKeep` promise.
 *
 * @param freqs     term -> count
 * @param numToKeep maximum number of entries to retain; values at or below zero yield an
 *                  empty map, values above `freqs.size` keep everything
 * @return the retained entries
 */
def trimLeastFrequent(freqs: Map[String, Int], numToKeep: Int): Map[String, Int] = {
  freqs.toArray
    .sortBy(_._2)(Ordering[Int].reverse)
    .take(numToKeep)
    .toMap
}
/**
 * Converts document frequencies into inverse document frequencies,
 * `log(numDocs / count)` for each term.
 *
 * @param docFreqs (term, number of documents containing the term) pairs
 * @param numDocs  total number of documents
 * @return term -> inverse document frequency
 */
def inverseDocumentFrequencies(docFreqs: Array[(String, Int)], numDocs: Int)
  : Map[String, Double] = {
  val totalDocs = numDocs.toDouble
  val termIdfs =
    for ((term, docCount) <- docFreqs) yield term -> math.log(totalDocs / docCount)
  termIdfs.toMap
}
4 奇异值分解
// Cache the TF-IDF vectors before wrapping them in a RowMatrix.
// NOTE(review): presumably because computeSVD makes several passes over the rows - confirm.
termDocMatrix.cache()
val mat = new RowMatrix(termDocMatrix)
// Request k singular values and also compute U, which the document-side queries
// below read via svd.U. k is defined outside this snippet.
val svd = mat.computeSVD(k, computeU=true)
/**
 * For each of the first `numConcepts` columns of V, returns the `numTerms` terms with the
 * largest weights.
 *
 * @param svd         decomposition whose V matrix is read
 * @param numConcepts number of leading columns (concepts) to report
 * @param numTerms    number of terms to return per concept
 * @param termIds     term index -> term
 * @return one sequence of (term, weight) per concept, highest weight first
 */
def topTermsInTopConcepts(svd: SingularValueDecomposition[RowMatrix, Matrix], numConcepts: Int,
    numTerms: Int, termIds: Map[Int, String]): Seq[Seq[(String, Double)]] = {
  val v = svd.V
  val rowsPerConcept = v.numRows
  // The flat array is read as consecutive blocks of numRows weights, one block per concept.
  val flatWeights = v.toArray
  (0 until numConcepts).map { concept =>
    val start = concept * rowsPerConcept
    val ranked = flatWeights
      .slice(start, start + rowsPerConcept)
      .zipWithIndex
      .sortBy { case (weight, _) => -weight }
    val best: Seq[(String, Double)] =
      ranked.take(numTerms).map { case (weight, termId) => (termIds(termId), weight) }
    best
  }
}
/**
 * For each of the first `numConcepts` columns of U, returns the `numDocs` documents with the
 * largest weights. Runs one distributed `top` per concept.
 *
 * @param svd         decomposition whose U matrix is read
 * @param numConcepts number of leading columns (concepts) to report
 * @param numDocs     number of documents to return per concept
 * @param docIds      document id (as assigned by `zipWithUniqueId`) -> title
 * @return one sequence of (title, weight) per concept, highest weight first
 */
def topDocsInTopConcepts(svd: SingularValueDecomposition[RowMatrix, Matrix], numConcepts: Int,
    numDocs: Int, docIds: Map[Long, String]): Seq[Seq[(String, Double)]] = {
  val u = svd.U
  (0 until numConcepts).map { concept =>
    // Pair each row's weight for this concept with the row's unique id.
    val weightedDocs = u.rows.map(row => row.toArray(concept)).zipWithUniqueId
    val best: Seq[(String, Double)] =
      weightedDocs.top(numDocs).map { case (weight, id) => (docIds(id), weight) }
    best
  }
}
// Print the ten strongest terms and documents for each of the first ten concepts.
val topConceptTerms = topTermsInTopConcepts(svd, 10, 10, termIds)
val topConceptDocs = topDocsInTopConcepts(svd, 10, 10, docIds)
topConceptTerms.zip(topConceptDocs).foreach { case (terms, docs) =>
  val termNames = terms.map(_._1).mkString(", ")
  val docTitles = docs.map(_._1).mkString(", ")
  println("Concept terms: " + termNames)
  println("Concept docs: " + docTitles)
  println()
}
5 相关度
import breeze.linalg.{DenseMatrix => BDenseMatrix, DenseVector => BDenseVector,
SparseVector => BSparseVector}
/**
 * Scores every term against the given term and returns the ten best matches.
 *
 * @param normalizedVS matrix with one row per term
 * @param termId       index of the query term's row
 * @return up to ten (score, term index) pairs, highest score first
 */
def topTermsForTerm(normalizedVS: BDenseMatrix[Double], termId: Int): Seq[(Double, Int)] = {
  // Row of the query term, as a dense vector.
  val queryRow = new BDenseVector[Double](row(normalizedVS, termId).toArray)
  // One score per term: the product of the matrix with the query row.
  val scored = (normalizedVS * queryRow).toArray.zipWithIndex
  val ranked = scored.sortBy { case (score, _) => -score }
  ranked.take(10)
}
/**
 * Scores every document against the given document and returns the ten best matches.
 *
 * @param normalizedUS row matrix with one row per document
 * @param docId        id of the query document's row
 * @return up to ten (score, document id) pairs, highest first
 */
def topDocsForDoc(normalizedUS: RowMatrix, docId: Long): Seq[(Double, Long)] = {
  // The query document's row, reshaped into a single-column local matrix.
  val queryRow = row(normalizedUS, docId)
  val queryColumn = Matrices.dense(queryRow.length, 1, queryRow)
  normalizedUS
    .multiply(queryColumn)
    .rows
    .map(scoreRow => scoreRow.toArray(0))
    .zipWithUniqueId
    // A document whose row in U is all zeros can end up with a NaN score; drop those.
    .filter { case (score, _) => !score.isNaN }
    .top(10)
}
/**
 * Scores every document against the given term and returns the ten best matches.
 *
 * @param US     row matrix with one row per document
 * @param V      matrix with one row per term
 * @param termId index of the query term's row in `V`
 * @return up to ten (score, document id) pairs, highest first
 */
def topDocsForTerm(US: RowMatrix, V: Matrix, termId: Int): Seq[(Double, Long)] = {
  // The query term's row of V, reshaped into a single-column local matrix.
  val queryRow = row(V, termId).toArray
  val queryColumn = Matrices.dense(queryRow.length, 1, queryRow)
  US.multiply(queryColumn)
    .rows
    .map(scoreRow => scoreRow.toArray(0))
    .zipWithUniqueId
    .top(10)
}
多词项查询
/**
 * Turns a bag of query terms into a sparse vector over the term space, weighting each term
 * by its inverse document frequency.
 *
 * Breeze sparse vectors expect their index array to be sorted and free of duplicates, so
 * the occurrences are grouped by term index and sorted before the vector is built. A term
 * that appears several times contributes its IDF once per occurrence.
 *
 * @param terms   query terms; each must be a key of both `idTerms` and `idfs`
 * @param idTerms term -> index in the term space
 * @param idfs    term -> inverse document frequency
 * @return sparse vector of length `idTerms.size`
 * @throws NoSuchElementException if a term is missing from `idTerms` or `idfs` (for maps
 *                                without a default value)
 */
def termsToQueryVector(terms: Seq[String], idTerms: Map[String, Int], idfs: Map[String, Double])
  : BSparseVector[Double] = {
  val weightByIndex = terms
    .map(term => (idTerms(term), idfs(term)))
    .groupBy(_._1)
    .map { case (index, occurrences) => (index, occurrences.map(_._2).sum) }
    .toArray
    .sortBy(_._1)
  val indices = weightByIndex.map(_._1)
  val values = weightByIndex.map(_._2)
  new BSparseVector[Double](indices, values, idTerms.size)
}
/**
 * Scores every document against a multi-term query vector and returns the ten best matches.
 *
 * @param US    row matrix with one row per document
 * @param V     matrix with one row per term
 * @param query sparse query vector over the term space
 * @return up to ten (score, document id) pairs, highest first
 */
def topDocsForTermQuery(US: RowMatrix, V: Matrix, query: BSparseVector[Double])
  : Seq[(Double, Long)] = {
  // Multiply the transpose of V by the query, then reshape the result into one column.
  val denseV = new BDenseMatrix[Double](V.numRows, V.numCols, V.toArray)
  val conceptWeights = (denseV.t * query).toArray
  val queryColumn = Matrices.dense(conceptWeights.length, 1, conceptWeights)
  US.multiply(queryColumn)
    .rows
    .map(scoreRow => scoreRow.toArray(0))
    .zipWithUniqueId
    .top(10)
}
Spark高级数据分析· 6LSA的更多相关文章
- Spark高级数据分析——纽约出租车轨迹的空间和时间数据分析
Spark高级数据分析--纽约出租车轨迹的空间和时间数据分析 一.地理空间分析: 二.pom.xml 原文地址:https://www.jianshu.com/p/eb6f3e0c09b5 作者:II ...
- Spark高级数据分析· 3推荐引擎
推荐算法流程 推荐算法 预备 wget http://www.iro.umontreal.ca/~lisa/datasets/profiledata_06-May-2005.tar.gz cd /Us ...
- Spark高级数据分析-第2章 用Scala和Spark进行数据分析
2.4 小试牛刀:Spark shell和SparkContext 本章使用的资料来自加州大学欧文分校机器学习资料库(UC Irvine Machine Learning Repository),这个 ...
- Spark高级数据分析中文版-读者交流
第二章: 备注:1.本书第二章样例数据由于才有的是短链接,国内的用户可能无法下载.我把数据集拷贝到百度网盘上.大家可以从这个地方下载:http://pan.baidu.com/s/1pJvjHA7 谢 ...
- Spark高级数据分析· 2数据分析
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip 数据清洗 cd /Users/eri ...
- 0-Spark高级数据分析-读书笔记
学完了<Spark快速大数据分析>,对Spark有了一些了解,计划更进一步,开始学习<Spark高级数据分析>.这本书是用Scala写的,在学习的过程中想把其中的代码转换成Ja ...
- Spark 实践——用 Scala 和 Spark 进行数据分析
本文基于<Spark 高级数据分析>第2章 用Scala和Spark进行数据分析. 完整代码见 https://github.com/libaoquan95/aasPractice/tre ...
- Apache Spark大数据分析入门(一)
摘要:Apache Spark的出现让普通人也具备了大数据及实时数据分析能力.鉴于此,本文通过动手实战操作演示带领大家快速地入门学习Spark.本文是Apache Spark入门系列教程(共四部分)的 ...
- 大数据学习:Spark是什么,如何用Spark进行数据分析
给大家分享一下Spark是什么?如何用Spark进行数据分析,对大数据感兴趣的小伙伴就随着小编一起来了解一下吧. 大数据在线学习 什么是Apache Spark? Apache Spark是一 ...
随机推荐
- ios 给UIImageView添加阴影
_borderView.layer.shadowColor = [UIColor grayColor].CGColor; _borderView.layer.shadowOffset = CGSize ...
- Java使用Apache POI进行Excel导入和导出
Manve依赖 <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> <dependency> ...
- 简述泛型、用Maven创建Web项目以及在Web项目上整合SpringMVC
表设计 Timestamp列是否取消"根据当前时间戳自动更新" 是否null及默认值选择合理不合理 外键命名规范及更新和删除时的动作是否合理 泛型 类型参数 --允许在外部指定 ...
- Ubuntu安装atom
sudo add-apt-repository ppa:webupd8team/atom sudo apt-get update sudo apt-get install atom 安装的时如果报错, ...
- 运用JS设置cookie、读取cookie、删除cookie
JS设置cookie: 假设在A页面中要保存变量username的值("jack")到cookie中,key值为name,则相应的JS代码为: document.cookie=&q ...
- #include <sys/epoll.h> epoll - I/O event notification facility 服务器端 epoll(7) - Linux manual page http://www.man7.org/linux/man-pages/man7/epoll.7.html
epoll使用详解(精髓) - Boblim - 博客园 https://www.cnblogs.com/fnlingnzb-learner/p/5835573.html epoll使用详解(精髓) ...
- Python并行编程(五):线程同步之信号量
1.基本概念 信号量是由操作系统管理的一种抽象数据类型,用于在多线程中同步对共享资源的使用.本质上说,信号量是一个内部数据,用于标明当前的共享资源可以有多少并发读取. 同样在threading中,信号 ...
- sql server常用性能计数器
https://blog.csdn.net/kk185800961/article/details/52462913?utm_source=blogxgwz5 https://blog.csdn.ne ...
- Web爬虫的C#请求发送
public class HttpControler { //post请求发送 private Encoding m_Encoding = Encoding.GetEncoding("gb2 ...
- 003-maven简介
1.1简介 Maven,知识的积累,专家或内行 Maven是优秀的构建工具,依赖管理工具,项目信息管理工具,跨平台.提供了中央仓库,自动下载构件. 1.通过坐标系统定位每一个构件(artifact), ...