Spark MLlib - LFW
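This walkthrough applies eigenfaces-style dimensionality reduction to the LFW-A subset of the Labeled Faces in the Wild dataset with Spark MLlib: load the image paths, convert each face to a small grayscale pixel vector, mean-center the data, extract principal components with PCA/SVD, and finally rank faces by cosine similarity in the projected space.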
val path = "/usr/data/lfw-a/*" val rdd = sc.wholeTextFiles(path) val first = rdd.first println(first)
val files = rdd.map { case (fileName, content) =>
fileName.replace("file:", "") }
println(files.first)
println(files.count)
%pyspark
import matplotlib.pyplot as plt
path = "/usr/data/lfw-a/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
ae = plt.imread(path)
plt.imshow(ae)
plt.show()
import java.awt.image.BufferedImage
def loadImageFromFile(path: String): BufferedImage = {
import javax.imageio.ImageIO
import java.io.File
ImageIO.read(new File(path))
}
val aePath = "/usr/data/lfw-a/Aaron_Eckhart/Aaron_Eckhart_0001.jpg" val aeImage = loadImageFromFile(aePath)
import java.awt.image
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
  val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
  val g = bwImage.getGraphics()
  // Draw the source image at the origin, scaled to the target width and height.
  g.drawImage(image, 0, 0, width, height, null)
  g.dispose()
  bwImage
}
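Drawing the source image into a TYPE_BYTE_GRAY buffer performs the grayscale conversion and the resize in a single step.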
val grayImage = processImage(aeImage, 100, 100)
import javax.imageio.ImageIO
import java.io.File
ImageIO.write(grayImage, "jpg", new File("/tmp/aeGray.jpg"))
%pyspark
import matplotlib.pyplot as plt
tmpPath = "/tmp/aeGray.jpg"
aeGray = plt.imread(tmpPath)
plt.imshow(aeGray, cmap=plt.cm.gray)
plt.show()
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
  val width = image.getWidth
  val height = image.getHeight
  val pixels = Array.ofDim[Double](width * height)
  // Read the full raster, starting at the origin, into a flat array of doubles.
  image.getData.getPixels(0, 0, width, height, pixels)
}
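For a grayscale image, getPixels yields one double per pixel, so each face becomes a flat feature vector of width * height dimensions.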
def extractPixels(path: String, width: Int, height: Int): Array[Double] = {
val raw = loadImageFromFile(path)
val processed = processImage(raw, width, height)
getPixelsFromImage(processed)
}
val pixels = files.map(f => extractPixels(f, 50, 50))
println(pixels.take(10).map(_.take(10).mkString("", ",", ", ...")).mkString("\n"))
import org.apache.spark.mllib.linalg.Vectors
val vectors = pixels.map(p => Vectors.dense(p))
vectors.setName("image-vectors")
vectors.cache
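Caching pays off here: the vectors are traversed once to fit the StandardScaler below and again to transform, so keeping them in memory avoids re-reading and re-decoding every image file.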
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
val scaledVectors = vectors.map(v => scaler.transform(v))
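With withMean = true and withStd = false, each pixel dimension is centered by subtracting its column mean but not rescaled, which is the standard preprocessing for PCA.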
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
val matrix = new RowMatrix(scaledVectors)
val K = 10
val pc = matrix.computePrincipalComponents(K)
val rows = pc.numRows
val cols = pc.numCols
println(rows, cols)
import breeze.linalg.DenseMatrix
val pcBreeze = new DenseMatrix(rows, cols, pc.toArray)
import breeze.linalg.csvwrite
csvwrite(new File("/tmp/pc.csv"), pcBreeze)
%pyspark
import numpy as np
import matplotlib.pyplot as plt
pcs = np.loadtxt("/tmp/pc.csv", delimiter=",")
print(pcs.shape)
%pyspark
import numpy as np
import matplotlib.pyplot as plt
def plot_gallery(images, h, w, n_row=2, n_col=5):
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[:, i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title("Eigenface %d" % (i + 1), size=12)
        plt.xticks(())
        plt.yticks(())
%pyspark
plot_gallery(pcs, 50, 50)
plt.show()
val projected = matrix.multiply(pc)
println(projected.numRows, projected.numCols)
println(projected.rows.take(5).mkString("\n"))
val svd = matrix.computeSVD(10, computeU = true)
println(s"U dimension: (${svd.U.numRows}, ${svd.U.numCols})")
println(s"S dimension: (${svd.s.size}, )")
println(s"V dimension: (${svd.V.numRows}, ${svd.V.numCols})")
def approxEqual(array1: Array[Double], array2: Array[Double],
  tolerance: Double = 1e-6): Boolean = {
  val bools = array1.zip(array2).map { case (v1, v2) =>
    // Compare absolute values, since SVD and PCA columns can differ by sign.
    math.abs(math.abs(v1) - math.abs(v2)) <= tolerance
  }
  bools.fold(true)(_ & _)
}
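With this helper, a quick sanity check (a sketch, reusing the k = 10 decomposition above) is that the right singular vectors match the principal components up to sign:

println(approxEqual(svd.V.toArray, pc.toArray))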
val breezeS = breeze.linalg.DenseVector(svd.s.toArray)
val projectedSVD = svd.U.rows.map { v =>
val breezeV = breeze.linalg.DenseVector(v.toArray)
val multV = breezeV :* breezeS
Vectors.dense(multV.data)
}
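Since the centered matrix factorizes as A = U S Vᵀ, the PCA projection A · V equals U · S, so the row-by-row comparison below should report a match for every image.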
projected.rows.zip(projectedSVD).map { case (v1, v2) =>
  approxEqual(v1.toArray, v2.toArray) }.filter(b => b).count
val sValues = (1 to 5).map { i => matrix.computeSVD(i, computeU = false).s }
sValues.foreach(println)
val svd300 = matrix.computeSVD(300, computeU = false)
val sMatrix = new DenseMatrix(1, 300, svd300.s.toArray)
csvwrite(new File("/tmp/s.csv"), sMatrix)
%pyspark
import numpy as np
import matplotlib.pyplot as plt
s = np.loadtxt("/tmp/s.csv", delimiter=",")
print(s.shape)
plt.plot(s)
plt.show()
%pyspark
import numpy as np
import matplotlib.pyplot as plt
s = np.loadtxt("/tmp/s.csv", delimiter=",")
plt.plot(np.cumsum(s))
plt.yscale('log')
plt.show()
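The cumulative plot shows how quickly the spectrum flattens: a handful of leading singular values captures most of the variation in the face data.

The same pipeline, packaged as a standalone Spark application, follows. After projecting the faces it ranks them by cosine similarity against the first projected face. Where the original listing lost its numeric literals, the values below (such as the take(10) sample size) are assumed placeholders.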
import org.apache.spark.{SparkConf, SparkContext}
import java.awt.image.BufferedImage
import javax.imageio.ImageIO
import java.io.File
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
import breeze.linalg.DenseMatrix
import breeze.linalg.csvwrite
import org.jblas.DoubleMatrix
/**
* Created by ysp on 16-10-30.
*/
object LFW {
def main(args: Array[String]) {
val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("lfw"))
val path = "/usr/data/lfw-a/*"
val rdd = sc.wholeTextFiles(path)
val files = rdd.map { case (fileName, content) => fileName.replace("file:", "") }
def loadImageFromFile(path: String): BufferedImage = {
import javax.imageio.ImageIO
import java.io.File
ImageIO.read(new File(path))
}
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
val g = bwImage.getGraphics()
g.drawImage(image, 0, 0, width, height, null)
g.dispose()
bwImage
}
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
val width = image.getWidth
val height = image.getHeight
val pixels = Array.ofDim[Double](width * height)
image.getData.getPixels(0, 0, width, height, pixels)
}
def extractPixels(path: String, width: Int, height: Int): Array[Double] = {
val raw = loadImageFromFile(path)
val processed = processImage(raw, width, height)
getPixelsFromImage(processed)
}
val pixels = files.map(f => extractPixels(f, 50, 50))
val vectors = pixels.map(p => Vectors.dense(p))
vectors.setName("image-vectors")
vectors.cache
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
val scaledVectors = vectors.map(v => scaler.transform(v))
val matrix = new RowMatrix(scaledVectors)
val K = 10
val pc = matrix.computePrincipalComponents(K)
val projected = matrix.multiply(pc)
println(projected.numRows, projected.numCols)
println(projected.rows.take().mkString("\n"))
// Attach a 1-based index to a sample of projected rows (take(10) is an assumed placeholder; the original literal was lost).
var i = 0
val projecteds = projected.rows.take(10).map { x =>
  i += 1
  (i, x)
}
val ps = projecteds.map { case (i, x) => (i, x.toArray) }
// Use the first projected face as the query item for the similarity ranking.
val itemFactor = projected.rows.first.toArray
val itemVector = new DoubleMatrix(itemFactor)
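// Cosine similarity: the dot product normalized by the L2 norms; 1.0 means the vectors point in the same direction.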
def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
val sims = ps.map{ case (id, factor) =>
val factorVector = new DoubleMatrix(factor)
val sim = cosineSimilarity(factorVector, itemVector)
(id, sim)
}
println(sims.take().mkString("\n"))
val sortedSims = sims.sortBy(-_._2)
println(sortedSims.take(10).mkString("\n"))
// zipWithIndex gives each file a stable id; incrementing a driver-side var inside an RDD map is unreliable.
val fs = files.zipWithIndex.map { case (add, idx) => (idx + 1).toString + "|" + add }
val titles = fs.map(line => line.split("\\|").take(2))
  .map(array => (array(0).toInt, array(1)))
  .collectAsMap()
}
}
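A second iteration adds an id-to-path lookup table so each similarity score can be printed together with its image file: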
import org.apache.spark.{SparkConf, SparkContext}
import java.awt.image.BufferedImage
import javax.imageio.ImageIO
import java.io.File
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
import breeze.linalg.DenseMatrix
import breeze.linalg.csvwrite
import org.jblas.DoubleMatrix
/**
* Created by ysp on 2016-12-30.
*/
object LFW {
def main(args: Array[String]) {
val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("lfw"))
val path = "/usr/data/lfw-a/*"
val rdd = sc.wholeTextFiles(path)
val files = rdd.map { case (fileName, content) => fileName.replace("file:", "") }
def loadImageFromFile(path: String): BufferedImage = {
import javax.imageio.ImageIO
import java.io.File
ImageIO.read(new File(path))
}
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
val g = bwImage.getGraphics()
g.drawImage(image, 0, 0, width, height, null)
g.dispose()
bwImage
}
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
val width = image.getWidth
val height = image.getHeight
val pixels = Array.ofDim[Double](width * height)
image.getData.getPixels(0, 0, width, height, pixels)
}
def extractPixels(path: String, width: Int, height: Int): Array[Double] = {
val raw = loadImageFromFile(path)
val processed = processImage(raw, width, height)
getPixelsFromImage(processed)
}
val pixels = files.map(f => extractPixels(f, 50, 50))
val vectors = pixels.map(p => Vectors.dense(p))
vectors.setName("image-vectors")
vectors.cache
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
val scaledVectors = vectors.map(v => scaler.transform(v))
val matrix = new RowMatrix(scaledVectors)
val K = 10
val pc = matrix.computePrincipalComponents(K)
val projected = matrix.multiply(pc)
println(projected.numRows, projected.numCols)
println(projected.rows.take(5).mkString("\n"))
// Attach a 1-based index to a sample of projected rows (take(10) is an assumed placeholder; the original literal was lost).
var i = 0
val projecteds = projected.rows.take(10).map { x =>
  i += 1
  (i, x)
}
val ps = projecteds.map { case (i, x) => (i, x.toArray) }
// Build the id -> path lookup; zipWithIndex avoids mutating a driver-side var inside an RDD map.
val fs = files.zipWithIndex.map { case (add, idx) => (idx + 1).toString + "|" + add }
val t = fs.map(line => line.split("\\|"))
  .map(array => (array(0), array(1)))
  .collectAsMap()
// Use the first projected face as the query item.
val itemFactor = projected.rows.first.toArray
println(t(1.toString)) // spot-check the lookup (key 1 is an assumed placeholder)
val itemVector = new DoubleMatrix(itemFactor)
def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
val sims = ps.map{ case (id, factor) =>
val factorVector = new DoubleMatrix(factor)
val sim = cosineSimilarity(factorVector, itemVector)
val add = t(id.toString)
(id, sim ,add)
}
println(sims.take().mkString("\n"))
val sortedSims = sims.sortBy(-_._2)
println(sortedSims.take().mkString("\n"))
}
}
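A final variant reads the image list from an item file, one id|path pair per line, instead of globbing the directory. First, building the lookup on its own: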
val item = sc.textFile("/usr/data/Art.item")
val t = item.map(line => line.split()).map(array=>
(array().toInt,array())
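And the complete program built on that lookup: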
import org.apache.spark.{SparkConf, SparkContext}
import java.awt.image.BufferedImage
import javax.imageio.ImageIO
import java.io.File
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
import breeze.linalg.DenseMatrix
import breeze.linalg.csvwrite
import org.jblas.DoubleMatrix
val item = sc.textFile("/home/ysp/Art.item")
val t = item.map(line => line.split()).map(array=>
(array().toInt,array())).collectAsMap()
val fs = item.map(line => line.split())
val file = fs.map(array=>array())
println(file.count)
file.first()
t(1) // spot-check the lookup (key 1 is an assumed placeholder)
def loadImageFromFile(path: String): BufferedImage = {
import javax.imageio.ImageIO
import java.io.File
ImageIO.read(new File(path))
}
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
val g = bwImage.getGraphics()
g.drawImage(image, 0, 0, width, height, null)
g.dispose()
bwImage
}
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
val width = image.getWidth
val height = image.getHeight
val pixels = Array.ofDim[Double](width * height)
image.getData.getPixels(0, 0, width, height, pixels)
}
def extractPixels(path: String, width: Int, height: Int): Array[Double] = {
val raw = loadImageFromFile(path)
val processed = processImage(raw, width, height)
getPixelsFromImage(processed)
}
file.first()
val pixels = file.map(f => extractPixels(f, 50, 50))
val vectors = pixels.map(p => Vectors.dense(p))
vectors.setName("image-vectors")
vectors.cache
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
val scaledVectors = vectors.map(v => scaler.transform(v))
val matrix = new RowMatrix(scaledVectors)
val K = 10
val pc = matrix.computePrincipalComponents(K)
val projected = matrix.multiply(pc)
println(projected.numRows, projected.numCols)
println(projected.rows.take().mkString("\n"))
// Attach a 1-based index and its file path to a sample of projected rows (take(10) is an assumed placeholder).
var i = 0
val ps = projected.rows.take(10).map { x =>
  i = i + 1
  val y = x.toArray
  val address = t(i)
  (i, y, address)
}
// Use the first projected face as the query item.
val itemFactor = projected.rows.first.toArray
val itemVector = new DoubleMatrix(itemFactor)
def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
val sims = ps.map{ case (id, factor, add) =>
val factorVector = new DoubleMatrix(factor)
val sim = cosineSimilarity(factorVector, itemVector)
(id, sim, add)
}
println(sims.take().mkString("\n"))
val sortedSims = sims.sortBy(-_._2)
println()
println("Recommendations:")
// slice(1, 11) skips index 0, the query item itself (the exact range is an assumed placeholder for the lost literals).
println(sortedSims.slice(1, 11).mkString("\n"))
Visual Studio 2013 Preview已经发布.大家可以下载试用了哦: 选项加载明显比之前版本要快很多.