Spark MLlib - LFW
// Load every file under the LFW directory as (path, content) pairs; only the
// path is used below.
val path = "/usr/data/lfw-a/*"
val rdd = sc.wholeTextFiles(path)
val first = rdd.first
println(first)

// Drop the "file:" scheme prefix from each path.
val files = rdd.map { case (fileName, _) => fileName.replace("file:", "") }
println(files.first)
println(files.count)
%pyspark import matplotlib.pyplot as plt path = "/usr/data/lfw-a/Aaron_Eckhart/Aaron_Eckhart_0001.jpg" ae = plt.imread(path) plt.imshow(ae) plt.show()
import java.awt.image.BufferedImage
/**
 * Reads the image stored at `path`.
 *
 * @param path location of the image file
 * @return the result of `ImageIO.read` for that file
 */
def loadImageFromFile(path: String): BufferedImage = {
  val source = new java.io.File(path)
  javax.imageio.ImageIO.read(source)
}
// Load a single sample image to experiment with.
val aePath = "/usr/data/lfw-a/Aaron_Eckhart/Aaron_Eckhart_0001.jpg"
val aeImage = loadImageFromFile(aePath)
import java.awt.image
/**
 * Produces a grayscale copy of `image` scaled to `width` x `height`.
 *
 * @param image  source image
 * @param width  width of the result in pixels
 * @param height height of the result in pixels
 * @return a newly allocated `TYPE_BYTE_GRAY` image
 */
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
  val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
  val g = bwImage.getGraphics()
  // Draw at the origin (0, 0) so the scaled source covers the whole target;
  // the x/y arguments were missing from the call.
  try g.drawImage(image, 0, 0, width, height, null)
  finally g.dispose() // release the graphics context even if drawing fails
  bwImage
}
// Convert the sample image to grayscale and save it for inspection.
// NOTE(review): the size literals were missing from this call; 100 x 100 is
// an assumed value — confirm.
val grayImage = processImage(aeImage, 100, 100)
import javax.imageio.ImageIO
import java.io.File
ImageIO.write(grayImage, "jpg", new File("/tmp/aeGray.jpg"))
%pyspark import matplotlib.pyplot as plt tmpPath = "/tmp/aeGray.jpg" aeGary = plt.imread(tmpPath) plt.imshow(aeGary, cmap=plt.cm.gray) plt.show()
/**
 * Flattens the pixel samples of `image` into a single array.
 *
 * @param image image to read; the buffer is sized `width * height`, so the
 *              image is presumably expected to have a single band — confirm
 * @return the samples returned by `getPixels` for the full image area
 */
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
  val width = image.getWidth
  val height = image.getHeight
  val pixels = Array.ofDim[Double](width * height)
  // Read the full width x height area starting at (0, 0); the origin
  // arguments were missing from the call.
  image.getData.getPixels(0, 0, width, height, pixels)
}
/**
 * Loads the image at `path`, runs it through `processImage` with the given
 * `width` and `height`, and returns the pixel array of the result.
 */
def extractPixels(path: String, width: Int, height: Int): Array[Double] =
  getPixelsFromImage(processImage(loadImageFromFile(path), width, height))
// Extract a pixel array for every image and print a small preview.
// NOTE(review): the numeric literals were missing here; a 50 x 50 image size
// and a 10 x 10 preview are assumed values — confirm.
val pixels = files.map(f => extractPixels(f, 50, 50))
println(pixels.take(10).map(_.take(10).mkString("", ",", ", ...")).mkString("\n"))

import org.apache.spark.mllib.linalg.Vectors
// Wrap each pixel array in an MLlib dense vector, then name and cache the RDD.
val vectors = pixels.map(p => Vectors.dense(p))
vectors.setName("image-vectors")
vectors.cache
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler

// Fit a scaler configured with withMean = true and withStd = false, and apply
// it to every vector.
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
val scaledVectors = vectors.map(v => scaler.transform(v))

val matrix = new RowMatrix(scaledVectors)
// NOTE(review): the value of K was missing; 10 is an assumed value — confirm.
val K = 10
val pc = matrix.computePrincipalComponents(K)
val rows = pc.numRows
val cols = pc.numCols
println(rows, cols)

import breeze.linalg.DenseMatrix
// Copy the principal components into a Breeze matrix and write them as CSV.
val pcBreeze = new DenseMatrix(rows, cols, pc.toArray)
import breeze.linalg.csvwrite
csvwrite(new File("/tmp/pc.csv"), pcBreeze)
%pyspark
import numpy as np
import matplotlib.pyplot as plt
# Load the comma-separated file /tmp/pc.csv into a NumPy array and print its shape.
pcs = np.loadtxt("/tmp/pc.csv", delimiter=",")
print(pcs.shape)
%pyspark
import numpy as np
import matplotlib.pyplot as plt
def plot_gallery(images, h, w, n_row=, n_col=):
plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
plt.subplots_adjust(bottom=, left=., right=., top=.,hspace=.)
for i in range(n_row * n_col):
plt.subplot(n_row, n_col, i + )
plt.imshow(images[:, i].reshape((h, w)), cmap=plt.cm.gray)
plt.title(), size=)
plt.xticks(())
plt.yticks(())
%pyspark plot_gallery(pcs, , ) plt.show()
// Multiply the data matrix by the principal components.
val projected = matrix.multiply(pc)
println(projected.numRows, projected.numCols)
// NOTE(review): the preview size and the SVD rank literals were missing;
// 5 and 10 are assumed values — confirm.
println(projected.rows.take(5).mkString("\n"))

val svd = matrix.computeSVD(10, computeU = true)
println(s"U dimension: (${svd.U.numRows}, ${svd.U.numCols})")
println(s"S dimension: (${svd.s.size}, )")
println(s"V dimension: (${svd.V.numRows}, ${svd.V.numCols})")
/**
 * Checks whether two arrays agree element-wise once signs are ignored.
 *
 * Only magnitudes are compared, so `v` and `-v` count as equal. The arrays
 * are zipped, so when their lengths differ only the overlapping prefix is
 * compared.
 *
 * @param array1    first array
 * @param array2    second array
 * @param tolerance largest allowed difference between two magnitudes
 *                  (NOTE(review): the default literal was truncated to `1e-`;
 *                  1e-6 is an assumed value — confirm)
 * @return `true` when no compared pair differs by more than `tolerance`
 */
def approxEqual(array1: Array[Double], array2: Array[Double],
    tolerance: Double = 1e-6): Boolean =
  // Honour the `tolerance` parameter; the threshold used to be hard-coded in
  // the body, which made the parameter dead.
  array1.zip(array2).forall { case (v1, v2) =>
    !(math.abs(math.abs(v1) - math.abs(v2)) > tolerance)
  }
// Multiply each row of U element-wise by the singular values.
val breezeS = breeze.linalg.DenseVector(svd.s.toArray)
val projectedSVD = svd.U.rows.map { v =>
  val breezeV = breeze.linalg.DenseVector(v.toArray)
  val multV = breezeV :* breezeS
  Vectors.dense(multV.data)
}
// Count the rows on which both projections agree. The filter has to keep only
// the `true` results; `filter(b => true)` kept every row, so the count was
// always the total number of rows.
projected.rows.zip(projectedSVD).map { case (v1, v2) =>
  approxEqual(v1.toArray, v2.toArray)
}.filter(b => b).count
// Print the singular values obtained for a range of ranks.
// NOTE(review): the range bounds and the rank below were missing; (1 to 5)
// and 300 are assumed values — confirm.
val sValues = (1 to 5).map { i => matrix.computeSVD(i, computeU = false).s }
sValues.foreach(println)

val svd300 = matrix.computeSVD(300, computeU = false)
// Store the singular values as a single-row matrix. The column count is taken
// from the vector itself so it always matches the data length.
val sMatrix = new DenseMatrix(1, svd300.s.size, svd300.s.toArray)
csvwrite(new File("/tmp/s.csv"), sMatrix)
%pyspark
import numpy as np
import matplotlib.pyplot as plt
# Load the values stored in /tmp/s.csv, print the array shape and plot them.
s = np.loadtxt("/tmp/s.csv", delimiter=",")
print(s.shape)
plt.plot(s)
plt.show()
%pyspark
import numpy as np
import matplotlib.pyplot as plt
# Plot the cumulative sum of the values in /tmp/s.csv on a log-scaled y axis.
s = np.loadtxt("/tmp/s.csv", delimiter=",")
plt.plot(np.cumsum(s))
plt.yscale('log')
plt.show()
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import java.awt.image.BufferedImage
import java.awt.image
import javax.imageio.ImageIO
import java.io.File
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import breeze.linalg.DenseMatrix
import org.jblas.DoubleMatrix
import breeze.linalg.csvwrite
import org.apache.spark.mllib.recommendation.Rating
/**
* Created by ysp on 16-10-30.
*/
/**
 * Stand-alone job: loads the images, projects them onto their principal
 * components and scores a sample of them by cosine similarity against a
 * reference row of the projection.
 *
 * NOTE(review): most numeric literals were missing from this listing. The
 * values collected at the top of `main` are assumptions — confirm each one.
 */
object LFW {
  def main(args: Array[String]): Unit = {
    // NOTE(review): assumed values (original literals missing).
    val imageSize = 50        // width and height passed to extractPixels
    val K = 10                // number of principal components
    val previewRows = 5       // rows printed for inspection
    val candidateCount = 100  // projected rows that get a similarity score

    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("lfw"))
    val path = "/usr/data/lfw-a/*"
    val rdd = sc.wholeTextFiles(path)
    val files = rdd.map { case (fileName, _) => fileName.replace("file:", "") }

    /** Reads the image stored at `path`. */
    def loadImageFromFile(path: String): BufferedImage =
      ImageIO.read(new File(path))

    /** Produces a `TYPE_BYTE_GRAY` copy of `image` scaled to `width` x `height`. */
    def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
      val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
      val g = bwImage.getGraphics()
      // Draw at the origin (0, 0); the x/y arguments were missing.
      try g.drawImage(image, 0, 0, width, height, null)
      finally g.dispose()
      bwImage
    }

    /** Flattens the pixel samples of `image` into one array. */
    def getPixelsFromImage(image: BufferedImage): Array[Double] = {
      val width = image.getWidth
      val height = image.getHeight
      val pixels = Array.ofDim[Double](width * height)
      // Read from the top-left corner (0, 0); the origin arguments were missing.
      image.getData.getPixels(0, 0, width, height, pixels)
    }

    /** Loads, converts and flattens the image at `path`. */
    def extractPixels(path: String, width: Int, height: Int): Array[Double] =
      getPixelsFromImage(processImage(loadImageFromFile(path), width, height))

    val pixels = files.map(f => extractPixels(f, imageSize, imageSize))
    val vectors = pixels.map(p => Vectors.dense(p))
    vectors.setName("image-vectors")
    vectors.cache
    val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
    val scaledVectors = vectors.map(v => scaler.transform(v))
    val matrix = new RowMatrix(scaledVectors)
    val pc = matrix.computePrincipalComponents(K)
    val projected = matrix.multiply(pc)
    println(projected.numRows, projected.numCols)
    println(projected.rows.take(previewRows).mkString("\n"))

    // Number the sampled rows 1, 2, 3, ... zipWithIndex replaces a counter
    // `i` that was incremented before its `var i = 0` definition.
    val ps = projected.rows.take(candidateCount).zipWithIndex.map { case (x, idx) =>
      (idx + 1, x.toArray)
    }

    // Reference vector: the first projected row. Exactly one row is taken so
    // that its length matches each candidate's in the dot product below.
    val itemFactor = projected.rows.take(1).map(_.toArray)
    val itemVector = new DoubleMatrix(itemFactor)

    /** Cosine of the angle between `vec1` and `vec2`. */
    def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double =
      vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())

    val sims = ps.map { case (id, factor) =>
      (id, cosineSimilarity(new DoubleMatrix(factor), itemVector))
    }
    println(sims.take(previewRows).mkString("\n"))
    val sortedSims = sims.sortBy(-_._2)
    println(sortedSims.take(10).mkString("\n"))

    // Map the 1-based position of each file to its path. The original bumped
    // a driver-side `var` inside the RDD closure; each task works on its own
    // copy of such a variable, so the numbering is not reliable across
    // partitions. zipWithIndex assigns a global index instead.
    val titles = files.zipWithIndex.map { case (add, idx) =>
      ((idx + 1).toInt, add)
    }.collectAsMap()
  }
}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import java.awt.image.BufferedImage
import java.awt.image
import javax.imageio.ImageIO
import java.io.File
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import breeze.linalg.DenseMatrix
import org.jblas.DoubleMatrix
import breeze.linalg.csvwrite
import org.apache.spark.mllib.recommendation.Rating
/**
* Created by ysp on 2016-12-30.
*/
/**
 * Stand-alone job: loads the images, projects them onto their principal
 * components, scores a sample of them by cosine similarity against a
 * reference row and prints each score together with the image path.
 *
 * NOTE(review): most numeric literals were missing from this listing. The
 * values collected at the top of `main` are assumptions — confirm each one.
 */
object LFW {
  def main(args: Array[String]): Unit = {
    // NOTE(review): assumed values (original literals missing).
    val imageSize = 50        // width and height passed to extractPixels
    val K = 10                // number of principal components
    val previewRows = 5       // rows printed for inspection
    val candidateCount = 100  // projected rows that get a similarity score
    val lookupId = 1          // id whose path is printed as a sanity check
    val topN = 10             // best matches printed at the end

    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("lfw"))
    val path = "/usr/data/lfw-a/*"
    val rdd = sc.wholeTextFiles(path)
    val files = rdd.map { case (fileName, _) => fileName.replace("file:", "") }

    /** Reads the image stored at `path`. */
    def loadImageFromFile(path: String): BufferedImage =
      ImageIO.read(new File(path))

    /** Produces a `TYPE_BYTE_GRAY` copy of `image` scaled to `width` x `height`. */
    def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
      val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
      val g = bwImage.getGraphics()
      // Draw at the origin (0, 0); the x/y arguments were missing.
      try g.drawImage(image, 0, 0, width, height, null)
      finally g.dispose()
      bwImage
    }

    /** Flattens the pixel samples of `image` into one array. */
    def getPixelsFromImage(image: BufferedImage): Array[Double] = {
      val width = image.getWidth
      val height = image.getHeight
      val pixels = Array.ofDim[Double](width * height)
      // Read from the top-left corner (0, 0); the origin arguments were missing.
      image.getData.getPixels(0, 0, width, height, pixels)
    }

    /** Loads, converts and flattens the image at `path`. */
    def extractPixels(path: String, width: Int, height: Int): Array[Double] =
      getPixelsFromImage(processImage(loadImageFromFile(path), width, height))

    val pixels = files.map(f => extractPixels(f, imageSize, imageSize))
    val vectors = pixels.map(p => Vectors.dense(p))
    vectors.setName("image-vectors")
    vectors.cache
    val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
    val scaledVectors = vectors.map(v => scaler.transform(v))
    val matrix = new RowMatrix(scaledVectors)
    val pc = matrix.computePrincipalComponents(K)
    val projected = matrix.multiply(pc)
    println(projected.numRows, projected.numCols)
    println(projected.rows.take(previewRows).mkString("\n"))

    // Number the sampled rows 1, 2, 3, ... zipWithIndex replaces the counter
    // `i`, which was never defined in this listing.
    val ps = projected.rows.take(candidateCount).zipWithIndex.map { case (x, idx) =>
      (idx + 1, x.toArray)
    }

    // Map the 1-based position of each file (as a string) to its path. This
    // replaces an undefined counter `j` mutated inside the RDD closure and
    // the "id|path" string round trip built on top of it.
    val t = files.zipWithIndex.map { case (add, idx) =>
      ((idx + 1).toString, add)
    }.collectAsMap()

    // Reference vector: the first projected row. Exactly one row is taken so
    // that its length matches each candidate's in the dot product below.
    val itemFactor = projected.rows.take(1).map(_.toArray)
    println(t(lookupId.toString))
    val itemVector = new DoubleMatrix(itemFactor)

    /** Cosine of the angle between `vec1` and `vec2`. */
    def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double =
      vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())

    val sims = ps.map { case (id, factor) =>
      val sim = cosineSimilarity(new DoubleMatrix(factor), itemVector)
      (id, sim, t(id.toString))
    }
    println(sims.take(previewRows).mkString("\n"))
    val sortedSims = sims.sortBy(-_._2)
    println(sortedSims.take(topN).mkString("\n"))
  }
}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import java.awt.image.BufferedImage
import java.awt.image
import javax.imageio.ImageIO
import java.io.File
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import breeze.linalg.DenseMatrix
import org.jblas.DoubleMatrix
import breeze.linalg.csvwrite
import org.apache.spark.mllib.recommendation.Rating
/**
* Created by ysp on 2016-12-30.
*/
/**
 * Stand-alone job: loads the images, projects them onto their principal
 * components, scores a sample of them by cosine similarity against a
 * reference row and prints a slice of the ranked result with image paths.
 *
 * NOTE(review): most numeric literals were missing from this listing. The
 * values collected at the top of `main` are assumptions — confirm each one.
 */
object LFW {
  def main(args: Array[String]): Unit = {
    // NOTE(review): assumed values (original literals missing).
    val imageSize = 50        // width and height passed to extractPixels
    val K = 10                // number of principal components
    val previewRows = 5       // rows printed for inspection
    val candidateCount = 100  // projected rows that get a similarity score
    val sliceFrom = 1         // first rank printed; 1 would skip the top hit
    val sliceUntil = 11       // exclusive end of the printed ranks

    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("lfw"))
    val path = "/usr/data/lfw-a/*"
    val rdd = sc.wholeTextFiles(path)
    val files = rdd.map { case (fileName, _) => fileName.replace("file:", "") }

    /** Reads the image stored at `path`. */
    def loadImageFromFile(path: String): BufferedImage =
      ImageIO.read(new File(path))

    /** Produces a `TYPE_BYTE_GRAY` copy of `image` scaled to `width` x `height`. */
    def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
      val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
      val g = bwImage.getGraphics()
      // Draw at the origin (0, 0); the x/y arguments were missing.
      try g.drawImage(image, 0, 0, width, height, null)
      finally g.dispose()
      bwImage
    }

    /** Flattens the pixel samples of `image` into one array. */
    def getPixelsFromImage(image: BufferedImage): Array[Double] = {
      val width = image.getWidth
      val height = image.getHeight
      val pixels = Array.ofDim[Double](width * height)
      // Read from the top-left corner (0, 0); the origin arguments were missing.
      image.getData.getPixels(0, 0, width, height, pixels)
    }

    /** Loads, converts and flattens the image at `path`. */
    def extractPixels(path: String, width: Int, height: Int): Array[Double] =
      getPixelsFromImage(processImage(loadImageFromFile(path), width, height))

    val pixels = files.map(f => extractPixels(f, imageSize, imageSize))
    val vectors = pixels.map(p => Vectors.dense(p))
    vectors.setName("image-vectors")
    vectors.cache
    val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
    val scaledVectors = vectors.map(v => scaler.transform(v))
    val matrix = new RowMatrix(scaledVectors)
    val pc = matrix.computePrincipalComponents(K)
    val projected = matrix.multiply(pc)
    println(projected.numRows, projected.numCols)
    println(projected.rows.take(previewRows).mkString("\n"))

    // Number the sampled rows 1, 2, 3, ... zipWithIndex replaces the counter
    // `i`, which was never defined in this listing.
    val ps = projected.rows.take(candidateCount).zipWithIndex.map { case (x, idx) =>
      (idx + 1, x.toArray)
    }

    // Map the 1-based position of each file (as a string) to its path. This
    // replaces an undefined counter `j` mutated inside the RDD closure and
    // the "id|path" string round trip built on top of it.
    val t = files.zipWithIndex.map { case (add, idx) =>
      ((idx + 1).toString, add)
    }.collectAsMap()

    // Reference vector: the first projected row. Exactly one row is taken so
    // that its length matches each candidate's in the dot product below.
    val itemFactor = projected.rows.take(1).map(_.toArray)
    val itemVector = new DoubleMatrix(itemFactor)

    /** Cosine of the angle between `vec1` and `vec2`. */
    def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double =
      vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())

    val sims = ps.map { case (id, factor) =>
      val sim = cosineSimilarity(new DoubleMatrix(factor), itemVector)
      (id, sim, t(id.toString))
    }
    println(sims.take(previewRows).mkString("\n"))
    val sortedSims = sims.sortBy(-_._2)
    println(sortedSims.slice(sliceFrom, sliceUntil).mkString("\n"))
  }
}
// NOTE(review): this snippet was cut off mid-expression and its split and
// index arguments were missing. The parentheses are balanced here and a
// "|"-separated `id|path` line layout is assumed — confirm against the file.
val item = sc.textFile("/usr/data/Art.item")
val t = item.map(line => line.split("\\|")).map(array =>
  (array(0).toInt, array(1)))
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.recommendation.{ALS, Rating}
import java.awt.image.BufferedImage
import java.awt.image
import javax.imageio.ImageIO
import java.io.File
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import breeze.linalg.DenseMatrix
import org.jblas.DoubleMatrix
import breeze.linalg.csvwrite
import org.apache.spark.mllib.recommendation.Rating
// NOTE(review): the split delimiter, column indices and lookup key were
// missing from this cell. A "|"-separated `id|path` line layout and key 1
// are assumed values — confirm against the file.
val item = sc.textFile("/home/ysp/Art.item")
// id -> path lookup table, collected to the driver.
val t = item.map(line => line.split("\\|")).map(array =>
  (array(0).toInt, array(1))).collectAsMap()
val fs = item.map(line => line.split("\\|"))
// The path column only.
val file = fs.map(array => array(1))
println(file.count)
file.first()
t(1)
/**
 * Reads the image stored at `path`.
 *
 * @param path location of the image file
 * @return the result of `ImageIO.read` for that file
 */
def loadImageFromFile(path: String): BufferedImage = {
  val source = new java.io.File(path)
  javax.imageio.ImageIO.read(source)
}
/**
 * Produces a grayscale copy of `image` scaled to `width` x `height`.
 *
 * @param image  source image
 * @param width  width of the result in pixels
 * @param height height of the result in pixels
 * @return a newly allocated `TYPE_BYTE_GRAY` image
 */
def processImage(image: BufferedImage, width: Int, height: Int): BufferedImage = {
  val bwImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_GRAY)
  val g = bwImage.getGraphics()
  // Draw at the origin (0, 0) so the scaled source covers the whole target;
  // the x/y arguments were missing from the call.
  try g.drawImage(image, 0, 0, width, height, null)
  finally g.dispose() // release the graphics context even if drawing fails
  bwImage
}
/**
 * Flattens the pixel samples of `image` into a single array.
 *
 * @param image image to read; the buffer is sized `width * height`, so the
 *              image is presumably expected to have a single band — confirm
 * @return the samples returned by `getPixels` for the full image area
 */
def getPixelsFromImage(image: BufferedImage): Array[Double] = {
  val width = image.getWidth
  val height = image.getHeight
  val pixels = Array.ofDim[Double](width * height)
  // Read the full width x height area starting at (0, 0); the origin
  // arguments were missing from the call.
  image.getData.getPixels(0, 0, width, height, pixels)
}
/**
 * Loads the image at `path`, runs it through `processImage` with the given
 * `width` and `height`, and returns the pixel array of the result.
 */
def extractPixels(path: String, width: Int, height: Int): Array[Double] =
  getPixelsFromImage(processImage(loadImageFromFile(path), width, height))
file.first()
// NOTE(review): the numeric literals in this cell were missing. The image
// size (50 x 50), K (10), preview size (5), candidate count (100) and the
// printed rank slice (1 until 11) are assumed values — confirm each one.
val pixels = file.map(f => extractPixels(f, 50, 50))
val vectors = pixels.map(p => Vectors.dense(p))
vectors.setName("image-vectors")
vectors.cache
val scaler = new StandardScaler(withMean = true, withStd = false).fit(vectors)
val scaledVectors = vectors.map(v => scaler.transform(v))
val matrix = new RowMatrix(scaledVectors)
val K = 10
val pc = matrix.computePrincipalComponents(K)
val projected = matrix.multiply(pc)
println(projected.numRows, projected.numCols)
println(projected.rows.take(5).mkString("\n"))

// Number the sampled rows 1, 2, 3, ... and attach the path stored under that
// id in `t`. zipWithIndex replaces the counter `i`, which was never defined.
val ps = projected.rows.take(100).zipWithIndex.map { case (x, idx) =>
  val id = idx + 1
  (id, x.toArray, t(id))
}

// Reference vector: the first projected row. Exactly one row is taken so that
// its length matches each candidate's in the dot product below.
val itemFactor = projected.rows.take(1).map(_.toArray)
val itemVector = new DoubleMatrix(itemFactor)

/** Cosine of the angle between `vec1` and `vec2`. */
def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double = {
  vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}

val sims = ps.map { case (id, factor, add) =>
  val factorVector = new DoubleMatrix(factor)
  val sim = cosineSimilarity(factorVector, itemVector)
  (id, sim, add)
}
println(sims.take(5).mkString("\n"))
val sortedSims = sims.sortBy(-_._2)
println()
println("推荐:")
println(sortedSims.slice(1, 11).mkString("\n"))
Spark MLlib - LFW的更多相关文章
- 《Spark MLlib机器学习实践》内容简介、目录
http://product.dangdang.com/23829918.html Spark作为新兴的.应用范围最为广泛的大数据处理开源框架引起了广泛的关注,它吸引了大量程序设计和开发人员进行相 ...
- Spark MLlib 之 Basic Statistics
Spark MLlib提供了一些基本的统计学的算法,下面主要说明一下: 1.Summary statistics 对于RDD[Vector]类型,Spark MLlib提供了colStats的统计方法 ...
- Spark MLlib Data Type
MLlib 支持存放在单机上的本地向量和矩阵,也支持通过多个RDD实现的分布式矩阵.因此MLlib的数据类型主要分为两大类:一个是本地单机向量:另一个是分布式矩阵.下面分别介绍一下这两大类都有哪些类型 ...
- Spark MLlib - Decision Tree源码分析
http://spark.apache.org/docs/latest/mllib-decision-tree.html 以决策树作为开始,因为简单,而且也比较容易用到,当前的boosting或ran ...
- Spark入门实战系列--8.Spark MLlib(上)--机器学习及SparkMLlib简介
[注]该系列文章以及使用到安装包/测试数据 可以在<倾情大奉送--Spark入门实战系列>获取 .机器学习概念 1.1 机器学习的定义 在维基百科上对机器学习提出以下几种定义: l“机器学 ...
- Spark入门实战系列--8.Spark MLlib(下)--机器学习库SparkMLlib实战
[注]该系列文章以及使用到安装包/测试数据 可以在<倾情大奉送--Spark入门实战系列>获取 .MLlib实例 1.1 聚类实例 1.1.1 算法说明 聚类(Cluster analys ...
- spark mllib配置pom.xml错误 Multiple markers at this line Could not transfer artifact net.sf.opencsv:opencsv:jar:2.3 from/to central (https://repo.maven.apache.org/maven2): repo.maven.apache.org
刚刚spark mllib,在maven repository网站http://mvnrepository.com/中查询mllib后得到相关库的最新dependence为: <dependen ...
- Apache Spark源码走读之23 -- Spark MLLib中拟牛顿法L-BFGS的源码实现
欢迎转载,转载请注明出处,徽沪一郎. 概要 本文就拟牛顿法L-BFGS的由来做一个简要的回顾,然后就其在spark mllib中的实现进行源码走读. 拟牛顿法 数学原理 代码实现 L-BFGS算法中使 ...
- spark Mllib基本功系列编程入门之 SVM实现分类
话不多说.直接上代码咯.欢迎交流. /** * Created by whuscalaman on 1/7/16. */import org.apache.spark.{SparkConf, Spar ...
随机推荐
- 基于WebGL 的3D呈现A* Search Algorithm
http://www.hightopo.com/demo/astar/astar.html 最近搞个游戏遇到最短路径的常规游戏问题,一时起兴基于HT for Web写了个A*算法的WebGL 3D呈现 ...
- asp.net core 1.1 升级后,操作mysql出错的解决办法。
遇到问题 core的版本从1.0升级到1.1,操作mysql数据库,查询数据时遇到MissingMethodException问题,更新.插入操作没有问题. 如果你也遇到这个问题,请参照以下步骤进行升 ...
- WinForm拖动没有标题栏窗体的方法
建立窗体的名称修改为:Form_HoverTree 文后附有源码下载. 主要代码: Point _HoverTreePosition; public Form_HoverTree() { Initia ...
- java访问修饰符
了解面向对象思想的同学们,都知道"封装"这一基本特征,如何正确运用访问修饰符,恰恰能体现出封装的好坏. java访问修饰符有四个: 1)public:访问权限最高,其修饰的类.类变 ...
- 新手入门JUnit单元测试
首先将JUnit插件安装到Eclipse或myeclipse里面,编写完一个模块或者实体类的时候,直接右击,new一个JUnit项目,选择你想测试的实体类(模块),然后会自动生成一个类,这个类,我们将 ...
- 每次新建项目出现appcompat_v7 解决方法
ADT升级版本后每次新建项目出现appcompat_v7 , 解决方案如下 问题生成:
- java文档注释--javadoc的用法
1.前言 Java中有三种注释方式.前两种分别是 // 和 /* */,主要用于代码的注释,以此来方便代码的可读性.第三种被称作说明注释或文档注释,它以 /** 开始,以 */结束,文档注释允许你在程 ...
- Linux应用程序基础
文件位置: 系统命令:/bin和sbin目录,或shell内部指令: 应用程序:/usr/bin和/usr/sbin目录. /usr/bin:普通执行程序文件: ...
- 利用H5和ChromiumWebBrowser构建应用
chromium是google chrome浏览器所采用的内核,最开始由苹果的webkit发展而出,由于webkit在发展上存在分歧,而google希望在开发上有更大的自由度,2013年google决 ...
- Javascript高性能编程-提高javascript加载速度
1.将所有<script>标签放在尽可能接近<body>标签底部的位置,以保证页面在脚本运行之前完成解析尽量减少对整个页面下载的影响 2.限制页面的<sc ...