/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.graphx

import scala.language.implicitConversions
import scala.reflect.ClassTag

import org.apache.spark.graphx.impl._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel /**
* The Graph abstractly represents a graph with arbitrary objects
* associated with vertices and edges. The graph provides basic
* operations to access and manipulate the data associated with
* vertices and edges as well as the underlying structure. Like Spark
* RDDs, the graph is a functional data-structure in which mutating
* operations return new graphs.
*
* @note [[GraphOps]] contains additional convenience operations and graph algorithms.
*
* @tparam VD the vertex attribute type
* @tparam ED the edge attribute type
*/
abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializable {

  /**
   * An RDD containing the vertices and their associated attributes.
   *
   * @note vertex ids are unique.
   * @return an RDD containing the vertices in this graph
   */
  @transient val vertices: VertexRDD[VD]

  /**
   * An RDD containing the edges and their associated attributes. The entries in the RDD contain
   * just the source id and target id along with the edge data.
   *
   * @return an RDD containing the edges in this graph
   *
   * @see [[Edge]] for the edge type.
   * @see [[triplets]] to get an RDD which contains all the edges
   * along with their vertex data.
   */
  @transient val edges: EdgeRDD[ED, VD]

  /**
   * An RDD containing the edge triplets, which are edges along with the vertex data associated
   * with the adjacent vertices. The caller should use [[edges]] if the vertex data are not
   * needed, i.e. if only the edge data and adjacent vertex ids are needed.
   *
   * @return an RDD containing edge triplets
   *
   * @example This operation might be used to evaluate a graph
   * coloring where we would like to check that both vertices are a
   * different color.
   * {{{
   * type Color = Int
   * val graph: Graph[Color, Int] = GraphLoader.edgeListFile("hdfs://file.tsv")
   * val numInvalid = graph.triplets.map(e => if (e.src.data == e.dst.data) 1 else 0).sum
   * }}}
   */
  @transient val triplets: RDD[EdgeTriplet[VD, ED]]

  /**
   * Caches the vertices and edges associated with this graph at the specified storage level,
   * ignoring any target storage levels previously set.
   *
   * @param newLevel the level at which to cache the graph.
   *
   * @return A reference to this graph for convenience.
   */
  def persist(newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED]

  /**
   * Caches the vertices and edges associated with this graph at the previously-specified target
   * storage levels, which default to `MEMORY_ONLY`. This is used to pin a graph in memory enabling
   * multiple queries to reuse the same construction process.
   */
  def cache(): Graph[VD, ED]

  /**
   * Uncaches only the vertices of this graph, leaving the edges alone. This is useful in iterative
   * algorithms that modify the vertex attributes but reuse the edges. This method can be used to
   * uncache the vertex attributes of previous iterations once they are no longer needed, improving
   * GC performance.
   */
  def unpersistVertices(blocking: Boolean = true): Graph[VD, ED]

  /**
   * Repartitions the edges in the graph according to `partitionStrategy`.
   *
   * @param partitionStrategy the partitioning strategy to use when partitioning the edges
   * in the graph.
   */
  def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED]

  /**
   * Repartitions the edges in the graph according to `partitionStrategy`.
   *
   * @param partitionStrategy the partitioning strategy to use when partitioning the edges
   * in the graph.
   * @param numPartitions the number of edge partitions in the new graph.
   */
  def partitionBy(partitionStrategy: PartitionStrategy, numPartitions: Int): Graph[VD, ED]

  /**
   * Transforms each vertex attribute in the graph using the map function.
   *
   * @note The new graph has the same structure. As a consequence the underlying index structures
   * can be reused.
   *
   * @param map the function from a vertex object to a new vertex value
   *
   * @tparam VD2 the new vertex data type
   *
   * @example We might use this operation to change the vertex values
   * from one type to another to initialize an algorithm.
   * {{{
   * val rawGraph: Graph[(), ()] = Graph.textFile("hdfs://file")
   * val root = 42
   * var bfsGraph = rawGraph.mapVertices[Int]((vid, data) => if (vid == root) 0 else Int.MaxValue)
   * }}}
   */
  def mapVertices[VD2: ClassTag](map: (VertexId, VD) => VD2)
      (implicit eq: VD =:= VD2 = null): Graph[VD2, ED]

  /**
   * Transforms each edge attribute in the graph using the map function. The map function is not
   * passed the vertex value for the vertices adjacent to the edge. If vertex values are desired,
   * use `mapTriplets`.
   *
   * @note This graph is not changed and the new graph has the
   * same structure. As a consequence the underlying index structures
   * can be reused.
   *
   * @param map the function from an edge object to a new edge value.
   *
   * @tparam ED2 the new edge data type
   *
   * @example This function might be used to initialize edge
   * attributes.
   */
  def mapEdges[ED2: ClassTag](map: Edge[ED] => ED2): Graph[VD, ED2] = {
    // Delegate to the per-partition overload; the partition id is not needed here.
    mapEdges((pid, iter) => iter.map(map))
  }

  /**
   * Transforms each edge attribute using the map function, passing it a whole partition at a
   * time. The map function is given an iterator over edges within a logical partition as well as
   * the partition's ID, and it should return a new iterator over the new values of each edge. The
   * new iterator's elements must correspond one-to-one with the old iterator's elements. If
   * adjacent vertex values are desired, use `mapTriplets`.
   *
   * @note This does not change the structure of the
   * graph or modify the values of this graph. As a consequence
   * the underlying index structures can be reused.
   *
   * @param map a function that takes a partition id and an iterator
   * over all the edges in the partition, and must return an iterator over
   * the new values for each edge in the order of the input iterator
   *
   * @tparam ED2 the new edge data type
   */
  def mapEdges[ED2: ClassTag](
      map: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2]

  /**
   * Transforms each edge attribute using the map function, passing it the adjacent vertex
   * attributes as well. If adjacent vertex values are not required,
   * consider using `mapEdges` instead.
   *
   * @note This does not change the structure of the
   * graph or modify the values of this graph. As a consequence
   * the underlying index structures can be reused.
   *
   * @param map the function from an edge object to a new edge value.
   *
   * @tparam ED2 the new edge data type
   *
   * @example This function might be used to initialize edge
   * attributes based on the attributes associated with each vertex.
   * {{{
   * val rawGraph: Graph[Int, Int] = someLoadFunction()
   * val graph = rawGraph.mapTriplets[Int]( edge =>
   *   edge.src.data - edge.dst.data)
   * }}}
   */
  def mapTriplets[ED2: ClassTag](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] = {
    // Delegate to the per-partition overload; the partition id is not needed here.
    mapTriplets((pid, iter) => iter.map(map))
  }

  /**
   * Transforms each edge attribute a partition at a time using the map function, passing it the
   * adjacent vertex attributes as well. The map function is given an iterator over edge triplets
   * within a logical partition and should yield a new iterator over the new values of each edge in
   * the order in which they are provided. If adjacent vertex values are not required, consider
   * using `mapEdges` instead.
   *
   * @note This does not change the structure of the
   * graph or modify the values of this graph. As a consequence
   * the underlying index structures can be reused.
   *
   * @param map the iterator transform
   *
   * @tparam ED2 the new edge data type
   */
  def mapTriplets[ED2: ClassTag](
      map: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2]): Graph[VD, ED2]

  /**
   * Reverses all edges in the graph. If this graph contains an edge from a to b then the returned
   * graph contains an edge from b to a.
   */
  def reverse: Graph[VD, ED]

  /**
   * Restricts the graph to only the vertices and edges satisfying the predicates. The resulting
   * subgraph satisfies
   *
   * {{{
   * V' = {v : for all v in V where vpred(v)}
   * E' = {(u,v): for all (u,v) in E where epred((u,v)) && vpred(u) && vpred(v)}
   * }}}
   *
   * @param epred the edge predicate, which takes a triplet and
   * evaluates to true if the edge is to remain in the subgraph. Note
   * that only edges where both vertices satisfy the vertex
   * predicate are considered.
   *
   * @param vpred the vertex predicate, which takes a vertex object and
   * evaluates to true if the vertex is to be included in the subgraph
   *
   * @return the subgraph containing only the vertices and edges that
   * satisfy the predicates
   */
  def subgraph(
      epred: EdgeTriplet[VD, ED] => Boolean = (x => true),
      vpred: (VertexId, VD) => Boolean = ((v, d) => true)): Graph[VD, ED]

  /**
   * Restricts the graph to only the vertices and edges that are also in `other`, but keeps the
   * attributes from this graph.
   *
   * @param other the graph to project this graph onto
   * @return a graph with vertices and edges that exist in both the current graph and `other`,
   * with vertex and edge data from the current graph
   */
  def mask[VD2: ClassTag, ED2: ClassTag](other: Graph[VD2, ED2]): Graph[VD, ED]

  /**
   * Merges multiple edges between two vertices into a single edge. For correct results, the graph
   * must have been partitioned using [[partitionBy]].
   *
   * @param merge the user-supplied commutative associative function to merge edge attributes
   * for duplicate edges.
   *
   * @return The resulting graph with a single edge for each (source, dest) vertex pair.
   */
  def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED]

  /**
   * Aggregates values from the neighboring edges and vertices of each vertex. The user supplied
   * `mapFunc` function is invoked on each edge of the graph, generating 0 or more "messages" to be
   * "sent" to either vertex in the edge. The `reduceFunc` is then used to combine the output of
   * the map phase destined to each vertex.
   *
   * @tparam A the type of "message" to be sent to each vertex
   *
   * @param mapFunc the user defined map function which returns 0 or
   * more messages to neighboring vertices
   *
   * @param reduceFunc the user defined reduce function which should
   * be commutative and associative and is used to combine the output
   * of the map phase
   *
   * @param activeSetOpt optionally, a set of "active" vertices and a direction of edges to
   * consider when running `mapFunc`. If the direction is `In`, `mapFunc` will only be run on
   * edges with destination in the active set. If the direction is `Out`,
   * `mapFunc` will only be run on edges originating from vertices in the active set. If the
   * direction is `Either`, `mapFunc` will be run on edges with *either* vertex in the active
   * set. If the direction is `Both`, `mapFunc` will be run on edges with *both* vertices in the
   * active set. The active set must have the same index as the graph's vertices.
   *
   * @example We can use this function to compute the in-degree of each
   * vertex
   * {{{
   * val rawGraph: Graph[(),()] = Graph.textFile("twittergraph")
   * val inDeg: RDD[(VertexId, Int)] =
   *   mapReduceTriplets[Int](et => Iterator((et.dst.id, 1)), _ + _)
   * }}}
   *
   * @note By expressing computation at the edge level we achieve
   * maximum parallelism. This is one of the core functions in the
   * Graph API in that it enables neighborhood level computation. For
   * example this function can be used to count neighbors satisfying a
   * predicate or implement PageRank.
   */
  def mapReduceTriplets[A: ClassTag](
      mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)],
      reduceFunc: (A, A) => A,
      activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None): VertexRDD[A]

  /**
   * Joins the vertices with entries in the `other` RDD and merges the results using `mapFunc`.
   * The input table should contain at most one entry for each vertex. If no entry in `other` is
   * provided for a particular vertex in the graph, the map function receives `None`.
   *
   * @tparam U the type of entry in the table of updates
   * @tparam VD2 the new vertex value type
   *
   * @param other the table to join with the vertices in the graph.
   * The table should contain at most one entry for each vertex.
   * @param mapFunc the function used to compute the new vertex values.
   * The map function is invoked for all vertices, even those
   * that do not have a corresponding entry in the table.
   *
   * @example This function is used to update the vertices with new values based on external data.
   * For example we could add the out-degree to each vertex record:
   *
   * {{{
   * val rawGraph: Graph[_, _] = Graph.textFile("webgraph")
   * val outDeg: RDD[(VertexId, Int)] = rawGraph.outDegrees
   * val graph = rawGraph.outerJoinVertices(outDeg) {
   *   (vid, data, optDeg) => optDeg.getOrElse(0)
   * }
   * }}}
   */
  def outerJoinVertices[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)])
      (mapFunc: (VertexId, VD, Option[U]) => VD2)(implicit eq: VD =:= VD2 = null): Graph[VD2, ED]

  /**
   * The associated [[GraphOps]] object.
   */
  // Save a copy of the GraphOps object so there is always one unique GraphOps object
  // for a given Graph object, and thus the lazy vals in GraphOps would work as intended.
  val ops: GraphOps[VD, ED] = new GraphOps(this)
} // end of Graph

/**
 * The Graph object contains a collection of routines used to construct graphs from RDDs.
 */
object Graph {

  /**
   * Construct a graph from a collection of edges encoded as vertex id pairs.
   *
   * @param rawEdges a collection of edges in (src, dst) form
   * @param defaultValue the vertex attributes with which to create vertices referenced by the edges
   * @param uniqueEdges if multiple identical edges are found they are combined and the edge
   * attribute is set to the sum. Otherwise duplicate edges are treated as separate. To enable
   * `uniqueEdges`, a [[PartitionStrategy]] must be provided.
   * @param edgeStorageLevel the desired storage level at which to cache the edges if necessary
   * @param vertexStorageLevel the desired storage level at which to cache the vertices if necessary
   *
   * @return a graph with edge attributes containing either the count of duplicate edges or 1
   * (if `uniqueEdges` is `None`) and vertices created with `defaultValue` as their attribute.
   */
  def fromEdgeTuples[VD: ClassTag](
      rawEdges: RDD[(VertexId, VertexId)],
      defaultValue: VD,
      uniqueEdges: Option[PartitionStrategy] = None,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, Int] = {
    // Each raw pair becomes an edge with attribute 1, so that summing the attributes of
    // duplicate edges in `groupEdges` below yields the number of duplicates.
    val edges = rawEdges.map(p => Edge(p._1, p._2, 1))
    val graph = GraphImpl(edges, defaultValue, edgeStorageLevel, vertexStorageLevel)
    uniqueEdges match {
      // Duplicates can only be merged after the edges are partitioned with the given strategy.
      case Some(p) => graph.partitionBy(p).groupEdges((a, b) => a + b)
      case None => graph
    }
  }

  /**
   * Construct a graph from a collection of edges.
   *
   * @param edges the RDD containing the set of edges in the graph
   * @param defaultValue the default vertex attribute to use for each vertex
   * @param edgeStorageLevel the desired storage level at which to cache the edges if necessary
   * @param vertexStorageLevel the desired storage level at which to cache the vertices if necessary
   *
   * @return a graph with edge attributes described by `edges` and vertices
   * given by all vertices in `edges` with value `defaultValue`
   */
  def fromEdges[VD: ClassTag, ED: ClassTag](
      edges: RDD[Edge[ED]],
      defaultValue: VD,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED] = {
    GraphImpl(edges, defaultValue, edgeStorageLevel, vertexStorageLevel)
  }

  /**
   * Construct a graph from a collection of vertices and
   * edges with attributes. Duplicate vertices are picked arbitrarily and
   * vertices found in the edge collection but not in the input
   * vertices are assigned the default attribute.
   *
   * @tparam VD the vertex attribute type
   * @tparam ED the edge attribute type
   * @param vertices the "set" of vertices and their attributes
   * @param edges the collection of edges in the graph
   * @param defaultVertexAttr the default vertex attribute to use for vertices that are
   * mentioned in edges but not in vertices
   * @param edgeStorageLevel the desired storage level at which to cache the edges if necessary
   * @param vertexStorageLevel the desired storage level at which to cache the vertices if necessary
   */
  def apply[VD: ClassTag, ED: ClassTag](
      vertices: RDD[(VertexId, VD)],
      edges: RDD[Edge[ED]],
      defaultVertexAttr: VD = null.asInstanceOf[VD],
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED] = {
    GraphImpl(vertices, edges, defaultVertexAttr, edgeStorageLevel, vertexStorageLevel)
  }

  /**
   * Implicitly extracts the [[GraphOps]] member from a graph.
   *
   * To improve modularity the Graph type only contains a small set of basic operations.
   * All the convenience operations are defined in the [[GraphOps]] class which may be
   * shared across multiple graph implementations.
   */
  implicit def graphToGraphOps[VD: ClassTag, ED: ClassTag]
      (g: Graph[VD, ED]): GraphOps[VD, ED] = g.ops
} // end of Graph object

Spark之GraphX的Graph_scala学习的更多相关文章

  1. Spark的Rpc模块的学习

    Spark的Rpc模块的学习 Spark的Rpc模块是1.x重构出来的,以前的代码中大量使用了akka的类,为了把akka从项目的依赖中移除,所以添加了该模块.先看下该模块的几个主要的类   使用E ...

  2. StreamDM:基于Spark Streaming、支持在线学习的流式分析算法引擎

    StreamDM:基于Spark Streaming.支持在线学习的流式分析算法引擎 streamDM:Data Mining for Spark Streaming,华为诺亚方舟实验室开源了业界第一 ...

  3. 【Todo】【读书笔记】大数据Spark企业级实战版 & Scala学习

    下了这本<大数据Spark企业级实战版>, 另外还有一本<Spark大数据处理:技术.应用与性能优化(全)> 先看前一篇. 根据书里的前言里面,对于阅读顺序的建议.先看最后的S ...

  4. Spark的Streaming和Spark的SQL简单入门学习

    1.Spark Streaming是什么? a.Spark Streaming是什么? Spark Streaming类似于Apache Storm,用于流式数据的处理.根据其官方文档介绍,Spark ...

  5. 【福利】送Spark大数据平台视频学习资料

    没有套路真的是送!! 大家都知道,大数据行业spark很重要,那话我就不多说了,贴心的大叔给你找了份spark的资料.   多啰嗦两句,一个好的程序猿的基本素养是学习能力和自驱力.视频给了你们,能不能 ...

  6. Spark (Python版) 零基础学习笔记(一)—— 快速入门

    由于Scala才刚刚开始学习,还是对python更为熟悉,因此在这记录一下自己的学习过程,主要内容来自于spark的官方帮助文档,这一节的地址为: http://spark.apache.org/do ...

  7. 原创:Spark中GraphX图运算pregel详解

    由于本人文字表达能力不足,还是多多以代码形式表述,首先展示测试代码,然后解释: package com.txq.spark.test import org.apache.spark.graphx.ut ...

  8. Spark应用的结构的学习

    关注公众号:分享电脑学习回复"百度云盘" 可以免费获取所有学习文档的代码(不定期更新) 承接上一篇文档<Standalone集群搭建和Spark应用监控> 需要了解的概 ...

  9. Spark Streaming官方文档学习--下

    Accumulators and Broadcast Variables 这些不能从checkpoint重新恢复 如果想启动检查点的时候使用这两个变量,就需要创建这些变量的懒惰的singleton实例 ...

随机推荐

  1. 手机连得上WIFI,电脑连不上的情况

    可以搜到,密码也对,但就是连不上,这时候可能就是你的设置错了. 操作步骤如下: 右击我的电脑-->管理-->设备管理器-->网络适配器-->找到你wifi对应的那个名称(如果不 ...

  2. Java16-java语法基础——异常

    Java16-java语法基础——异常 一.异常概念 1.异常:应用程序在运行过程中出现的错误或非正常的意外情况,即虚拟机的通常操作中可能遇到的异常,是一种常见的运行错误. 2.原因:数组越界.空指针 ...

  3. vi/vim 文字处理器常用命令

    目录 vi 与vim vi 的三种模式 vi 光标移动 vi 搜索与替换 vi 删除 vi 复制 vi 粘贴 vi 其他 vi 进入编辑模式 vi 命令行命令 vim 附加功能 vi 与vim vi是 ...

  4. win10下使用wget

    一.下载 官网:http://gnuwin32.sourceforge.net/packages/wget.htm 下载地址:http://downloads.sourceforge.net/gnuw ...

  5. [杂谈]杂谈章2 eclipse没有(添加)“Dynamic Web Project”

    原因:你安装的是专门开发java项目的,而Dynamic Web Project  属于J2EE技术 第一种方法: 你要专门下载一个集成了J2EE插件的Eclipse,到eclipse官网下载相对应版 ...

  6. SignalR 服务器系统配置要求

    SignalR 所支持的服务器版本..NET Framework 版本.IIS和其他组件. SignalR操作系统要求 SignalR组件能够运行在下面的服务器和客户端操作系统.需要注意的是使用Web ...

  7. 安装php7.2并且整合nginx且分开部署

    1.安装php 7.2 2.php配置 3.nginx配置 4.测试 5.报错与解决 6.利用upstream实现负载均衡 1.安装php 7.2 启动容器: liwangdeMacBook-Air: ...

  8. stm32手册上的英文

    crystal-less 无晶振 USB  FS(Full-speed)此外还有High-speed接口(简称HS),Low-speed接口(简称LS) frequency频率 CRC(Cyclic ...

  9. 前端vue框架 路由的安装及使用

    安装: 1.cmd下输入: npm install vue-router --save //安装路由 2.npm run dev //重新启动 使用: 1.在main.js下引入路由 import V ...

  10. 注册Docker官网账号 注册按钮不能点

    出现如下问题:注册按钮不能点,解决办法,如下 关于docker hub上不能注册dockeID的问题 注意的是,google访问助手,用在线安装,360安全浏览器,再重启下该浏览器,省得装插件.