spark-2.2.0-bin-hadoop2.6和spark-1.6.1-bin-hadoop2.6发行包自带案例全面详解（java、python、r和scala）之Basic包下的JavaPageRank.java（图文详解）

不多说，直接上干货！

spark-1.6.1-bin-hadoop2.6里Basic包下的JavaPageRank.java

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */

//package org.apache.spark.examples;

package zhouls.bigdata.Basic;

import scala.Tuple2;//scala里的元组

import com.google.common.collect.Iterables;

import org.apache.spark.SparkConf;

import org.apache.spark.api.java.JavaPairRDD;

import org.apache.spark.api.java.JavaRDD;

import org.apache.spark.api.java.JavaSparkContext;

import org.apache.spark.api.java.function.Function;

import org.apache.spark.api.java.function.Function2;

import org.apache.spark.api.java.function.PairFlatMapFunction;

import org.apache.spark.api.java.function.PairFunction;

import java.util.ArrayList;

import java.util.List;

import java.util.Iterator;

import java.util.regex.Pattern;

/**

 * Computes the PageRank of URLs from an input file. Input file should

 * be in format of:

 * URL         neighbor URL

 * URL         neighbor URL

 * URL         neighbor URL

 * ...

 * where URL and their neighbors are separated by space(s).

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to org.apache.spark.graphx.lib.PageRank

 */

public final class JavaPageRank {

  private static final Pattern SPACES = Pattern.compile("\\s+");

  /*

   * 显示警告函数

   */

  static void showWarning() {

    String warning = "WARN: This is a naive implementation of PageRank " +

            "and is given as an example! \n" +

            "Please use the PageRank implementation found in " +

            "org.apache.spark.graphx.lib.PageRank for more conventional use.";

    System.err.println(warning);

  }

  private static class Sum implements Function2<Double, Double, Double> {

    @Override

    public Double call(Double a, Double b) {

      return a + b;

    }

  }

  /*

   * 主函数

   */

  public static void main(String[] args) throws Exception {

    if (args.length < ) {

      System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");

      System.exit();

    }

    showWarning();

    SparkConf sparkConf = new SparkConf().setAppName("JavaPageRank").setMaster("local");

    JavaSparkContext ctx = new JavaSparkContext(sparkConf);

    // Loads in input file. It should be in format of:

    //     URL         neighbor URL

    //     URL         neighbor URL

    //     URL         neighbor URL

    //     ...

//  JavaRDD<String> lines = ctx.textFile(args[0], 1);//这是官网发行包里写的

    JavaRDD<String> lines = ctx.textFile("data/input/mllib/pagerank_data.txt", );

    // Loads all URLs from input file and initialize their neighbors.

    //根据边关系数据生成 邻接表 如：(1,(2,3,4,5)) (2,(1,5))...

    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(new PairFunction<String, String, String>() {

      @Override

      public Tuple2<String, String> call(String s) {

        String[] parts = SPACES.split(s);

        return new Tuple2<String, String>(parts[], parts[]);

      }

    }).distinct().groupByKey().cache();

    //初始化 ranks, 每一个url初始分值为1

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.

    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {

      @Override

      public Double call(Iterable<String> rs) {

        return 1.0;

      }

    });

    /*

     * 迭代iters次； 每次迭代中做如下处理， links（urlKey, neighborUrls） join (urlKey, rank(分值))；

     * 对neighborUrls以及初始 rank，每一个neighborUrl  , neighborUrlKey, 初始rank/size(新的rank贡献值)；

     * 然后再进行reduceByKey相加 并对分值 做调整 0.15 + 0.85 * _

     */

    // Calculates and updates URL ranks continuously using PageRank algorithm.

    for (int current = ; current < Integer.parseInt(args[]); current++) {

      // Calculates URL contributions to the rank of other URLs.

      JavaPairRDD<String, Double> contribs = links.join(ranks).values()

        .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {

          @Override

          public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {

            int urlCount = Iterables.size(s._1);

            List<Tuple2<String, Double>> results = new ArrayList<Tuple2<String, Double>>();

            for (String n : s._1) {

              results.add(new Tuple2<String, Double>(n, s._2() / urlCount));

            }

            return results;

          }

      });

      // Re-calculates URL ranks based on neighbor contributions.

      ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {

        @Override

        public Double call(Double sum) {

          return 0.15 + sum * 0.85;

        }

      });

    }

    //输出排名

    // Collects all URL ranks and dump them to console.

    List<Tuple2<String, Double>> output = ranks.collect();

    for (Tuple2<?,?> tuple : output) {

        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");

    }

    ctx.stop();

  }

}

　　暂时还没有运行结果。

spark-2.2.0-bin-hadoop2.6里Basic包下的JavaPageRank.java

/*

 * Licensed to the Apache Software Foundation (ASF) under one or more

 * contributor license agreements.  See the NOTICE file distributed with

 * this work for additional information regarding copyright ownership.

 * The ASF licenses this file to You under the Apache License, Version 2.0

 * (the "License"); you may not use this file except in compliance with

 * the License.  You may obtain a copy of the License at

 *

 *    http://www.apache.org/licenses/LICENSE-2.0

 *

 * Unless required by applicable law or agreed to in writing, software

 * distributed under the License is distributed on an "AS IS" BASIS,

 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 * See the License for the specific language governing permissions and

 * limitations under the License.

 */

//package org.apache.spark.examples;

package zhouls.bigdata.Basic;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Pattern;

import scala.Tuple2;

import com.google.common.collect.Iterables;

import org.apache.spark.api.java.JavaPairRDD;

import org.apache.spark.api.java.JavaRDD;

import org.apache.spark.api.java.function.Function2;

import org.apache.spark.sql.SparkSession;    

/**

 * Computes the PageRank of URLs from an input file. Input file should

 * be in format of:

 * URL         neighbor URL

 * URL         neighbor URL

 * URL         neighbor URL

 * ...

 * where URL and their neighbors are separated by space(s).

 *

 * This is an example implementation for learning how to use Spark. For more conventional use,

 * please refer to org.apache.spark.graphx.lib.PageRank

 *

 * Example Usage:

 * <pre>

 * bin/run-example JavaPageRank data/mllib/pagerank_data.txt 10

 * </pre>

 */

public final class JavaPageRank {

  private static final Pattern SPACES = Pattern.compile("\\s+");

  /*

   * 显示警告函数

   */

  static void showWarning() {

    String warning = "WARN: This is a naive implementation of PageRank " +

            "and is given as an example! \n" +

            "Please use the PageRank implementation found in " +

            "org.apache.spark.graphx.lib.PageRank for more conventional use.";

    System.err.println(warning);

  }

  private static class Sum implements Function2<Double, Double, Double> {

    @Override

    public Double call(Double a, Double b) {

      return a + b;

    }

  }

  /*

   * 主函数

   */

  public static void main(String[] args) throws Exception {

    if (args.length < ) {

      System.err.println("Usage: JavaPageRank <file> <number_of_iterations>");

      System.exit();

    }

    showWarning();

    SparkSession spark = SparkSession

      .builder()

      .master("local")

      .appName("JavaPageRank")

      .getOrCreate();

    // Loads in input file. It should be in format of:

    //     URL         neighbor URL

    //     URL         neighbor URL

    //     URL         neighbor URL

    //     ...

//  JavaRDD<String> lines = spark.read().textFile(args[0]).javaRDD();

    JavaRDD<String> lines = spark.read().textFile("data/input/mllib/pagerank_data.txt").javaRDD();

    // Loads all URLs from input file and initialize their neighbors.

    //根据边关系数据生成 邻接表 如：(1,(2,3,4,5)) (2,(1,5))...

    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {

      String[] parts = SPACES.split(s);

      return new Tuple2<>(parts[], parts[]);

    }).distinct().groupByKey().cache();

    // Loads all URLs with other URL(s) link to from input file and initialize ranks of them to one.

    //初始化 ranks, 每一个url初始分值为1

    JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);

    /*

     * 迭代iters次； 每次迭代中做如下处理， links（urlKey, neighborUrls） join (urlKey, rank(分值))；

     * 对neighborUrls以及初始 rank，每一个neighborUrl  , neighborUrlKey, 初始rank/size(新的rank贡献值)；

     * 然后再进行reduceByKey相加 并对分值 做调整 0.15 + 0.85 * _

     */

    // Calculates and updates URL ranks continuously using PageRank algorithm.

    for (int current = ; current < Integer.parseInt(args[]); current++) {

      // Calculates URL contributions to the rank of other URLs.

      JavaPairRDD<String, Double> contribs = links.join(ranks).values()

        .flatMapToPair(s -> {

          int urlCount = Iterables.size(s._1());

          List<Tuple2<String, Double>> results = new ArrayList<>();

          for (String n : s._1) {

            results.add(new Tuple2<>(n, s._2() / urlCount));

          }

          return results.iterator();

        });

      // Re-calculates URL ranks based on neighbor contributions.

      ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);

    }

    //输出排名

    // Collects all URL ranks and dump them to console.

    List<Tuple2<String, Double>> output = ranks.collect();

    for (Tuple2<?,?> tuple : output) {

      System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");

    }

    spark.stop();

  }

}

　　暂时还没有运行结果。

spark-2.2.0-bin-hadoop2.6和spark-1.6.1-bin-hadoop2.6发行包自带案例全面详解（java、python、r和scala）之Basic包下的JavaPageRank.java（图文详解）的更多相关文章

(转)CentOS 6下配置软RAID图文详解
CentOS 6下配置软RAID图文详解原文:http://blog.51cto.com/hujiangtao/1929620 一.RAID 简介 RAID 是英文Redundant Array o ...
反射实现Model修改前后的内容对比【API调用】腾讯云短信 Windows操作系统下Redis服务安装图文详解 Redis入门学习
反射实现Model修改前后的内容对比在开发过程中,我们会遇到这样一个问题,编辑了一个对象之后,我们想要把这个对象修改了哪些内容保存下来,以便将来查看和追责. 首先我们要创建一个User类 1 p ...
java扫描某个包下的所有java类并加载
最近在学习java的反射和注解,实际情景中需要扫描某个包下的所有java类,然后使用类加载器加载类. 基本思路,获得程序的路径扫描src下某个包内的子包和java类,实现也比较简单. 运行环境:win ...
spark最新源码下载并导入到开发环境下助推高质量代码(Scala IDEA for Eclipse和IntelliJ IDEA皆适用）（以spark2.2.0源码包为例）（图文详解）
不多说,直接上干货! 前言其实啊,无论你是初学者还是具备了有一定spark编程经验,都需要对spark源码足够重视起来. 本人,肺腑之己见,想要成为大数据的大牛和顶尖专家,多结合源码和操练编程. ...
如何用R来处理数据表的长宽转换（图文详解）
不多说,直接上干货! 很多地方都需用到这个知识点,比如Tableau里. 通常可以采取如python 和 r来作为数据处理的前期. Tableau学习系列之Tableau如何通过数据透视表方式读取 ...
iOS使用Charles（青花瓷）抓包并篡改返回数据图文详解
写本文的契机主要是前段时间有次用青花瓷抓包有一步忘了,在网上查了半天也没找到写的完整的教程,于是待问题解决后抽时间截了图,自己写一遍封存在博客园中以便以后随时查阅. charles又名青花瓷,在iOS ...
java.util.regex包下的Pattern和Matcher详解（正则匹配）
java正则表达式通过java.util.regex包下的Pattern类与Matcher类实现(建议在阅读本文时,打开java API文档,当介绍到哪个方法时,查看java API中的方法说明,效果 ...
Java并发机制（8）--concurrent包下辅助类的使用
Java并发编程:concurrent包下辅助类的使用整理自:博客园-海子-http://www.cnblogs.com/dolphin0520/p/3920397.html 1.CountDown ...
执行Hive时出现org.apache.hadoop.util.RunJar.main(RunJar.java:136) Caused by: java.lang.NumberFormatException: For input string: "1s"错误的解决办法（图文详解）
不多说,直接上干货问题详情 [kfk@bigdata-pro01 apache-hive--bin]$ bin/hive Logging initialized -bin/conf/hive-log ...

随机推荐

arm-linux-gcc4.4.3编译busybox-1.25.0
系统环境: 1.操作系统:Ubuntu16.04 2.交叉编译工具链:arm-linux-gcc4.4.3 3.busybox源码包:busybox-1.25.0 一.修改Makefile配置首先解 ...
BZOJ1855 [Scoi2010]股票交易[单调队列dp]
题题面有点复杂,不概括了. 后面的状态有前面的最优解获得大致方向是dp.先是瞎想了个$f[i][j]$表示第$i$天手里有$j$张股票时最大收入(当天无所谓买不买). 然后写了一个$O(n^4)$状 ...
ACM学习历程——HDU 5014 Number Sequence （贪心）（2014西安网赛）
Description There is a special number sequence which has n+1 integers. For each number in sequence, ...
dataguard类型转换与模式转化
修改数据保护模式步骤前提:是否满足转换模式的配置要求最大保护(Maximum Protection):Standby Database 必须配置Standby Redo Log,Primary D ...
bzoj 1004 Cards & poj 2409 Let it Bead —— 置换群
题目:https://www.lydsy.com/JudgeOnline/problem.php?id=1004 关于置换群:https://www.cnblogs.com/nietzsche-oie ...
中国移动推出NB-IoT/eMTC/GSM多模通信模组Qualcomm调制解调器支持
亚洲电子消费展(CES Asia)在上海举行.期间,中国移动正式推出NB-IoT/eMTC/GSM三模通信模组A9500.该通信模组采用Qualcomm MDM9206 LTE IoT调制解调器,具有 ...
cassandra迁移表数据
cassandra的迁移表数据有2种方式,以keyspace名为mydb,table名为user为例子: 方法一:copy命令. 这种方式适合数据量较小的情况. 1.进入cqlsh,输入命令:COPY ...
面试题: 数据库笔试 sql操作已看上课的练习题50sql
2018/5/31 oracle数据库面试笔试试题总结http://www.yjbys.com/qiuzhizhinan/show-308759.html 1/4Oracle数据库1.基础测试选择在部 ...
mysql 1069 数据库无法启动解决办法
mysql无缘无故的启动不了了. 在控制台里面用root连接,报错10061. 在服务管理里面启动,报错1069. 在网上找了一些解决方法,删除my.ini之类的,都无效.后来在百度经验里面找到了可行 ...
Hash表的实现
#include "stdafx.h" #include <iostream> #include <exception> using namespace s ...

spark-2.2.0-bin-hadoop2.6和spark-1.6.1-bin-hadoop2.6发行包自带案例全面详解（java、python、r和scala）之Basic包下的JavaPageRank.java（图文详解）

spark-2.2.0-bin-hadoop2.6和spark-1.6.1-bin-hadoop2.6发行包自带案例全面详解（java、python、r和scala）之Basic包下的JavaPageRank.java（图文详解）的更多相关文章

随机推荐

热门专题