HiBench Learning Notes (5): HiBench-Spark-SQL-Scan Source Code Analysis
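This note walks the call chain of the Spark SQL Scan workload: run.sh sets up the workload configuration and the generated SQL script, run_spark_job in workload_functions.sh assembles and submits the spark-submit command, ScalaSparkSQLBench.scala executes the SQL script statement by statement through a HiveContext, and HiveData.java is the MapReduce data generator that produces the rankings and uservisits tables the scan query reads.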
run.sh
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
current_dir=`dirname "$0"`
current_dir=`cd "$current_dir"; pwd`
root_dir=${current_dir}/../../../../../
workload_config=${root_dir}/conf/workloads/sql/scan.conf
. "${root_dir}/bin/functions/load_bench_config.sh"
enter_bench ScalaSparkScan ${workload_config} ${current_dir}
show_bannar start
# prepare SQL
HIVEBENCH_SQL_FILE=${WORKLOAD_RESULT_FOLDER}/rankings_uservisits_scan.hive
prepare_sql_scan ${HIVEBENCH_SQL_FILE}
START_TIME=`timestamp`
rmr_hdfs $OUTPUT_HDFS
run_spark_job com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench ScalaScan ${HIVEBENCH_SQL_FILE}
END_TIME=`timestamp`
SIZE=`dir_size $OUTPUT_HDFS`
gen_report ${START_TIME} ${END_TIME} ${SIZE:-}
show_bannar finish
leave_bench
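run.sh itself contains no SQL; prepare_sql_scan (defined in workload_functions.sh) writes the Hive statements into rankings_uservisits_scan.hive, and the Spark job then runs that file statement by statement. The Scala sketch below shows roughly what that execution amounts to for the Scan workload. The object name ScanSketch, the uservisits column list, and the HDFS locations are assumptions based on a typical HiBench scan configuration, not copied from the generated file.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

// Sketch only: the Scan workload declares an external table over the generated
// uservisits data and copies it into a second external table with a full scan.
// The column list and HDFS locations below are assumed, not taken from HiBench.
object ScanSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("ScanSketch"))
    val hc = new HiveContext(sc)

    val columns = "sourceIP STRING, destURL STRING, visitDate STRING, adRevenue DOUBLE, " +
      "userAgent STRING, countryCode STRING, languageCode STRING, searchWord STRING, duration INT"

    Seq(
      s"CREATE EXTERNAL TABLE IF NOT EXISTS uservisits ($columns) " +
        "ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/HiBench/Scan/Input/uservisits'",
      s"CREATE EXTERNAL TABLE IF NOT EXISTS uservisits_copy ($columns) " +
        "ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/HiBench/Scan/Output/uservisits_copy'",
      "INSERT OVERWRITE TABLE uservisits_copy SELECT * FROM uservisits"
    ).foreach(hc.sql)

    sc.stop()
  }
}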
workload_functions.sh
function run_spark_job() {
LIB_JARS=
while (($#)); do
if [ "$1" = "--jars" ]; then
LIB_JARS="--jars $2"
continue
fi
break
done
CLS=$1
shift
export_withlog SPARKBENCH_PROPERTIES_FILES
YARN_OPTS=""
if [[ "$SPARK_MASTER" == yarn-* ]]; then
export_withlog HADOOP_CONF_DIR
YARN_OPTS="--num-executors ${YARN_NUM_EXECUTORS}"
if [[ -n "${YARN_EXECUTOR_CORES:-}" ]]; then
YARN_OPTS="${YARN_OPTS} --executor-cores ${YARN_EXECUTOR_CORES}"
fi
if [[ -n "${SPARK_YARN_EXECUTOR_MEMORY:-}" ]]; then
YARN_OPTS="${YARN_OPTS} --executor-memory ${SPARK_YARN_EXECUTOR_MEMORY}"
fi
if [[ -n "${SPAKR_YARN_DRIVER_MEMORY:-}" ]]; then
YARN_OPTS="${YARN_OPTS} --driver-memory ${SPARK_YARN_DRIVER_MEMORY}"
fi
fi
if [[ "$CLS" == *.py ]]; then
LIB_JARS="$LIB_JARS --jars ${SPARKBENCH_JAR}"
SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --master ${SPARK_MASTER} ${YARN_OPTS} ${CLS} $@"
else
SUBMIT_CMD="${SPARK_HOME}/bin/spark-submit ${LIB_JARS} --properties-file ${SPARK_PROP_CONF} --class ${CLS} --master ${SPARK_MASTER} ${YARN_OPTS} ${SPARKBENCH_JAR} $@"
fi
echo -e "${BGreen}Submit Spark job: ${Green}${SUBMIT_CMD}${Color_Off}"
MONITOR_PID=`start_monitor`
execute_withlog ${SUBMIT_CMD}
result=$?
stop_monitor ${MONITOR_PID}
if [ $result -ne 0 ]
then
echo -e "${BRed}ERROR${Color_Off}: Spark job ${BYellow}${CLS}${Color_Off} failed to run successfully."
echo -e "${BBlue}Hint${Color_Off}: You can goto ${BYellow}${WORKLOAD_RESULT_FOLDER}/bench.log${Color_Off} to check for detailed log.\nOpening log tail for you:\n"
tail ${WORKLOAD_RESULT_FOLDER}/bench.log
exit $result
fi
}
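For the Scan workload, run.sh calls this function as run_spark_job com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench ScalaScan ${HIVEBENCH_SQL_FILE}, so SUBMIT_CMD expands to roughly spark-submit --properties-file <spark conf> --class com.intel.hibench.sparkbench.sql.ScalaSparkSQLBench --master ${SPARK_MASTER} [YARN executor/driver options] <sparkbench assembly jar> ScalaScan .../rankings_uservisits_scan.hive; the exact jar and property-file paths come from the HiBench configuration loaded earlier.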
ScalaSparkSQLBench.scala
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.intel.hibench.sparkbench.sql
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext
/*
* ported from HiBench's hive bench
*/
object ScalaSparkSQLBench{
def main(args: Array[String]){
if (args.length < 2){
System.err.println(
s"Usage: $ScalaSparkSQLBench <workload name> <SQL sciprt file>"
)
System.exit(1)
}
val workload_name = args(0)
val sql_file = args(1)
val sparkConf = new SparkConf().setAppName(workload_name)
val sc = new SparkContext(sparkConf)
val hc = new HiveContext(sc)
val _sql = scala.io.Source.fromFile(sql_file).mkString
_sql.split(';').foreach { x =>
if (x.trim.nonEmpty)
hc.sql(x)
}
sc.stop()
}
}
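The driver above is written against the Spark 1.x API (SparkContext plus HiveContext). On Spark 2.x and later, HiveContext is deprecated; a roughly equivalent driver — a sketch for comparison, not part of the HiBench source tree — would go through SparkSession with Hive support enabled:
import org.apache.spark.sql.SparkSession

// Sketch only: same behaviour as ScalaSparkSQLBench, written against the Spark 2.x+ API.
object ScalaSparkSQLBench2x {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) {
      System.err.println("Usage: ScalaSparkSQLBench2x <workload name> <SQL script file>")
      System.exit(1)
    }
    val workloadName = args(0)
    val sqlFile = args(1)
    val spark = SparkSession.builder()
      .appName(workloadName)
      .enableHiveSupport()   // needed so spark.sql() can reach the Hive metastore tables
      .getOrCreate()
    val script = scala.io.Source.fromFile(sqlFile).mkString
    script.split(';').map(_.trim).filter(_.nonEmpty).foreach(spark.sql)
    spark.stop()
  }
}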
HiveData.java
package HiBench;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleInputs;
import org.apache.hadoop.mapred.lib.NLineInputFormat;
public class HiveData {
private static final Log log = LogFactory.getLog(HiveData.class.getName());
private static final String RANKINGS = "rankings";
private static final String USERVISITS = "uservisits";
public static final String uagentf = "user_agents";
public static final String countryf = "country_codes";
public static final String searchkeyf = "search_keys";
private DataOptions options;
private long visits;
// client side delim
private String cdelim = ",";
private int chashsize = 150 * 1024 * 1024;
private Dummy dummy;
HiveData(DataOptions options) {
this.options = options;
parseArgs(options.getRemainArgs());
}
private void parseArgs(String[] args) {
for (int i=0; i<args.length; i++) {
if ("-v".equals(args[i])) {
visits = Long.parseLong(args[++i]);
} else if ("-d".equals(args[i])) {
cdelim = args[++i];
} else {
DataOptions.printUsage("Unknown hive data arguments -- " + args[i] + "!!!");
}
}
if (chashsize > options.getNumPages()) {
chashsize = (int) options.getNumPages();
}
}
private void setRankingsOptions(JobConf job) throws URISyntaxException {
job.setLong("pages", options.getNumPages());
job.setLong("slotpages", options.getNumSlotPages());
job.set("delimiter", cdelim);
job.setInt("hashsize", chashsize);
Utils.shareLinkZipfCore(options, job);
}
private void setVisitsOptions(JobConf job) {
job.setInt("slots", options.getNumMaps());
job.setLong("pages", options.getNumPages());
job.setLong("visits", visits);
job.set("delimiter", cdelim);
}
public static class DummyToRankingsMapper extends MapReduceBase implements
Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {
private static final Log log = LogFactory.getLog(DummyToRankingsMapper.class.getName());
private HtmlCore generator;
private long pages, slotpages;
private boolean outset;
private OutputCollector<LongWritable, JoinBytesInt> myout;
private JoinBytesInt uitem, ritem;
private short[] hash;
private HashMap<Integer, Integer> hm;
private int hashsize;
private void getOptions(JobConf job) {
pages = job.getLong("pages", 0);
slotpages = job.getLong("slotpages", 0);
hashsize = job.getInt("hashsize", 0);
}
public void configure(JobConf job) {
getOptions(job);
try {
generator = new HtmlCore(job);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
outset = false;
myout = null;
uitem = new JoinBytesInt();
uitem.url = new byte[HtmlCore.getMaxUrlLength()];
ritem = new JoinBytesInt();
ritem.refs = 1;
hash = new short[hashsize];
hm = new HashMap<Integer, Integer>();
}
public void map(LongWritable key, Text value, OutputCollector<LongWritable, JoinBytesInt> output,
Reporter reporter) throws IOException {
if (!outset) {
myout = output;
outset = true;
}
int slotId = Integer.parseInt(value.toString().trim());
generator.fireRandom(slotId);
long[] range = HtmlCore.getPageRange(slotId, pages, slotpages);
/**
* For output collect
*/
for (long i=range[0]; i<range[1]; i++) {
key.set(i);
generator.nextUrlJoinBytesInt(uitem);
output.collect(key, uitem);
long[] linkids = generator.genPureLinkIds();
for (int j=0; j<linkids.length; j++) {
long uid = linkids[j];
if (uid < hashsize) {
int iid = (int) uid;
if (hash[iid]>=0) {
if (hash[iid]==HtmlCore.MAX_SHORT) {
hm.put(iid, (int) (hash[iid]) + 1);
hash[iid] = -1;
} else {
hash[iid]++;
}
} else {
hm.put(iid, hm.get(iid) + 1);
}
} else {
key.set(uid);
output.collect(key, ritem);
}
}
if (0==(i % 10000)) {
log.info("still running: " + (i - range[0]) + " of " + slotpages);
}
}
}
@Override
public void close ()
{
try {
LongWritable k = new LongWritable();
for (int i=0; i<hash.length; i++) {
if (hash[i] > 0) {
k.set(i);
ritem.refs = hash[i];
myout.collect(k, ritem);
} else if (hash[i] < 0) {
k.set(i);
ritem.refs = hm.get(i);
myout.collect(k, ritem);
}
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
public static class JoinBytesIntCombiner extends MapReduceBase implements
Reducer<LongWritable, JoinBytesInt, LongWritable, JoinBytesInt> {
// Log log = null;
JoinBytesInt item;
@Override
public void configure (JobConf job)
{
item = new JoinBytesInt();
// log = LogFactory.getLog(JoinBytesIntCombiner.class.getName());
}
@Override
public void reduce(LongWritable key, Iterator<JoinBytesInt> values,
OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter) throws IOException {
item.clear();
// StringBuffer sb = new StringBuffer("Combine: " + v.toString());
while (values.hasNext()) {
item.add(values.next());
// sb.append("-> " + v.toString());
}
output.collect(key, item);
// log.info(sb);
}
}
public static class GenerateRankingsReducer extends MapReduceBase implements
Reducer<LongWritable, JoinBytesInt, LongWritable, Text> {
private static final Log log = LogFactory.getLog(GenerateRankingsReducer.class.getName());
private Random rand;
private int errors, missed;
private JoinBytesInt v;
private int pid;
// job side delimiter
private String delim;
// private String missedids;
public void configure (JobConf job)
{
delim = job.get("delimiter");
pid = job.getInt("mapred.task.partition", 0);
rand = new Random(pid + 1);
v = new JoinBytesInt();
errors = 0;
missed = 0;
// missedids = "";
}
public void close ()
{
log.info("pid: " + pid + ", " + errors + " erros, " + missed + " missed");
}
@Override
public void reduce(LongWritable key, Iterator<JoinBytesInt> values,
OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {
v.clear();
while (values.hasNext()) {
v.add(values.next());
}
if (0!=v.ulen) {
if (v.refs > 0) {
Text value = new Text(
new String(v.url) +
delim +
v.refs +
delim +
(rand.nextInt(99) + 1)
);
output.collect(
key, value);
reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8+value.getLength());
} else {
missed++;
}
} else {
errors++;
}
}
}
private void createRankingsTableDirectly() throws IOException, URISyntaxException {
log.info("Creating table rankings...");
Path fout = new Path(options.getResultPath(), RANKINGS);
JobConf job = new JobConf(HiveData.class);
String jobname = "Create rankings";
/** TODO: switch to a more efficient approach, as this operation may add
 * about 2 min of delay (originally ~15 min in total)
 */
setRankingsOptions(job);
job.setJobName(jobname);
job.set("mapred.reduce.slowstart.completed.maps", "0.3");
job.set("mapreduce.job.reduce.slowstart.completedmaps", "0.3");
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(JoinBytesInt.class);
job.setJarByClass(DummyToRankingsMapper.class);
job.setJarByClass(JoinBytesIntCombiner.class);
job.setJarByClass(GenerateRankingsReducer.class);
job.setMapperClass(DummyToRankingsMapper.class);
job.setCombinerClass(JoinBytesIntCombiner.class);
job.setReducerClass(GenerateRankingsReducer.class);
if (options.getNumReds() > 0) {
job.setNumReduceTasks(options.getNumReds());
} else {
job.setNumReduceTasks(Utils.getMaxNumReds());
}
job.setInputFormat(NLineInputFormat.class);
FileInputFormat.setInputPaths(job, dummy.getPath());
job.set("mapred.map.output.compression.type", "BLOCK");
job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK");
MapFileOutputFormat.setCompressOutput(job, true);
// MapFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.LzoCodec.class);
MapFileOutputFormat.setOutputCompressorClass(job, org.apache.hadoop.io.compress.DefaultCodec.class);
if (options.isSequenceOut()) {
job.setOutputFormat(SequenceFileOutputFormat.class);
} else {
job.setOutputFormat(TextOutputFormat.class);
}
if (null != options.getCodecClass()) {
job.set("mapred.output.compression.type","BLOCK");
job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK");
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
}
FileOutputFormat.setOutputPath(job, fout);
log.info("Running Job: " +jobname);
log.info("Pages file " + dummy.getPath() + " as input");
log.info("Rankings file " + fout + " as output");
JobClient.runJob(job);
log.info("Finished Running Job: " + jobname);
}
/***
 * Mapper to randomly create user visits. In the map step, only the target
 * urls of the user visits are created; the remaining fields of each visit
 * are filled in during the reduce step.
 * @author lyi2
 */
public static class DummyToAccessNoMapper extends MapReduceBase implements
Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {
private JoinBytesInt vitem;
private long pages;
private long slots;
private long visits;
// job side delimiter
private String delim;
private Visit visit;
public void configure (JobConf job)
{
try {
pages = job.getLong("pages", 0);
slots = job.getLong("slots", 0);
visits = job.getLong("visits", 0);
delim = job.get("delimiter");
visit = new Visit(DistributedCache.getLocalCacheFiles(job),
delim, pages);
vitem = new JoinBytesInt();
vitem.refs = 1;
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void map(LongWritable key, Text value,
OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter)
throws IOException {
int slotId = Integer.parseInt(value.toString().trim());
visit.fireRandom(slotId);
for (long i=slotId; i<=visits;) {
// simply setting url id is fine in map step
key.set(visit.nextUrlId());
output.collect(key, vitem);
i = i + slots;
}
}
}
public static class SequenceRankingsToUrlsMapper extends MapReduceBase implements
Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {
public JoinBytesInt uitem;
public void configure(JobConf job) {
uitem = new JoinBytesInt();
// getBasicOptions(job);
}
@Override
public void map(LongWritable key, Text value,
OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter) throws IOException {
uitem.url= value.toString().split(",")[0].getBytes();
uitem.ulen = (byte) uitem.url.length;
output.collect(key, uitem);
}
}
public static class TextRankingsToUrlsMapper extends MapReduceBase implements
Mapper<LongWritable, Text, LongWritable, JoinBytesInt> {
public JoinBytesInt uitem;
public void configure(JobConf job) {
uitem = new JoinBytesInt();
// getBasicOptions(job);
}
@Override
public void map(LongWritable key, Text value,
OutputCollector<LongWritable, JoinBytesInt> output, Reporter reporter) throws IOException {
String[] items = value.toString().split("[,\t]");
key.set(Long.parseLong(items[0]));
uitem.url= items[1].getBytes();
uitem.ulen = (byte) uitem.url.length;
output.collect(key, uitem);
}
}
public static class CreateUserVisitsReducer extends MapReduceBase implements
Reducer<LongWritable, JoinBytesInt, LongWritable, Text> {
private static final Log log = LogFactory.getLog(CreateUserVisitsReducer.class.getName());
private long pages;
private Visit visit;
private int errors, missed;
private JoinBytesInt vitem;
// job side delimiter
private String delim;
private int pid;
public void configure (JobConf job)
{
try {
pages = job.getLong("pages", 0);
delim = job.get("delimiter");
pid = job.getInt("mapred.task.partition", 0);
visit = new Visit(DistributedCache.getLocalCacheFiles(job),
delim, pages);
visit.fireRandom(pid + 1);
vitem = new JoinBytesInt();
errors = 0;
missed = 0;
} catch (IOException e) {
e.printStackTrace();
}
}
public void close ()
{
log.info("pid: " + pid + ", " + errors + " erros, " + missed + " missed");
}
/**
 * Reduce: merge the url item with the visit counts collected for each page id,
 * then emit one complete user-visit record per reference via Visit.nextAccess().
 */
@Override
public void reduce(LongWritable key, Iterator<JoinBytesInt> values,
OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {
vitem.clear();
// StringBuffer sb = new StringBuffer("Reduce: " + v.toString());
while (values.hasNext()) {
vitem.add(values.next());
// sb.append("-> " + v.toString());
}
// log.info(sb);
if (0!=vitem.ulen) {
if (vitem.refs > 0) {
for (int i=0; i<vitem.refs; i++) {
Text value = new Text(visit.nextAccess(new String(vitem.url)));
output.collect(key, value);
reporter.incrCounter(HiBench.Counters.BYTES_DATA_GENERATED, 8+value.getLength());
}
} else {
missed++;
}
} else {
errors++;
}
}
}
private void createUserVisitsTableDirectly() throws IOException, URISyntaxException {
log.info("Creating user visits...");
Path rankings = new Path(options.getResultPath(), RANKINGS);
Path fout = new Path(options.getResultPath(), USERVISITS);
JobConf job = new JobConf(HiveData.class);
String jobname = "Create uservisits";
job.setJobName(jobname);
setVisitsOptions(job);
/***
* Set distributed cache file for table generation,
* cache files include:
* 1. user agents
* 2. country code and language code
* 3. search keys
*/
Path uagentPath = new Path(options.getWorkPath(), uagentf);
DistributedCache.addCacheFile(uagentPath.toUri(), job);
Path countryPath = new Path(options.getWorkPath(), countryf);
DistributedCache.addCacheFile(countryPath.toUri(), job);
Path searchkeyPath = new Path(options.getWorkPath(), searchkeyf);
DistributedCache.addCacheFile(searchkeyPath.toUri(), job);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(JoinBytesInt.class);
MultipleInputs.addInputPath(job, dummy.getPath(),
NLineInputFormat.class, DummyToAccessNoMapper.class);
if (options.isSequenceOut()) {
MultipleInputs.addInputPath(job, rankings,
SequenceFileInputFormat.class, SequenceRankingsToUrlsMapper.class);
} else {
MultipleInputs.addInputPath(job, rankings,
TextInputFormat.class, TextRankingsToUrlsMapper.class);
}
job.setCombinerClass(JoinBytesIntCombiner.class);
job.setReducerClass(CreateUserVisitsReducer.class);
if (options.getNumReds() > 0) {
job.setNumReduceTasks(options.getNumReds());
} else {
job.setNumReduceTasks(Utils.getMaxNumReds());
}
// job.setNumReduceTasks(options.slots/2);
if (options.isSequenceOut()) {
job.setOutputFormat(SequenceFileOutputFormat.class);
} else {
job.setOutputFormat(TextOutputFormat.class);
}
if (null != options.getCodecClass()) {
job.set("mapred.output.compression.type","BLOCK");
job.set("mapreduce.output.fileoutputformat.compress.type","BLOCK");
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, options.getCodecClass());
}
FileOutputFormat.setOutputPath(job, fout);
log.info("Running Job: " +jobname);
log.info("Dummy file " + dummy.getPath() + " as input");
log.info("Rankings file " + rankings + " as input");
log.info("Ouput file " + fout);
JobClient.runJob(job);
log.info("Finished Running Job: " + jobname);
}
public void generate() throws Exception {
log.info("Generating hive data files...");
init();
createRankingsTableDirectly();
createUserVisitsTableDirectly();
close();
}
public void loadFiles() throws IOException {
RawData.createSearchKeys(new Path(options.getWorkPath(), searchkeyf));
RawData.createUserAgents(new Path(options.getWorkPath(), uagentf));
RawData.createCCodes(new Path(options.getWorkPath(), countryf));
}
private void init() throws IOException {
log.info("Initializing hive date generator...");
Utils.checkHdfsPath(options.getResultPath(), true);
Utils.checkHdfsPath(options.getWorkPath(), true);
loadFiles();
Utils.serialLinkZipf(options);
dummy = new Dummy(options.getWorkPath(), options.getNumMaps());
}
public void close() throws IOException {
log.info("Closing hive data generator...");
Utils.checkHdfsPath(options.getWorkPath());
}
}
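Taken together, HiveData.generate() leaves two delimited tables under the result path: rankings, written by GenerateRankingsReducer as url, reference count and a random duration, and uservisits, whose rows are filled in by Visit.nextAccess() in CreateUserVisitsReducer. The row layouts below are a sketch assuming the standard HiBench web-analytics schema; the case class names and field names are illustrative assumptions, since the actual DDL lives in the generated .hive script.
// Assumed row layouts of the generated tables; field names follow the usual
// HiBench/AMPLab schema and are not taken from HiveData.java itself.
case class Ranking(pageURL: String, pageRank: Int, avgDuration: Int)

case class UserVisit(
  sourceIP: String, destURL: String, visitDate: String, adRevenue: Double,
  userAgent: String, countryCode: String, languageCode: String,
  searchWord: String, duration: Int)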