Injector(org.apache.nutch.crawl.Injector):

  • 输入:种子列表文件所在的目录
  • 输出:crawldb(保存URL以及其相应信息的数据库)
  • 作用:把种子URL注入到crawldb

package org.apache.nutch.crawl;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

import org.apache.nutch.net.*;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

/**
* This class takes a flat file of URLs and adds them to the of pages to be
* crawled. Useful for bootstrapping the system. The URL files contain one URL
* per line, optionally followed by custom metadata separated by tabs with the
* metadata key separated from the corresponding value by '='. <br>
* Note that some metadata keys are reserved : <br>
* - <i>nutch.score</i> : allows to set a custom score for a specific URL <br>
* - <i>nutch.fetchInterval</i> : allows to set a custom fetch interval for a
* specific URL <br>
* - <i>nutch.fetchInterval.fixed</i> : allows to set a custom fetch interval
* for a specific URL that is not changed by AdaptiveFetchSchedule <br>
* e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000
* \t userType=open_source
**/
public class Injector extends Configured implements Tool {
public static final Logger LOG = LoggerFactory.getLogger(Injector.class);

/** metadata key reserved for setting a custom score for a specific URL */
public static String nutchScoreMDName = "nutch.score";
/**
* metadata key reserved for setting a custom fetchInterval for a specific URL
*/
public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
/**
* metadata key reserved for setting a fixed custom fetchInterval for a
* specific URL
*/
public static String nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed";

/** Normalize and filter injected urls. */
public static class InjectMapper implements
Mapper<WritableComparable<?>, Text, Text, CrawlDatum> {
private URLNormalizers urlNormalizers;
private int interval;
private float scoreInjected;
private JobConf jobConf;
private URLFilters filters;
private ScoringFilters scfilters;
private long curTime;

public void configure(JobConf job) {
this.jobConf = job;
urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
interval = jobConf.getInt("db.fetch.interval.default", 2592000);
filters = new URLFilters(jobConf);
scfilters = new ScoringFilters(jobConf);
scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
curTime = job
.getLong("injector.current.time", System.currentTimeMillis());
}

public void close() {
}

public void map(WritableComparable<?> key, Text value,
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
String url = value.toString().trim(); // value is line of text

if (url != null && (url.length() == 0 || url.startsWith("#"))) {
/* Ignore line that start with # */
return;
}

// if tabs : metadata that could be stored
// must be name=value and separated by \t
float customScore = -1f;
int customInterval = interval;
int fixedInterval = -1;
Map<String, String> metadata = new TreeMap<String, String>();
if (url.indexOf("\t") != -1) {
String[] splits = url.split("\t");
url = splits[0];
for (int s = 1; s < splits.length; s++) {
// find separation between name and value
int indexEquals = splits[s].indexOf("=");
if (indexEquals == -1) {
// skip anything without a =
continue;
}
String metaname = splits[s].substring(0, indexEquals);
String metavalue = splits[s].substring(indexEquals + 1);
if (metaname.equals(nutchScoreMDName)) {
try {
customScore = Float.parseFloat(metavalue);
} catch (NumberFormatException nfe) {
}
} else if (metaname.equals(nutchFetchIntervalMDName)) {
try {
customInterval = Integer.parseInt(metavalue);
} catch (NumberFormatException nfe) {
}
} else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
try {
fixedInterval = Integer.parseInt(metavalue);
} catch (NumberFormatException nfe) {
}
} else
metadata.put(metaname, metavalue);
}
}
try {
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
url = filters.filter(url); // filter the url
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Skipping " + url + ":" + e);
}
url = null;
}
if (url == null) {
reporter.getCounter("injector", "urls_filtered").increment(1);
} else { // if it passes
value.set(url); // collect it
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_INJECTED);

// Is interval custom? Then set as meta data
if (fixedInterval > -1) {
// Set writable using float. Flaot is used by
// AdaptiveFetchSchedule
datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY,
new FloatWritable(fixedInterval));
datum.setFetchInterval(fixedInterval);
} else {
datum.setFetchInterval(customInterval);
}

datum.setFetchTime(curTime);
// now add the metadata
Iterator<String> keysIter = metadata.keySet().iterator();
while (keysIter.hasNext()) {
String keymd = keysIter.next();
String valuemd = metadata.get(keymd);
datum.getMetaData().put(new Text(keymd), new Text(valuemd));
}
if (customScore != -1)
datum.setScore(customScore);
else
datum.setScore(scoreInjected);
try {
scfilters.injectedScore(value, datum);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Cannot filter injected score for url " + url
+ ", using default (" + e.getMessage() + ")");
}
}
reporter.getCounter("injector", "urls_injected").increment(1);
output.collect(value, datum);
}
}
}

/** Combine multiple new entries for a url. */
public static class InjectReducer implements
Reducer<Text, CrawlDatum, Text, CrawlDatum> {
private int interval;
private float scoreInjected;
private boolean overwrite = false;
private boolean update = false;

public void configure(JobConf job) {
interval = job.getInt("db.fetch.interval.default", 2592000);
scoreInjected = job.getFloat("db.score.injected", 1.0f);
overwrite = job.getBoolean("db.injector.overwrite", false);
update = job.getBoolean("db.injector.update", false);
LOG.info("Injector: overwrite: " + overwrite);
LOG.info("Injector: update: " + update);
}

public void close() {
}

private CrawlDatum old = new CrawlDatum();
private CrawlDatum injected = new CrawlDatum();

public void reduce(Text key, Iterator<CrawlDatum> values,
OutputCollector<Text, CrawlDatum> output, Reporter reporter)
throws IOException {
boolean oldSet = false;
boolean injectedSet = false;
while (values.hasNext()) {
CrawlDatum val = values.next();
if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
injected.set(val);
injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
injectedSet = true;
} else {
old.set(val);
oldSet = true;
}

}

CrawlDatum res = null;

// Old default behaviour
if (injectedSet && !oldSet) {
res = injected;
} else {
res = old;
}
if (injectedSet && oldSet) {
reporter.getCounter("injector", "urls_merged").increment(1);
}
/**
* Whether to overwrite, ignore or update existing records
*
* @see https://issues.apache.org/jira/browse/NUTCH-1405
*/
// Injected record already exists and update but not overwrite
if (injectedSet && oldSet && update && !overwrite) {
res = old;
old.putAllMetaData(injected);
old.setScore(injected.getScore() != scoreInjected ? injected.getScore()
: old.getScore());
old.setFetchInterval(injected.getFetchInterval() != interval ? injected
.getFetchInterval() : old.getFetchInterval());
}

// Injected record already exists and overwrite
if (injectedSet && oldSet && overwrite) {
res = injected;
}

output.collect(key, res);
}
}

public Injector() {
}

public Injector(Configuration conf) {
setConf(conf);
}

public void inject(Path crawlDb, Path urlDir) throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
if (LOG.isInfoEnabled()) {
LOG.info("Injector: starting at " + sdf.format(start));
LOG.info("Injector: crawlDb: " + crawlDb);
LOG.info("Injector: urlDir: " + urlDir);
}

Path tempDir = new Path(getConf().get("mapred.temp.dir", ".")
+ "/inject-temp-"
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

// map text input file to a <url,CrawlDatum> file
if (LOG.isInfoEnabled()) {
LOG.info("Injector: Converting injected urls to crawl db entries.");
}

FileSystem fs = FileSystem.get(getConf());
// determine if the crawldb already exists
boolean dbExists = fs.exists(crawlDb);

JobConf sortJob = new NutchJob(getConf());
sortJob.setJobName("inject " + urlDir);
FileInputFormat.addInputPath(sortJob, urlDir);
sortJob.setMapperClass(InjectMapper.class);

FileOutputFormat.setOutputPath(sortJob, tempDir);
if (dbExists) {
// Don't run merge injected urls, wait for merge with
// existing DB
sortJob.setOutputFormat(SequenceFileOutputFormat.class);
sortJob.setNumReduceTasks(0);
} else {
sortJob.setOutputFormat(MapFileOutputFormat.class);
sortJob.setReducerClass(InjectReducer.class);
sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs",
false);
}
sortJob.setOutputKeyClass(Text.class);
sortJob.setOutputValueClass(CrawlDatum.class);
sortJob.setLong("injector.current.time", System.currentTimeMillis());

RunningJob mapJob = null;
try {
mapJob = JobClient.runJob(sortJob);
} catch (IOException e) {
fs.delete(tempDir, true);
throw e;
}
long urlsInjected = mapJob.getCounters()
.findCounter("injector", "urls_injected").getValue();
long urlsFiltered = mapJob.getCounters()
.findCounter("injector", "urls_filtered").getValue();
LOG.info("Injector: Total number of urls rejected by filters: "
+ urlsFiltered);
LOG.info("Injector: Total number of urls after normalization: "
+ urlsInjected);
long urlsMerged = 0;
if (dbExists) {
// merge with existing crawl db
if (LOG.isInfoEnabled()) {
LOG.info("Injector: Merging injected urls into crawl db.");
}
JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
FileInputFormat.addInputPath(mergeJob, tempDir);
mergeJob.setReducerClass(InjectReducer.class);
try {
RunningJob merge = JobClient.runJob(mergeJob);
urlsMerged = merge.getCounters().findCounter("injector", "urls_merged")
.getValue();
LOG.info("Injector: URLs merged: " + urlsMerged);
} catch (IOException e) {
fs.delete(tempDir, true);
throw e;
}
CrawlDb.install(mergeJob, crawlDb);
} else {
CrawlDb.install(sortJob, crawlDb);
}

// clean up
fs.delete(tempDir, true);
LOG.info("Injector: Total new urls injected: "
+ (urlsInjected - urlsMerged));
long end = System.currentTimeMillis();
LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
+ TimingUtil.elapsedTime(start, end));
}

public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
System.exit(res);
}

public int run(String[] args) throws Exception {
if (args.length < 2) {
System.err.println("Usage: Injector <crawldb> <url_dir>");
return -1;
}
try {
inject(new Path(args[0]), new Path(args[1]));
return 0;
} catch (Exception e) {
LOG.error("Injector: " + StringUtils.stringifyException(e));
return -1;
}
}

}

Nutch主要类代码分析之一(Injector)的更多相关文章

  1. cocos2d-x v3.2 FlappyBird 各个类对象详细代码分析(6)

    今天我们要讲三个类,这三个类应该算比較简单的 HelpLayer类 NumberLayer类 GetLocalScore类 HelpLayer类,主要放了两个图形精灵上去,一个是游戏的名字,一个是提示 ...

  2. 完整全面的Java资源库(包括构建、操作、代码分析、编译器、数据库、社区等等)

    构建 这里搜集了用来构建应用程序的工具. Apache Maven:Maven使用声明进行构建并进行依赖管理,偏向于使用约定而不是配置进行构建.Maven优于Apache Ant.后者采用了一种过程化 ...

  3. Android代码分析工具lint学习

    1 lint简介 1.1 概述 lint是随Android SDK自带的一个静态代码分析工具.它用来对Android工程的源文件进行检查,找出在正确性.安全.性能.可使用性.可访问性及国际化等方面可能 ...

  4. MapReduce剖析笔记之八: Map输出数据的处理类MapOutputBuffer分析

    在上一节我们分析了Child子进程启动,处理Map.Reduce任务的主要过程,但对于一些细节没有分析,这一节主要对MapOutputBuffer这个关键类进行分析. MapOutputBuffer顾 ...

  5. pmd静态代码分析

    在正式进入测试之前,进行一定的静态代码分析及code review对代码质量及系统提高是有帮助的,以上为数据证明 Pmd 它是一个基于静态规则集的Java源码分析器,它可以识别出潜在的如下问题:– 可 ...

  6. [Asp.net 5] DependencyInjection项目代码分析4-微软的实现(5)(IEnumerable<>补充)

    Asp.net 5的依赖注入注入系列可以参考链接: [Asp.net 5] DependencyInjection项目代码分析-目录 我们在之前讲微软的实现时,对于OpenIEnumerableSer ...

  7. 常用 Java 静态代码分析工具的分析与比较

    常用 Java 静态代码分析工具的分析与比较 简介: 本文首先介绍了静态代码分析的基 本概念及主要技术,随后分别介绍了现有 4 种主流 Java 静态代码分析工具 (Checkstyle,FindBu ...

  8. angular代码分析之异常日志设计

    angular代码分析之异常日志设计 错误异常是面向对象开发中的记录提示程序执行问题的一种重要机制,在程序执行发生问题的条件下,异常会在中断程序执行,同时会沿着代码的执行路径一步一步的向上抛出异常,最 ...

  9. [Asp.net 5] DependencyInjection项目代码分析4-微软的实现(2)

    在 DependencyInjection项目代码分析4-微软的实现(1)中介绍了“ServiceTable”.“ServiceEntry”.“IGenericService”.“IService”. ...

随机推荐

  1. OC基础--结构体 枚举做类成员属性

    结构体  枚举作类的成员属性: 定义一个学生类 性别 -- 枚举 生日 入学日期  毕业日期  --  结构体 代码示例: 声明文件 Student.h: #import <Foundation ...

  2. 1、jvm的体系结构

    jvm包括两子系统两组件 a.两子系统:Class Loader子系统,Execution engine子系统 b.两组件:Runtime Date Area 和 Native Interface

  3. 尝试利用CentOS环境安装LiteSpeed WEB服务环境的过程

    对于普通的网站搭建的环境虚拟主机就足够使用,不过近期公司的网站需要上线VPS主机,于是采用到LNMP(http://lnmp.org/)一键包安装的,运行还是比较好的,这不最近我也开始尝试接触VPS方 ...

  4. Android内存优化之 LruCache与DiskLruCache

    在日常的Adroid开发中我们经常遇到需要处理大量图片的地方,但Android手机的内存有限该怎么避免手机 内存溢出导致app程序oom,google提供了两种解决方式 LruCache LruCac ...

  5. Windows 8.1 with Update MSDN 简体/英文/繁体

    PS:请不要使用离线下载,以免镜像损坏! 1.CN: 文件名:cn_windows_8.1_enterprise_with_update_x64_dvd_4048578.isoSHA1:2D9BFE9 ...

  6. C#如何获取物理网卡,虚拟网卡,以及无线网卡

    就不废话了,直接上代码 /// <summary></summary> /// 显示本机各网卡的详细信息 /// <summary></summary> ...

  7. TypeScript 中的 "=>" 真的很好用!!!

    class funs { public $scope: IBarPadScope; constructor($scope: IBarPadScope) { this.$scope = $scope; ...

  8. grep查询文本:问一个简单shell问题,将grep的输出赋值给一个变量

    问一个简单shell问题,将grep的输出赋值给一个变量 用grep命令得到的输出赋值给一个变量不成功. grep命令如下: 代码: $ grep -c '^abc' file.txt 输出为22,表 ...

  9. uboot(二): Uboot-arm-start.s分析

    声明:该贴是通过参考其他人的帖子整理出来,从中我加深了对uboot的理解,我知道对其他人一定也是有很大的帮助,不敢私藏,如果里面的注释有什么错误请给我回复,我再加以修改.有些部分可能还没解释清楚,如果 ...

  10. 深入理解JavaScript系列:为什么03-0.2不等于0.1

    五一宅家看书,所以接着更新一篇文章. 今天讲一下为什么03-0.2不等于0.1这个问题. 有点标题党的味道,在JavaScript中,当你试着对小数进行加减运算时,有时候会发现某个结果并非我们所想的那 ...