/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.mapreduce;

import static java.lang.String.format;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Base64;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableNotEnabledException;
import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.base.Splitter;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;

/**
* Tool to import data from a TSV file.
*
* This tool is rather simplistic - it doesn't do any quoting or
* escaping, but is useful for many data loads.
*
* @see ImportTsv#usage(String)
*/
@InterfaceAudience.Public
public class ImportTsv extends Configured implements Tool {

  protected static final Logger LOG = LoggerFactory.getLogger(ImportTsv.class);

  final static String NAME = "importtsv";

  public final static String MAPPER_CONF_KEY = "importtsv.mapper.class";
public final static String BULK_OUTPUT_CONF_KEY = "importtsv.bulk.output";
public final static String TIMESTAMP_CONF_KEY = "importtsv.timestamp";
public final static String JOB_NAME_CONF_KEY = "mapreduce.job.name";
// TODO: the rest of these configs are used exclusively by TsvImporterMapper.
// Move them out of the tool and let the mapper handle its own validation.
public final static String DRY_RUN_CONF_KEY = "importtsv.dry.run";
// If true, bad lines are logged to stderr. Default: false.
public final static String LOG_BAD_LINES_CONF_KEY = "importtsv.log.bad.lines";
public final static String SKIP_LINES_CONF_KEY = "importtsv.skip.bad.lines";
public final static String SKIP_EMPTY_COLUMNS = "importtsv.skip.empty.columns";
public final static String COLUMNS_CONF_KEY = "importtsv.columns";
public final static String SEPARATOR_CONF_KEY = "importtsv.separator";
public final static String ATTRIBUTE_SEPERATOR_CONF_KEY = "attributes.seperator";
  // This config is used to propagate credentials from parent MR jobs which launch
  // ImportTSV jobs. See IntegrationTestImportTsv.
public final static String CREDENTIALS_LOCATION = "credentials_location";
final static String DEFAULT_SEPARATOR = "\t";
final static String DEFAULT_ATTRIBUTES_SEPERATOR = "=>";
final static String DEFAULT_MULTIPLE_ATTRIBUTES_SEPERATOR = ",";
final static Class DEFAULT_MAPPER = TsvImporterMapper.class;
public final static String CREATE_TABLE_CONF_KEY = "create.table";
public final static String NO_STRICT_COL_FAMILY = "no.strict";
/**
* If table didn't exist and was created in dry-run mode, this flag is
* flipped to delete it when MR ends.
*/
  private static boolean DRY_RUN_TABLE_CREATED;

  public static class TsvParser {
/**
* Column families and qualifiers mapped to the TSV columns
*/
private final byte[][] families;
    private final byte[][] qualifiers;

    private final byte separatorByte;

    private int rowKeyColumnIndex;

    private int maxColumnCount;

    // Default value must be negative
    public static final int DEFAULT_TIMESTAMP_COLUMN_INDEX = -1;

    private int timestampKeyColumnIndex = DEFAULT_TIMESTAMP_COLUMN_INDEX;

    public static final String ROWKEY_COLUMN_SPEC = "HBASE_ROW_KEY";
    public static final String TIMESTAMPKEY_COLUMN_SPEC = "HBASE_TS_KEY";
    public static final String ATTRIBUTES_COLUMN_SPEC = "HBASE_ATTRIBUTES_KEY";
    public static final String CELL_VISIBILITY_COLUMN_SPEC = "HBASE_CELL_VISIBILITY";
    public static final String CELL_TTL_COLUMN_SPEC = "HBASE_CELL_TTL";

    public static final int DEFAULT_ATTRIBUTES_COLUMN_INDEX = -1;
    public static final int DEFAULT_CELL_VISIBILITY_COLUMN_INDEX = -1;
    public static final int DEFAULT_CELL_TTL_COLUMN_INDEX = -1;

    private int attrKeyColumnIndex = DEFAULT_ATTRIBUTES_COLUMN_INDEX;
    private int cellVisibilityColumnIndex = DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
    private int cellTTLColumnIndex = DEFAULT_CELL_TTL_COLUMN_INDEX;

    /**
     * @param columnsSpecification the list of columns to parse out, comma separated.
     *          The row key should be the special token TsvParser.ROWKEY_COLUMN_SPEC
     * @param separatorStr the single-byte separator string between columns, e.g. "\t"
     */
public TsvParser(String columnsSpecification, String separatorStr) {
// Configure separator
byte[] separator = Bytes.toBytes(separatorStr);
Preconditions.checkArgument(separator.length == 1,
"TsvParser only supports single-byte separators");
      separatorByte = separator[0];

      // Configure columns
ArrayList<String> columnStrings = Lists.newArrayList(
          Splitter.on(',').trimResults().split(columnsSpecification));

      maxColumnCount = columnStrings.size();
families = new byte[maxColumnCount][];
      qualifiers = new byte[maxColumnCount][];

      for (int i = 0; i < columnStrings.size(); i++) {
String str = columnStrings.get(i);
if (ROWKEY_COLUMN_SPEC.equals(str)) {
rowKeyColumnIndex = i;
continue;
}
if (TIMESTAMPKEY_COLUMN_SPEC.equals(str)) {
timestampKeyColumnIndex = i;
continue;
}
if (ATTRIBUTES_COLUMN_SPEC.equals(str)) {
attrKeyColumnIndex = i;
continue;
}
if (CELL_VISIBILITY_COLUMN_SPEC.equals(str)) {
cellVisibilityColumnIndex = i;
continue;
}
if (CELL_TTL_COLUMN_SPEC.equals(str)) {
cellTTLColumnIndex = i;
continue;
}
String[] parts = str.split(":", 2);
if (parts.length == 1) {
families[i] = str.getBytes();
qualifiers[i] = HConstants.EMPTY_BYTE_ARRAY;
} else {
families[i] = parts[0].getBytes();
qualifiers[i] = parts[1].getBytes();
}
}
    }

    public boolean hasTimestamp() {
return timestampKeyColumnIndex != DEFAULT_TIMESTAMP_COLUMN_INDEX;
    }

    public int getTimestampKeyColumnIndex() {
return timestampKeyColumnIndex;
    }

    public boolean hasAttributes() {
return attrKeyColumnIndex != DEFAULT_ATTRIBUTES_COLUMN_INDEX;
    }

    public boolean hasCellVisibility() {
return cellVisibilityColumnIndex != DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
    }

    public boolean hasCellTTL() {
      return cellTTLColumnIndex != DEFAULT_CELL_TTL_COLUMN_INDEX;
    }

    public int getAttributesKeyColumnIndex() {
return attrKeyColumnIndex;
    }

    public int getCellVisibilityColumnIndex() {
return cellVisibilityColumnIndex;
    }

    public int getCellTTLColumnIndex() {
return cellTTLColumnIndex;
    }

    public int getRowKeyColumnIndex() {
return rowKeyColumnIndex;
    }

    public byte[] getFamily(int idx) {
return families[idx];
}
public byte[] getQualifier(int idx) {
return qualifiers[idx];
    }

    public ParsedLine parse(byte[] lineBytes, int length)
throws BadTsvLineException {
// Enumerate separator offsets
ArrayList<Integer> tabOffsets = new ArrayList<>(maxColumnCount);
for (int i = 0; i < length; i++) {
if (lineBytes[i] == separatorByte) {
tabOffsets.add(i);
}
}
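      // If no separator was found the line is rejected outright; otherwise 'length'
      // is appended below as a virtual trailing separator so the last column gets
      // an end offset just like every other column.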
if (tabOffsets.isEmpty()) {
throw new BadTsvLineException("No delimiter");
      }

      tabOffsets.add(length);

      if (tabOffsets.size() > maxColumnCount) {
throw new BadTsvLineException("Excessive columns");
} else if (tabOffsets.size() <= getRowKeyColumnIndex()) {
throw new BadTsvLineException("No row key");
} else if (hasTimestamp()
&& tabOffsets.size() <= getTimestampKeyColumnIndex()) {
throw new BadTsvLineException("No timestamp");
} else if (hasAttributes() && tabOffsets.size() <= getAttributesKeyColumnIndex()) {
throw new BadTsvLineException("No attributes specified");
} else if (hasCellVisibility() && tabOffsets.size() <= getCellVisibilityColumnIndex()) {
throw new BadTsvLineException("No cell visibility specified");
} else if (hasCellTTL() && tabOffsets.size() <= getCellTTLColumnIndex()) {
throw new BadTsvLineException("No cell TTL specified");
}
return new ParsedLine(tabOffsets, lineBytes);
    }

    class ParsedLine {
private final ArrayList<Integer> tabOffsets;
      private byte[] lineBytes;

      ParsedLine(ArrayList<Integer> tabOffsets, byte[] lineBytes) {
this.tabOffsets = tabOffsets;
this.lineBytes = lineBytes;
      }

      public int getRowKeyOffset() {
return getColumnOffset(rowKeyColumnIndex);
}
public int getRowKeyLength() {
return getColumnLength(rowKeyColumnIndex);
      }

      public long getTimestamp(long ts) throws BadTsvLineException {
// Return ts if HBASE_TS_KEY is not configured in column spec
if (!hasTimestamp()) {
return ts;
        }

        String timeStampStr = Bytes.toString(lineBytes,
getColumnOffset(timestampKeyColumnIndex),
getColumnLength(timestampKeyColumnIndex));
try {
return Long.parseLong(timeStampStr);
} catch (NumberFormatException nfe) {
// treat this record as bad record
throw new BadTsvLineException("Invalid timestamp " + timeStampStr);
}
      }

      private String getAttributes() {
if (!hasAttributes()) {
return null;
} else {
return Bytes.toString(lineBytes, getColumnOffset(attrKeyColumnIndex),
getColumnLength(attrKeyColumnIndex));
}
      }

      public String[] getIndividualAttributes() {
String attributes = getAttributes();
if (attributes != null) {
return attributes.split(DEFAULT_MULTIPLE_ATTRIBUTES_SEPERATOR);
} else {
return null;
}
      }

      public int getAttributeKeyOffset() {
if (hasAttributes()) {
return getColumnOffset(attrKeyColumnIndex);
} else {
return DEFAULT_ATTRIBUTES_COLUMN_INDEX;
}
      }

      public int getAttributeKeyLength() {
if (hasAttributes()) {
return getColumnLength(attrKeyColumnIndex);
} else {
return DEFAULT_ATTRIBUTES_COLUMN_INDEX;
}
      }

      public int getCellVisibilityColumnOffset() {
if (hasCellVisibility()) {
return getColumnOffset(cellVisibilityColumnIndex);
} else {
return DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
}
      }

      public int getCellVisibilityColumnLength() {
if (hasCellVisibility()) {
return getColumnLength(cellVisibilityColumnIndex);
} else {
return DEFAULT_CELL_VISIBILITY_COLUMN_INDEX;
}
      }

      public String getCellVisibility() {
if (!hasCellVisibility()) {
return null;
} else {
return Bytes.toString(lineBytes, getColumnOffset(cellVisibilityColumnIndex),
getColumnLength(cellVisibilityColumnIndex));
}
      }

      public int getCellTTLColumnOffset() {
if (hasCellTTL()) {
return getColumnOffset(cellTTLColumnIndex);
} else {
return DEFAULT_CELL_TTL_COLUMN_INDEX;
}
      }

      public int getCellTTLColumnLength() {
if (hasCellTTL()) {
return getColumnLength(cellTTLColumnIndex);
} else {
return DEFAULT_CELL_TTL_COLUMN_INDEX;
}
      }

      public long getCellTTL() {
if (!hasCellTTL()) {
return 0;
} else {
return Bytes.toLong(lineBytes, getColumnOffset(cellTTLColumnIndex),
getColumnLength(cellTTLColumnIndex));
}
      }

      public int getColumnOffset(int idx) {
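        // Column idx starts one byte past the (idx - 1)-th separator; column 0 starts at offset 0.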
if (idx > 0)
return tabOffsets.get(idx - 1) + 1;
else
return 0;
}
public int getColumnLength(int idx) {
return tabOffsets.get(idx) - getColumnOffset(idx);
}
public int getColumnCount() {
return tabOffsets.size();
}
public byte[] getLineBytes() {
return lineBytes;
}
    }

    public static class BadTsvLineException extends Exception {
public BadTsvLineException(String err) {
super(err);
}
private static final long serialVersionUID = 1L;
    }

    /**
     * Return starting position and length of row key from the specified line bytes.
     * @param lineBytes the line to scan
     * @param length number of valid bytes in lineBytes
     * @return Pair of row key offset and length.
     * @throws BadTsvLineException if the row key is missing or empty
     */
public Pair<Integer, Integer> parseRowKey(byte[] lineBytes, int length)
throws BadTsvLineException {
int rkColumnIndex = 0;
int startPos = 0, endPos = 0;
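      // Walk the line byte by byte; each separator (or the end of the line) closes
      // a column, and scanning stops once the bounds of the row key column are known.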
for (int i = 0; i <= length; i++) {
if (i == length || lineBytes[i] == separatorByte) {
endPos = i - 1;
if (rkColumnIndex++ == getRowKeyColumnIndex()) {
if ((endPos + 1) == startPos) {
throw new BadTsvLineException("Empty value for ROW KEY.");
}
break;
} else {
startPos = endPos + 2;
}
}
if (i == length) {
throw new BadTsvLineException(
"Row key does not exist as number of columns in the line"
+ " are less than row key position.");
}
}
return new Pair<>(startPos, endPos - startPos + 1);
}
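
    // A usage sketch (hypothetical column spec and data) for this parser:
    //
    //   TsvParser parser = new TsvParser("HBASE_ROW_KEY,d:c1", "\t");
    //   byte[] line = Bytes.toBytes("row1\tvalue1");
    //   ParsedLine parsed = parser.parse(line, line.length);  // throws BadTsvLineException on malformed input
    //   String row = Bytes.toString(line, parsed.getRowKeyOffset(), parsed.getRowKeyLength());
    //   String value = Bytes.toString(line, parsed.getColumnOffset(1), parsed.getColumnLength(1));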
  }

  /**
* Sets up the actual job.
*
* @param conf The current configuration.
* @param args The command line parameters.
* @return The newly created job.
* @throws IOException When setting up the job fails.
*/
protected static Job createSubmittableJob(Configuration conf, String[] args)
throws IOException, ClassNotFoundException {
Job job = null;
boolean isDryRun = conf.getBoolean(DRY_RUN_CONF_KEY, false);
try (Connection connection = ConnectionFactory.createConnection(conf)) {
try (Admin admin = connection.getAdmin()) {
// Support non-XML supported characters
// by re-encoding the passed separator as a Base64 string.
String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
if (actualSeparator != null) {
conf.set(SEPARATOR_CONF_KEY,
Bytes.toString(Base64.getEncoder().encode(actualSeparator.getBytes())));
        }

        // See if a non-default Mapper was set
String mapperClassName = conf.get(MAPPER_CONF_KEY);
        Class mapperClass =
            mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

        TableName tableName = TableName.valueOf(args[0]);
Path inputDir = new Path(args[1]);
        String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName.getNameAsString());
job = Job.getInstance(conf, jobName);
job.setJarByClass(mapperClass);
FileInputFormat.setInputPaths(job, inputDir);
job.setInputFormatClass(TextInputFormat.class);
job.setMapperClass(mapperClass);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
String[] columns = conf.getStrings(COLUMNS_CONF_KEY);
if(StringUtils.isNotEmpty(conf.get(CREDENTIALS_LOCATION))) {
String fileLoc = conf.get(CREDENTIALS_LOCATION);
Credentials cred = Credentials.readTokenStorageFile(new File(fileLoc), conf);
job.getCredentials().addAll(cred);
        }

        if (hfileOutPath != null) {
if (!admin.tableExists(tableName)) {
LOG.warn(format("Table '%s' does not exist.", tableName));
if ("yes".equalsIgnoreCase(conf.get(CREATE_TABLE_CONF_KEY, "yes"))) {
// TODO: this is backwards. Instead of depending on the existence of a table,
// create a sane splits file for HFileOutputFormat based on data sampling.
createTable(admin, tableName, columns);
if (isDryRun) {
LOG.warn("Dry run: Table will be deleted at end of dry run.");
synchronized (ImportTsv.class) {
DRY_RUN_TABLE_CREATED = true;
}
}
} else {
String errorMsg =
format("Table '%s' does not exist and '%s' is set to no.", tableName,
CREATE_TABLE_CONF_KEY);
LOG.error(errorMsg);
throw new TableNotFoundException(errorMsg);
}
}
try (Table table = connection.getTable(tableName);
RegionLocator regionLocator = connection.getRegionLocator(tableName)) {
boolean noStrict = conf.getBoolean(NO_STRICT_COL_FAMILY, false);
// if no.strict is false then check column family
if(!noStrict) {
ArrayList<String> unmatchedFamilies = new ArrayList<>();
Set<String> cfSet = getColumnFamilies(columns);
TableDescriptor tDesc = table.getDescriptor();
for (String cf : cfSet) {
if(!tDesc.hasColumnFamily(Bytes.toBytes(cf))) {
unmatchedFamilies.add(cf);
}
}
if(unmatchedFamilies.size() > 0) {
ArrayList<String> familyNames = new ArrayList<>();
for (ColumnFamilyDescriptor family : table.getDescriptor().getColumnFamilies()) {
familyNames.add(family.getNameAsString());
}
String msg =
"Column Families " + unmatchedFamilies + " specified in " + COLUMNS_CONF_KEY
+ " does not match with any of the table " + tableName
+ " column families " + familyNames + ".\n"
+ "To disable column family check, use -D" + NO_STRICT_COL_FAMILY
+ "=true.\n";
usage(msg);
System.exit(-1);
}
}
if (mapperClass.equals(TsvImporterTextMapper.class)) {
job.setMapOutputValueClass(Text.class);
job.setReducerClass(TextSortReducer.class);
} else {
job.setMapOutputValueClass(Put.class);
job.setCombinerClass(PutCombiner.class);
job.setReducerClass(PutSortReducer.class);
}
if (!isDryRun) {
Path outputDir = new Path(hfileOutPath);
FileOutputFormat.setOutputPath(job, outputDir);
HFileOutputFormat2.configureIncrementalLoad(job, table.getDescriptor(),
regionLocator);
}
}
} else {
if (!admin.tableExists(tableName)) {
String errorMsg = format("Table '%s' does not exist.", tableName);
LOG.error(errorMsg);
throw new TableNotFoundException(errorMsg);
}
if (mapperClass.equals(TsvImporterTextMapper.class)) {
          usage(TsvImporterTextMapper.class.toString()
              + " should not be used for the non-bulkload case. Use "
              + TsvImporterMapper.class.toString()
              + " or a custom mapper whose value type is Put.");
System.exit(-1);
}
if (!isDryRun) {
// No reducers. Just write straight to table. Call initTableReducerJob
// to set up the TableOutputFormat.
TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
}
job.setNumReduceTasks(0);
}
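        // Dry-run mode swaps in NullOutputFormat below, so neither TableOutputFormat
        // nor HFileOutputFormat2 gets a chance to register the HBase serializations;
        // the Put/Result values crossing the shuffle still need them, hence the
        // explicit io.serializations entries.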
if (isDryRun) {
job.setOutputFormatClass(NullOutputFormat.class);
job.getConfiguration().setStrings("io.serializations",
job.getConfiguration().get("io.serializations"),
MutationSerialization.class.getName(), ResultSerialization.class.getName(),
CellSerialization.class.getName());
}
TableMapReduceUtil.addDependencyJars(job);
TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(),
org.apache.hbase.thirdparty.com.google.common.base.Function.class /* Guava used by TsvParser */);
}
}
return job;
  }

  private static void createTable(Admin admin, TableName tableName, String[] columns)
throws IOException {
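    // Note: HTableDescriptor and HColumnDescriptor are the older descriptor API;
    // newer code would typically use TableDescriptorBuilder and ColumnFamilyDescriptorBuilder.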
HTableDescriptor htd = new HTableDescriptor(tableName);
Set<String> cfSet = getColumnFamilies(columns);
for (String cf : cfSet) {
HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toBytes(cf));
htd.addFamily(hcd);
}
LOG.warn(format("Creating table '%s' with '%s' columns and default descriptors.",
tableName, cfSet));
admin.createTable(htd);
  }

  private static void deleteTable(Configuration conf, String[] args) {
TableName tableName = TableName.valueOf(args[0]);
try (Connection connection = ConnectionFactory.createConnection(conf);
Admin admin = connection.getAdmin()) {
try {
admin.disableTable(tableName);
} catch (TableNotEnabledException e) {
LOG.debug("Dry mode: Table: " + tableName + " already disabled, so just deleting it.");
}
admin.deleteTable(tableName);
} catch (IOException e) {
LOG.error(format("***Dry run: Failed to delete table '%s'.***%n%s", tableName,
e.toString()));
return;
}
LOG.info(format("Dry run: Deleted table '%s'.", tableName));
  }

  private static Set<String> getColumnFamilies(String[] columns) {
Set<String> cfSet = new HashSet<>();
for (String aColumn : columns) {
if (TsvParser.ROWKEY_COLUMN_SPEC.equals(aColumn)
|| TsvParser.TIMESTAMPKEY_COLUMN_SPEC.equals(aColumn)
|| TsvParser.CELL_VISIBILITY_COLUMN_SPEC.equals(aColumn)
|| TsvParser.CELL_TTL_COLUMN_SPEC.equals(aColumn)
|| TsvParser.ATTRIBUTES_COLUMN_SPEC.equals(aColumn))
continue;
// we are only concerned with the first one (in case this is a cf:cq)
cfSet.add(aColumn.split(":", 2)[0]);
}
return cfSet;
  }

  /*
* @param errorMsg Error message. Can be null.
*/
private static void usage(final String errorMsg) {
if (errorMsg != null && errorMsg.length() > 0) {
System.err.println("ERROR: " + errorMsg);
}
String usage =
"Usage: " + NAME + " -D"+ COLUMNS_CONF_KEY + "=a,b,c <tablename> <inputdir>\n" +
"\n" +
"Imports the given input directory of TSV data into the specified table.\n" +
"\n" +
"The column names of the TSV data must be specified using the -D" + COLUMNS_CONF_KEY + "\n" +
"option. This option takes the form of comma-separated column names, where each\n" +
"column name is either a simple column family, or a columnfamily:qualifier. The special\n" +
"column name " + TsvParser.ROWKEY_COLUMN_SPEC + " is used to designate that this column should be used\n" +
"as the row key for each imported record. You must specify exactly one column\n" +
"to be the row key, and you must specify a column name for every column that exists in the\n" +
"input data. Another special column" + TsvParser.TIMESTAMPKEY_COLUMN_SPEC +
" designates that this column should be\n" +
"used as timestamp for each record. Unlike " + TsvParser.ROWKEY_COLUMN_SPEC + ", " +
TsvParser.TIMESTAMPKEY_COLUMN_SPEC + " is optional." + "\n" +
"You must specify at most one column as timestamp key for each imported record.\n" +
"Record with invalid timestamps (blank, non-numeric) will be treated as bad record.\n" +
"Note: if you use this option, then '" + TIMESTAMP_CONF_KEY + "' option will be ignored.\n" +
"\n" +
"Other special columns that can be specified are " + TsvParser.CELL_TTL_COLUMN_SPEC +
" and " + TsvParser.CELL_VISIBILITY_COLUMN_SPEC + ".\n" +
TsvParser.CELL_TTL_COLUMN_SPEC + " designates that this column will be used " +
"as a Cell's Time To Live (TTL) attribute.\n" +
TsvParser.CELL_VISIBILITY_COLUMN_SPEC + " designates that this column contains the " +
"visibility label expression.\n" +
"\n" +
      TsvParser.ATTRIBUTES_COLUMN_SPEC + " can be used to specify Operation Attributes per record.\n" +
      " Should be specified as key=>value where '" + DEFAULT_ATTRIBUTES_SEPERATOR + "' is used\n" +
      " as the separator. Note that more than one OperationAttribute can be specified.\n" +
      "\n" +
"By default importtsv will load data directly into HBase. To instead generate\n" +
"HFiles of data to prepare for a bulk data load, pass the option:\n" +
" -D" + BULK_OUTPUT_CONF_KEY + "=/path/for/output\n" +
" Note: if you do not use this option, then the target table must already exist in HBase\n" +
"\n" +
"Other options that may be specified with -D include:\n" +
" -D" + DRY_RUN_CONF_KEY + "=true - Dry run mode. Data is not actually populated into" +
" table. If table does not exist, it is created but deleted in the end.\n" +
" -D" + SKIP_LINES_CONF_KEY + "=false - fail if encountering an invalid line\n" +
" -D" + LOG_BAD_LINES_CONF_KEY + "=true - logs invalid lines to stderr\n" +
" -D" + SKIP_EMPTY_COLUMNS + "=false - If true then skip empty columns in bulk import\n" +
" '-D" + SEPARATOR_CONF_KEY + "=|' - eg separate on pipes instead of tabs\n" +
" -D" + TIMESTAMP_CONF_KEY + "=currentTimeAsLong - use the specified timestamp for the import\n" +
" -D" + MAPPER_CONF_KEY + "=my.Mapper - A user-defined Mapper to use instead of " +
DEFAULT_MAPPER.getName() + "\n" +
" -D" + JOB_NAME_CONF_KEY + "=jobName - use the specified mapreduce job name for the import\n" +
" -D" + CREATE_TABLE_CONF_KEY + "=no - can be used to avoid creation of table by this tool\n" +
" Note: if you set this to 'no', then the target table must already exist in HBase\n" +
" -D" + NO_STRICT_COL_FAMILY + "=true - ignore column family check in hbase table. " +
"Default is false\n\n" +
"For performance consider the following options:\n" +
" -Dmapreduce.map.speculative=false\n" +
" -Dmapreduce.reduce.speculative=false"; System.err.println(usage);
  }

  @Override
public int run(String[] args) throws Exception {
if (args.length < 2) {
usage("Wrong number of arguments: " + args.length);
return -1;
    }

    // When MAPPER_CONF_KEY is null, the user wants to use the provided TsvImporterMapper, so
// perform validation on these additional args. When it's not null, user has provided their
    // own mapper, so these validations are not relevant.
// TODO: validation for TsvImporterMapper, not this tool. Move elsewhere.
if (null == getConf().get(MAPPER_CONF_KEY)) {
// Make sure columns are specified
String[] columns = getConf().getStrings(COLUMNS_CONF_KEY);
if (columns == null) {
usage("No columns specified. Please specify with -D" +
COLUMNS_CONF_KEY+"=...");
return -1;
      }

      // Make sure they specify exactly one column as the row key
int rowkeysFound = 0;
for (String col : columns) {
if (col.equals(TsvParser.ROWKEY_COLUMN_SPEC)) rowkeysFound++;
}
if (rowkeysFound != 1) {
usage("Must specify exactly one column as " + TsvParser.ROWKEY_COLUMN_SPEC);
return -1;
      }

      // Make sure we have at most one column as the timestamp key
int tskeysFound = 0;
for (String col : columns) {
if (col.equals(TsvParser.TIMESTAMPKEY_COLUMN_SPEC))
tskeysFound++;
}
if (tskeysFound > 1) {
usage("Must specify at most one column as "
+ TsvParser.TIMESTAMPKEY_COLUMN_SPEC);
return -1;
      }

      int attrKeysFound = 0;
for (String col : columns) {
if (col.equals(TsvParser.ATTRIBUTES_COLUMN_SPEC))
attrKeysFound++;
}
if (attrKeysFound > 1) {
usage("Must specify at most one column as "
+ TsvParser.ATTRIBUTES_COLUMN_SPEC);
return -1;
      }

      // Make sure one or more columns are specified excluding rowkey and
// timestamp key
if (columns.length - (rowkeysFound + tskeysFound + attrKeysFound) < 1) {
usage("One or more columns in addition to the row key and timestamp(optional) are required");
return -1;
}
    }

    // If timestamp option is not specified, use current system time.
    long timestamp = getConf().getLong(TIMESTAMP_CONF_KEY, System.currentTimeMillis());

    // Set it back to replace invalid timestamp (non-numeric) with current system time
    getConf().setLong(TIMESTAMP_CONF_KEY, timestamp);

    synchronized (ImportTsv.class) {
DRY_RUN_TABLE_CREATED = false;
}
Job job = createSubmittableJob(getConf(), args);
boolean success = job.waitForCompletion(true);
boolean delete = false;
synchronized (ImportTsv.class) {
delete = DRY_RUN_TABLE_CREATED;
}
if (delete) {
deleteTable(getConf(), args);
}
return success ? 0 : 1;
  }

  public static void main(String[] args) throws Exception {
int status = ToolRunner.run(HBaseConfiguration.create(), new ImportTsv(), args);
System.exit(status);
}
}
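
The tool is normally launched from the command line, but since ImportTsv implements Tool it can also be driven from Java. A minimal sketch, assuming a table "test_table" with column family "d" and TSV input under "/tmp/tsv-input" (all three names are hypothetical):

    Configuration conf = HBaseConfiguration.create();
    conf.set(ImportTsv.COLUMNS_CONF_KEY, "HBASE_ROW_KEY,d:c1,d:c2");
    // Optional: write HFiles for a later bulk load instead of Put-ing directly.
    // conf.set(ImportTsv.BULK_OUTPUT_CONF_KEY, "/tmp/tsv-hfiles");
    int exitCode = ToolRunner.run(conf, new ImportTsv(),
        new String[] { "test_table", "/tmp/tsv-input" });
    System.exit(exitCode);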
