diff --git a/pom.xml b/pom.xml
index cc0ebc5..c6fa7ac 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,410 +1,410 @@
4.0.0
org.warcbase
warcbase
jar
0.1.0-SNAPSHOT
warcbase
WARC + HBase
http://warcbase.org/
The Apache Software License, Version 2.0
http://www.apache.org/licenses/LICENSE-2.0.txt
repo
scm:git:git@github.com:lintool/warcbase.git
scm:git:git@github.com:lintool/warcbase.git
git@github.com:lintool/warcbase.git
lintool
Jimmy Lin
jimmylin@umd.edu
milad621
Milad Gholami
mgholami@cs.umd.edu
jeffyRao
Jinfeng Rao
jinfeng@cs.umd.edu
org.sonatype.oss
oss-parent
7
UTF-8
UTF-8
8.1.12.v20130726
2.0.0-cdh4.4.0
0.94.6-cdh4.4.0
3.4.5-cdh4.4.0
1.0.0
org.apache.maven.plugins
maven-shade-plugin
2.1
package
shade
*:*
META-INF/*.SF
META-INF/*.DSA
META-INF/*.RSA
true
fatjar
org.apache.hadoop:*
org.codehaus.mojo
appassembler-maven-plugin
1.3.1
-Xms512M -Xmx1524M
org.warcbase.WarcbaseAdmin
WarcbaseAdmin
org.warcbase.data.UriMappingBuilder
UriMappingBuilder
- org.warcbase.data.UriMapping
+ org.warcbase.data.UrlMapping
- UriMapping
+ UrlMapping
org.warcbase.data.ExtractLinks
ExtractLinks
org.warcbase.data.ExtractSiteLinks
ExtractSiteLinks
org.warcbase.ingest.IngestFiles
IngestFiles
org.warcbase.ingest.SearchForUri
SearchForUri
org.warcbase.browser.WarcBrowser
WarcBrowser
org.warcbase.analysis.CountRowTypes
CountRowTypes
org.warcbase.analysis.DetectDuplicates
DetectDuplicates
org.warcbase.analysis.PrintAllUris
PrintAllUris
org.warcbase.analysis.ExtractText
ExtractText
internetarchive
Internet Archive Maven Repository
http://builds.archive.org:8080/maven2
cloudera
https://repository.cloudera.com/artifactory/cloudera-repos/
maven
http://repo.maven.apache.org/maven2/
junit
junit
4.11
test
commons-codec
commons-codec
1.8
org.jsoup
jsoup
1.7.3
org.apache.lucene
lucene-core
4.5.1
com.google.guava
guava
14.0.1
tl.lin
lintools-datatypes
0.9.2
org.apache.hbase
hbase
${hbase.version}
slf4j-api
slf4j-api
org.slf4j
slf4j-api
org.slf4j
slf4j-api-1.4.3
jsp-api
jsp-api
org.mortbay.jetty
jsp-api
org.mortbay.jetty
jsp-api-2.1
org.mortbay.jetty
servlet-api-2.5
org.mortbay.jetty
servlet-api
servlet-api
servlet-api
org.mortbay.jetty
jetty
org.mortbay.jetty
jetty-util
org.mortbay.jetty
jsp-2.1
org.apache.hadoop
hadoop-common
${hadoop.version2}
javax.servlet
servlet-api
org.mortbay.jetty
jetty
org.mortbay.jetty
jetty-util
org.mortbay.jetty
jsp-2.1
org.apache.hadoop
hadoop-client
${hadoop.version2}
javax.servlet
servlet-api
org.mortbay.jetty
jetty
org.mortbay.jetty
jetty-util
org.mortbay.jetty
jsp-2.1
org.apache.zookeeper
zookeeper
${zookeeper.version}
org.netpreserve.openwayback
openwayback-core
2.0.0.BETA.2
org.apache.hadoop
hadoop-core
ch.qos.logback
logback-classic
org.eclipse.jetty
jetty-server
${jettyVersion}
org.eclipse.jetty
jetty-webapp
${jettyVersion}
true
org.slf4j
slf4j-log4j12
1.6.4
org.jwat
jwat-common
${jwat.version}
org.jwat
jwat-gzip
${jwat.version}
org.jwat
jwat-arc
${jwat.version}
org.jwat
jwat-warc
${jwat.version}
org.apache.commons
commons-lang3
3.0
net.sf.opencsv
opencsv
2.3
org.netpreserve.commons
webarchive-commons
1.1.3
diff --git a/src/main/java/org/warcbase/analysis/graph/ExtractLinks.java b/src/main/java/org/warcbase/analysis/graph/ExtractLinks.java
index 83cca9d..e4641b7 100644
--- a/src/main/java/org/warcbase/analysis/graph/ExtractLinks.java
+++ b/src/main/java/org/warcbase/analysis/graph/ExtractLinks.java
@@ -1,394 +1,394 @@
package org.warcbase.analysis.graph;
import it.unimi.dsi.fastutil.ints.IntAVLTreeSet;
import java.io.IOException;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.NavigableMap;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jwat.arc.ArcRecordBase;
import org.warcbase.analysis.graph.PrefixMapping.PrefixNode;
-import org.warcbase.data.UriMapping;
+import org.warcbase.data.UrlMapping;
import org.warcbase.mapreduce.ArcInputFormat;
import com.google.common.base.Joiner;
/**
* Program for extracting links from ARC files.
*/
public class ExtractLinks extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(ExtractLinks.class);
private static enum Records {
TOTAL, LINK_COUNT
};
public static class ExtractLinksHDFSMapper extends
Mapper {
private static final Joiner JOINER = Joiner.on(",");
public static final IntWritable KEY = new IntWritable();
private static final Text VALUE = new Text();
private static final DateFormat df = new SimpleDateFormat("yyyyMMdd");
- private static UriMapping fst;
+ private static UrlMapping fst;
private static String beginDate, endDate;
@Override
public void setup(Context context) {
try {
Configuration conf = context.getConfiguration();
beginDate = conf.get("beginDate");
endDate = conf.get("endDate");
// There appears to be a bug in getCacheFiles() which returns null,
// even though getLocalCacheFiles is deprecated...
@SuppressWarnings("deprecation")
Path[] localFiles = context.getLocalCacheFiles();
LOG.info("cache contents: " + Arrays.toString(localFiles));
System.out.println("cache contents: " + Arrays.toString(localFiles));
- // load FST UriMapping from file
+ // load FST UrlMapping from file
- fst = (UriMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
+ fst = (UrlMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
fst.loadMapping(localFiles[0].toString());
// simply assume only one file in distributed cache.
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error Initializing UriMapping");
}
}
@Override
public void map(LongWritable key, ArcRecordBase record, Context context)
throws IOException, InterruptedException {
context.getCounter(Records.TOTAL).increment(1);
String url = record.getUrlStr();
String type = record.getContentTypeStr();
Date date = record.getArchiveDate();
if (date == null) {
return;
}
String time = df.format(date);
InputStream content = record.getPayloadContent();
if (beginDate != null && endDate != null) {
if (time.compareTo(beginDate) < 0 || time.compareTo(endDate) > 0) {
return;
}
} else if (beginDate == null && endDate != null) {
if (time.compareTo(endDate) > 0) {
return;
}
} else if (beginDate != null && endDate == null) {
if (time.compareTo(beginDate) < 0) {
return;
}
}
if (!type.equals("text/html")) {
return;
}
Document doc = Jsoup.parse(content, "ISO-8859-1", url); // parse in ISO-8859-1 format
Elements links = doc.select("a[href]"); // empty if none match
if (fst.getID(url) != -1) { // the url is already indexed in UriMapping
KEY.set(fst.getID(url));
IntAVLTreeSet linkUrlSet = new IntAVLTreeSet();
if (links != null) {
for (Element link : links) {
String linkUrl = link.attr("abs:href");
if (fst.getID(linkUrl) != -1) { // link already exists
linkUrlSet.add(fst.getID(linkUrl));
}
}
if (linkUrlSet.size() == 0) {
// Emit empty entry even if there aren't any outgoing links
VALUE.set("");
context.write(KEY, VALUE);
return;
}
VALUE.set(JOINER.join(linkUrlSet));
context.getCounter(Records.LINK_COUNT).increment(linkUrlSet.size());
context.write(KEY, VALUE);
}
}
}
}
public static class ExtractLinksHBaseMapper extends TableMapper{
public static final byte[] COLUMN_FAMILY = Bytes.toBytes("links");
private static final Joiner JOINER = Joiner.on(",");
public static final IntWritable KEY = new IntWritable();
private static final Text VALUE = new Text();
private static final DateFormat df = new SimpleDateFormat("yyyyMMdd");
- private static UriMapping fst;
+ private static UrlMapping fst;
@Override
public void setup(Context context) {
try {
Configuration conf = context.getConfiguration();
@SuppressWarnings("deprecation")
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
- // load FST UriMapping from file
+ // load FST UrlMapping from file
- fst = (UriMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
+ fst = (UrlMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
fst.loadMapping(localFiles[0].toString());
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error Initializing UriMapping");
}
}
@Override
public void map(ImmutableBytesWritable row, Result result, Context context)
throws IOException, InterruptedException {
context.getCounter(Records.TOTAL).increment(1);
int sourceFstId = fst.getID(new String(row.get()));
// rowkey(url) is not indexed in FST
if ( sourceFstId == -1) {
return;
}
KEY.set(sourceFstId);
IntAVLTreeSet linkUrlSet = new IntAVLTreeSet();
// Assume HBase Table Format
// Row : sourceUrl
// Column Family : links
// Column Qualifier: targetUrl
// Value: 1 (1 denotes the existence of this link)
NavigableMap familyMap = result.getFamilyMap(COLUMN_FAMILY);
for(byte[] column: familyMap.keySet()){
//byte[] value = familyMap.get(column);
int targetFstId = fst.getID(new String(column));
if (targetFstId != -1){
linkUrlSet.add(targetFstId);
}
}
if (linkUrlSet.size() == 0) {
// Emit empty entry even if there aren't any outgoing links
VALUE.set("");
context.write(KEY, VALUE);
return;
}
VALUE.set(JOINER.join(linkUrlSet));
context.getCounter(Records.LINK_COUNT).increment(linkUrlSet.size());
context.write(KEY, VALUE);
}
}
/**
* Creates an instance of this tool.
*/
public ExtractLinks() {}
private static final String HDFS = "hdfs";
private static final String HBASE = "hbase";
private static final String OUTPUT = "output";
private static final String URI_MAPPING = "uriMapping";
private static final String BEGIN = "begin";
private static final String END = "end";
private static String beginDate = null, endDate = null;
/**
* Runs this tool.
*/
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("HDFS input path").create(HDFS));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("HBASE table name").create(HBASE));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("output path").create(OUTPUT));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("uri mapping file path").create(URI_MAPPING));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("begin date (optional)").create(BEGIN));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("end date (optional)").create(END));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
if ( (!cmdline.hasOption(HDFS) && !cmdline.hasOption(HBASE)) // No HDFS and HBase input
|| !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(URI_MAPPING)) {
System.out.println("args: " + Arrays.toString(args));
HelpFormatter formatter = new HelpFormatter();
formatter.setWidth(120);
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
FileSystem fs = FileSystem.get(getConf());
String HDFSPath = null, HBaseTableName = null;
boolean isHDFSInput = true; // set default as HDFS input
if (cmdline.hasOption(HDFS)) {
HDFSPath = cmdline.getOptionValue(HDFS);
} else {
HBaseTableName = cmdline.getOptionValue(HBASE);
isHDFSInput = false;
}
String outputPath = cmdline.getOptionValue(OUTPUT);
Path mappingPath = new Path(cmdline.getOptionValue(URI_MAPPING));
LOG.info("Tool: " + ExtractLinks.class.getSimpleName());
if (isHDFSInput) {
LOG.info(" - HDFS input path: " + HDFSPath);
} else {
LOG.info(" - HBase table name: " + HBaseTableName);
}
LOG.info(" - output path: " + outputPath);
LOG.info(" - mapping file path: " + mappingPath);
if (cmdline.hasOption(BEGIN)) {
beginDate = cmdline.getOptionValue(BEGIN);
LOG.info(" - begin date: " + beginDate);
}
if (cmdline.hasOption(END)) {
endDate = cmdline.getOptionValue(END);
LOG.info(" - end date: " + endDate);
}
if (!fs.exists(mappingPath)) {
throw new Exception("mappingPath doesn't exist: " + mappingPath);
}
Configuration conf;
if (isHDFSInput) {
conf = getConf();
// passing global variable values to individual nodes
if(beginDate != null) {
conf.set("beginDate", beginDate);
}
if(endDate != null) {
conf.set("endDate", endDate);
}
} else {
conf = HBaseConfiguration.create(getConf());
conf.set("hbase.zookeeper.quorum", "bespinrm.umiacs.umd.edu");
}
Job job = Job.getInstance(conf, ExtractLinks.class.getSimpleName());
job.setJarByClass(ExtractLinks.class);
- job.getConfiguration().set("UriMappingClass", UriMapping.class.getCanonicalName());
+ job.getConfiguration().set("UriMappingClass", UrlMapping.class.getCanonicalName());
// Put the mapping file in the distributed cache so each map worker will have it.
job.addCacheFile(mappingPath.toUri());
job.setNumReduceTasks(0); // no reducers
if (isHDFSInput) { // HDFS input
FileInputFormat.setInputPaths(job, new Path(HDFSPath));
job.setInputFormatClass(ArcInputFormat.class);
// set map (key,value) output format
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(ExtractLinksHDFSMapper.class);
} else { // HBase input
Scan scan = new Scan();
// Very conservative settings because a single row might not fit in memory
// if we have many captured version of a URL.
scan.setCaching(1); // Controls the number of rows to pre-fetch
scan.setBatch(10); // Controls the number of columns to fetch on a per row basis
scan.setCacheBlocks(false); // Don't set to true for MR jobs
scan.setMaxVersions(); // We want all versions
TableMapReduceUtil.initTableMapperJob(
HBaseTableName, // input HBase table name
scan, // Scan instance to control CF and attribute selection
ExtractLinksHBaseMapper.class, // mapper
IntWritable.class, // mapper output key
Text.class, // mapper output value
job);
job.setOutputFormatClass(TextOutputFormat.class); // set output format
}
FileOutputFormat.setOutputPath(job, new Path(outputPath));
// Delete the output directory if it exists already.
Path outputDir = new Path(outputPath);
fs.delete(outputDir, true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
Counters counters = job.getCounters();
int numRecords = (int) counters.findCounter(Records.TOTAL).getValue();
int numLinks = (int) counters.findCounter(Records.LINK_COUNT).getValue();
LOG.info("Read " + numRecords + " records.");
LOG.info("Extracts " + numLinks + " links.");
return 0;
}
/**
* Dispatches command-line arguments to the tool via the {@code ToolRunner}.
*/
public static void main(String[] args) throws Exception {
ToolRunner.run(new ExtractLinks(), args);
}
}
\ No newline at end of file
diff --git a/src/main/java/org/warcbase/analysis/graph/ExtractSiteLinks.java b/src/main/java/org/warcbase/analysis/graph/ExtractSiteLinks.java
index 784567e..34c251d 100644
--- a/src/main/java/org/warcbase/analysis/graph/ExtractSiteLinks.java
+++ b/src/main/java/org/warcbase/analysis/graph/ExtractSiteLinks.java
@@ -1,455 +1,455 @@
package org.warcbase.analysis.graph;
import it.unimi.dsi.fastutil.ints.Int2IntAVLTreeMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jwat.arc.ArcRecordBase;
import org.warcbase.analysis.graph.PrefixMapping.PrefixNode;
-import org.warcbase.data.UriMapping;
+import org.warcbase.data.UrlMapping;
import org.warcbase.mapreduce.ArcInputFormat;
import com.google.common.base.Joiner;
public class ExtractSiteLinks extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(ExtractSiteLinks.class);
private static enum Records {
TOTAL, LINK_COUNT
};
// HDFS ExtractSiteLinks Mapper
public static class ExtractSiteLinksHDFSMapper extends
Mapper {
private static final DateFormat df = new SimpleDateFormat("yyyyMMdd");
private static String beginDate, endDate;
private static final IntWritable KEY = new IntWritable();
private static final IntWritable VALUE = new IntWritable();
- private static UriMapping fst;
+ private static UrlMapping fst;
private static PrefixMapping prefixMap;
private static ArrayList prefix;
@Override
public void setup(Context context) {
try {
Configuration conf = context.getConfiguration();
beginDate = conf.get("beginDate");
endDate = conf.get("endDate");
@SuppressWarnings("deprecation")
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
- // load FST UriMapping from file
+ // load FST UrlMapping from file
- fst = (UriMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
+ fst = (UrlMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
fst.loadMapping(localFiles[0].toString());
// load Prefix Mapping from file
prefixMap = (PrefixMapping) Class.forName(conf.get("PrefixMappingClass")).newInstance();
prefix = prefixMap.loadPrefix(localFiles[1].toString(), fst);
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error Initializing UriMapping");
}
}
@Override
public void map(LongWritable key, ArcRecordBase record, Context context) throws IOException,
InterruptedException {
context.getCounter(Records.TOTAL).increment(1);
String url = record.getUrlStr();
String type = record.getContentTypeStr();
Date date = record.getArchiveDate();
if (date == null) {
return;
}
String time = df.format(date);
InputStream content = record.getPayloadContent();
if (beginDate != null && endDate != null) {
if (time.compareTo(beginDate) < 0 || time.compareTo(endDate) > 0) {
return;
}
} else if (beginDate == null && endDate != null) {
if (time.compareTo(endDate) > 0) {
return;
}
} else if (beginDate != null && endDate == null) {
if (time.compareTo(beginDate) < 0) {
return;
}
}
if (!type.equals("text/html")) {
return;
}
Document doc = Jsoup.parse(content, "ISO-8859-1", url); // parse in ISO-8859-1 format
Elements links = doc.select("a[href]"); // empty if none match
if (links == null) {
return;
}
int sourcePrefixId = prefixMap.getPrefixId(fst.getID(url), prefix);
// this url is indexed in FST and its prefix is appeared in prefix map (thus declared in
// prefix file)
if (fst.getID(url) != -1 && sourcePrefixId != -1) {
KEY.set(sourcePrefixId);
List linkUrlList = new ArrayList();
for (Element link : links) {
String linkUrl = link.attr("abs:href");
int targetPrefixId = prefixMap.getPrefixId(fst.getID(linkUrl), prefix);
// target url is indexed in FST and its prefix url is found
if (fst.getID(linkUrl) != -1 && targetPrefixId != -1) {
linkUrlList.add(targetPrefixId);
}
}
for (Integer linkID : linkUrlList) {
VALUE.set(linkID);
context.write(KEY, VALUE);
}
}
} // end map function
} // End of HDFS ExtractSiteLinks Mapper
// HBase ExtractSiteLinks Mapper
public static class ExtractSiteLinksHBaseMapper extends TableMapper {
public static final byte[] COLUMN_FAMILY = Bytes.toBytes("links");
private static final DateFormat df = new SimpleDateFormat("yyyyMMdd");
private static String beginDate, endDate;
private static final IntWritable KEY = new IntWritable();
private static final IntWritable VALUE = new IntWritable();
- private static UriMapping fst;
+ private static UrlMapping fst;
private static PrefixMapping prefixMap;
private static ArrayList prefix;
@Override
public void setup(Context context) {
try {
Configuration conf = context.getConfiguration();
@SuppressWarnings("deprecation")
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
- // load FST UriMapping from file
+ // load FST UrlMapping from file
- fst = (UriMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
+ fst = (UrlMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
fst.loadMapping(localFiles[0].toString());
// load Prefix Mapping from file
prefixMap = (PrefixMapping) Class.forName(conf.get("PrefixMappingClass")).newInstance();
prefix = prefixMap.loadPrefix(localFiles[1].toString(), fst);
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error Initializing UriMapping");
}
}
@Override
public void map(ImmutableBytesWritable row, Result result, Context context) throws IOException,
InterruptedException {
context.getCounter(Records.TOTAL).increment(1);
int sourceFstId = fst.getID(new String(row.get()));
// rowkey(url) is not indexed in FST
if (sourceFstId == -1) {
return;
}
int sourcePrefixId = prefixMap.getPrefixId(sourceFstId, prefix);
// this url is indexed in FST and its prefix is appeared in prefix map
// (thus declared in prefix file)
if (sourcePrefixId != -1) {
KEY.set(sourcePrefixId);
List linkUrlList = new ArrayList();
// Assume HBase Table Format
// Row : sourceUrl
// Column Family : links
// Column Qualifier: targetUrl
// Value: 1 (1 denotes the existence of this link)
NavigableMap familyMap = result.getFamilyMap(COLUMN_FAMILY);
for (byte[] column : familyMap.keySet()) {
// byte[] value = familyMap.get(column);
int targetFstId = fst.getID(new String(column));
if (targetFstId != -1) {
int targetPrefixId = prefixMap.getPrefixId(targetFstId, prefix);
if (targetPrefixId != -1) {
linkUrlList.add(targetPrefixId);
}
}
}
for (Integer linkID : linkUrlList) {
VALUE.set(linkID);
context.write(KEY, VALUE);
}
}
}
} // End of HBase ExtractSiteLinks Mapper
private static class ExtractSiteLinksReducer extends
Reducer {
@Override
public void reduce(IntWritable key, Iterable values, Context context)
throws IOException, InterruptedException {
Int2IntAVLTreeMap links = new Int2IntAVLTreeMap();
// remove duplicate links
for (IntWritable value : values) {
if (links.containsKey(value.get())) {
// increment 1 link count
links.put(value.get(), links.get(value.get()) + 1);
} else {
links.put(value.get(), 1);
}
}
context.getCounter(Records.LINK_COUNT).increment(links.entrySet().size());
for (Entry link : links.entrySet()) {
String outputValue = String.valueOf(link.getKey()) + "," + String.valueOf(link.getValue());
context.write(key, new Text(outputValue));
}
}
}
/**
* Creates an instance of this tool.
*/
public ExtractSiteLinks() {
}
private static final String HDFS = "hdfs";
private static final String HBASE = "hbase";
private static final String OUTPUT = "output";
private static final String URI_MAPPING = "uriMapping";
private static final String PREFIX_FILE = "prefixFile";
private static final String NUM_REDUCERS = "numReducers";
private static final String BEGIN = "begin";
private static final String END = "end";
private static String beginDate = null, endDate = null;
/**
* Runs this tool.
*/
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS input path")
.create(HDFS));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("HBASE table name").create(HBASE));
options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path")
.create(OUTPUT));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("uri mapping file path").create(URI_MAPPING));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("prefix mapping file path").create(PREFIX_FILE));
options.addOption(OptionBuilder.withArgName("num").hasArg()
.withDescription("number of reducers").create(NUM_REDUCERS));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("begin date (optional)").create(BEGIN));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("end date (optional)").create(END));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
if ((!cmdline.hasOption(HDFS) && !cmdline.hasOption(HBASE)) // No HDFS and HBase input
|| !cmdline.hasOption(OUTPUT)
|| !cmdline.hasOption(URI_MAPPING)
|| !cmdline.hasOption(PREFIX_FILE)) {
System.out.println("args: " + Arrays.toString(args));
HelpFormatter formatter = new HelpFormatter();
formatter.setWidth(120);
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
FileSystem fs = FileSystem.get(getConf());
String HDFSPath = null, HBaseTableName = null;
boolean isHDFSInput = true; // set default as HDFS input
if (cmdline.hasOption(HDFS)) {
HDFSPath = cmdline.getOptionValue(HDFS);
} else {
HBaseTableName = cmdline.getOptionValue(HBASE);
isHDFSInput = false;
}
String outputPath = cmdline.getOptionValue(OUTPUT);
Path mappingPath = new Path(cmdline.getOptionValue(URI_MAPPING));
Path prefixFilePath = new Path(cmdline.getOptionValue(PREFIX_FILE));
int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline
.getOptionValue(NUM_REDUCERS)) : 1;
LOG.info("Tool: " + ExtractSiteLinks.class.getSimpleName());
if (isHDFSInput) {
LOG.info(" - HDFS input path: " + HDFSPath);
} else {
LOG.info(" - HBase table name: " + HBaseTableName);
}
LOG.info(" - output path: " + outputPath);
LOG.info(" - mapping file path:" + mappingPath);
LOG.info(" - prefix file path:" + prefixFilePath);
LOG.info(" - number of reducers: " + reduceTasks);
if (cmdline.hasOption(BEGIN)) {
beginDate = cmdline.getOptionValue(BEGIN);
LOG.info(" - begin date: " + beginDate);
}
if (cmdline.hasOption(END)) {
endDate = cmdline.getOptionValue(END);
LOG.info(" - end date: " + endDate);
}
if (!fs.exists(mappingPath)) {
throw new Exception("mappingPath doesn't exist: " + mappingPath);
}
if (!fs.exists(prefixFilePath)) {
throw new Exception("prefixFilePath doesn't exist: " + prefixFilePath);
}
Configuration conf;
if (isHDFSInput) {
conf = getConf();
// passing global variable values to individual nodes
if(beginDate != null) {
conf.set("beginDate", beginDate);
}
if(endDate != null) {
conf.set("endDate", endDate);
}
} else {
conf = HBaseConfiguration.create(getConf());
conf.set("hbase.zookeeper.quorum", "bespinrm.umiacs.umd.edu");
}
Job job = Job.getInstance(conf, ExtractSiteLinks.class.getSimpleName());
job.setJarByClass(ExtractSiteLinks.class);
- job.getConfiguration().set("UriMappingClass", UriMapping.class.getCanonicalName());
+ job.getConfiguration().set("UriMappingClass", UrlMapping.class.getCanonicalName());
job.getConfiguration().set("PrefixMappingClass", PrefixMapping.class.getCanonicalName());
// Put the mapping file and prefix file in the distributed cache
// so each map worker will have it.
job.addCacheFile(mappingPath.toUri());
job.addCacheFile(prefixFilePath.toUri());
- job.setNumReduceTasks(reduceTasks); // no reducers
+ job.setNumReduceTasks(reduceTasks); // configurable via -numReducers; defaults to 1
if (isHDFSInput) { // HDFS input
FileInputFormat.setInputPaths(job, new Path(HDFSPath));
job.setInputFormatClass(ArcInputFormat.class);
// set map (key,value) output format
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setMapperClass(ExtractSiteLinksHDFSMapper.class);
} else { // HBase input
Scan scan = new Scan();
// Very conservative settings because a single row might not fit in memory
// if we have many captured version of a URL.
scan.setCaching(1); // Controls the number of rows to pre-fetch
scan.setBatch(10); // Controls the number of columns to fetch on a per row basis
scan.setCacheBlocks(false); // Don't set to true for MR jobs
scan.setMaxVersions(); // We want all versions
TableMapReduceUtil.initTableMapperJob(HBaseTableName, // input HBase table name
scan, // Scan instance to control CF and attribute selection
ExtractSiteLinksHBaseMapper.class, // mapper
IntWritable.class, // mapper output key
IntWritable.class, // mapper output value
job);
job.setOutputFormatClass(TextOutputFormat.class); // set output format
}
FileOutputFormat.setOutputPath(job, new Path(outputPath));
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
job.setReducerClass(ExtractSiteLinksReducer.class);
// Delete the output directory if it exists already.
Path outputDir = new Path(outputPath);
FileSystem.get(job.getConfiguration()).delete(outputDir, true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
Counters counters = job.getCounters();
int numRecords = (int) counters.findCounter(Records.TOTAL).getValue();
int numLinks = (int) counters.findCounter(Records.LINK_COUNT).getValue();
LOG.info("Read " + numRecords + " records.");
LOG.info("Extracts " + numLinks + " links.");
return 0;
}
/**
* Dispatches command-line arguments to the tool via the {@code ToolRunner}.
*/
public static void main(String[] args) throws Exception {
ToolRunner.run(new ExtractSiteLinks(), args);
}
}
\ No newline at end of file
diff --git a/src/main/java/org/warcbase/analysis/graph/InvertAnchorText.java b/src/main/java/org/warcbase/analysis/graph/InvertAnchorText.java
index d262999..fb740c1 100644
--- a/src/main/java/org/warcbase/analysis/graph/InvertAnchorText.java
+++ b/src/main/java/org/warcbase/analysis/graph/InvertAnchorText.java
@@ -1,389 +1,389 @@
package org.warcbase.analysis.graph;
import it.unimi.dsi.fastutil.ints.Int2ObjectMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.jwat.arc.ArcRecordBase;
-import org.warcbase.data.UriMapping;
+import org.warcbase.data.UrlMapping;
import org.warcbase.data.UrlUtil;
import org.warcbase.mapreduce.ArcInputFormat;
import com.google.common.collect.Lists;
/**
* Program for extracting links from ARC files or HBase.
*/
public class InvertAnchorText extends Configured implements Tool {
private static final Logger LOG = Logger.getLogger(InvertAnchorText.class);
private static enum MyCounters {
RECORDS, HTML_PAGES, LINKS
};
- private static Int2ObjectMap<List<String>> extractLinks(InputStream content, String url, UriMapping fst)
+ private static Int2ObjectMap<List<String>> extractLinks(InputStream content, String url, UrlMapping fst)
throws IOException {
Document doc = Jsoup.parse(content, "ISO-8859-1", url); // parse in ISO-8859-1 format
Elements links = doc.select("a[href]");
// Note that if there are outgoing links to the same destination page, we retain all copies
// (and their anchor texts). This behavior is explicitly different from that of ExtractLinks,
// which de-duplicates outgoing links to the same destination.
Int2ObjectMap<List<String>> anchors = new Int2ObjectOpenHashMap<List<String>>();
if (links != null) {
for (Element link : links) {
String linkUrl = link.attr("abs:href");
int id = fst.getID(linkUrl);
if (id != -1) {
if (anchors.containsKey(id)) {
anchors.get(id).add(link.text());
} else {
anchors.put(id, Lists.newArrayList(link.text()));
}
}
}
}
return anchors;
}
public static class InvertAnchorTextHdfsMapper extends
Mapper<LongWritable, ArcRecordBase, IntWritable, Text> {
private final DateFormat df = new SimpleDateFormat("yyyyMMdd");
private final IntWritable key = new IntWritable();
private final Text value = new Text();
- private UriMapping fst;
+ private UrlMapping fst;
@Override
public void setup(Context context) {
try {
Configuration conf = context.getConfiguration();
// There appears to be a bug in getCacheFiles() which returns null,
// even though getLocalCacheFiles is deprecated...
@SuppressWarnings("deprecation")
Path[] localFiles = context.getLocalCacheFiles();
LOG.info("cache contents: " + Arrays.toString(localFiles));
System.out.println("cache contents: " + Arrays.toString(localFiles));
// load FST UriMapping from file
- fst = (UriMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
+ fst = (UrlMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
fst.loadMapping(localFiles[0].toString());
// simply assume only one file in distributed cache.
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error Initializing UriMapping");
}
}
@Override
public void map(LongWritable k, ArcRecordBase record, Context context)
throws IOException, InterruptedException {
context.getCounter(MyCounters.RECORDS).increment(1);
String url = record.getUrlStr();
String type = record.getContentTypeStr();
Date date = record.getArchiveDate();
String time = df.format(date);
InputStream content = record.getPayloadContent();
if (beginDate != null && endDate != null) {
if (time.compareTo(beginDate) < 0 || time.compareTo(endDate) > 0) {
return;
}
} else if (beginDate == null && endDate != null) {
if (time.compareTo(endDate) > 0) {
return;
}
} else if (beginDate != null && endDate == null) {
if (time.compareTo(beginDate) < 0) {
return;
}
}
int srcId = fst.getID(url);
if (!type.equals("text/html") || srcId == -1) {
return;
}
context.getCounter(MyCounters.HTML_PAGES).increment(1);
Int2ObjectMap> anchors = InvertAnchorText.extractLinks(content, url, fst);
for (Int2ObjectMap.Entry> entry : anchors.int2ObjectEntrySet()) {
key.set(entry.getIntKey());
for (String s : entry.getValue()) {
value.set(srcId + "\t" + s);
context.write(key, value);
}
context.getCounter(MyCounters.LINKS).increment(entry.getValue().size());
}
}
}
public static class InvertAnchorTextHBaseMapper extends TableMapper<IntWritable, Text>{
private final IntWritable key = new IntWritable();
private final Text value = new Text();
- private UriMapping fst;
+ private UrlMapping fst;
@Override
public void setup(Context context) {
try {
Configuration conf = context.getConfiguration();
@SuppressWarnings("deprecation")
Path[] localFiles = DistributedCache.getLocalCacheFiles(conf);
// load FST UriMapping from file
- fst = (UriMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
+ fst = (UrlMapping) Class.forName(conf.get("UriMappingClass")).newInstance();
fst.loadMapping(localFiles[0].toString());
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("Error Initializing UriMapping");
}
}
@Override
public void map(ImmutableBytesWritable row, Result result, Context context)
throws IOException, InterruptedException {
String url = UrlUtil.keyToUrl(new String(row.get()));
int srcId = fst.getID(url);
if ( srcId == -1) {
return;
}
for (KeyValue kv : result.list()) {
String type = new String(kv.getQualifier());
context.getCounter(MyCounters.RECORDS).increment(1);
if (!type.equals("text/html")) {
continue;
}
context.getCounter(MyCounters.HTML_PAGES).increment(1);
InputStream content = new ByteArrayInputStream(kv.getValue());
Int2ObjectMap> anchors = InvertAnchorText.extractLinks(content, url, fst);
for (Int2ObjectMap.Entry> entry : anchors.int2ObjectEntrySet()) {
key.set(entry.getIntKey());
for (String s : entry.getValue()) {
value.set(srcId + "\t" + s);
context.write(key, value);
}
context.getCounter(MyCounters.LINKS).increment(entry.getValue().size());
}
}
}
}
/**
* Creates an instance of this tool.
*/
public InvertAnchorText() {}
private static final String HDFS = "hdfs";
private static final String HBASE = "hbase";
private static final String OUTPUT = "output";
private static final String URI_MAPPING = "uriMapping";
private static final String BEGIN = "begin";
private static final String END = "end";
private static final String NUM_REDUCERS = "numReducers";
private static String beginDate = null, endDate = null;
/**
* Runs this tool.
*/
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("HDFS input path").create(HDFS));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("HBASE table name").create(HBASE));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("output path").create(OUTPUT));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("uri mapping file path").create(URI_MAPPING));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("begin date (optional)").create(BEGIN));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("end date (optional)").create(END));
options.addOption(OptionBuilder.withArgName("num").hasArg()
.withDescription("number of reducers").create(NUM_REDUCERS));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
if ( (!cmdline.hasOption(HDFS) && !cmdline.hasOption(HBASE)) // No HDFS and HBase input
|| !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(URI_MAPPING)) {
System.out.println("args: " + Arrays.toString(args));
HelpFormatter formatter = new HelpFormatter();
formatter.setWidth(120);
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
FileSystem fs = FileSystem.get(getConf());
String path = null, table = null;
boolean isHdfs;
if (cmdline.hasOption(HDFS)) {
path = cmdline.getOptionValue(HDFS);
isHdfs = true;
} else {
table = cmdline.getOptionValue(HBASE);
isHdfs = false;
}
String outputPath = cmdline.getOptionValue(OUTPUT);
Path mappingPath = new Path(cmdline.getOptionValue(URI_MAPPING));
LOG.info("Tool: " + InvertAnchorText.class.getSimpleName());
if (isHdfs) {
LOG.info(" - HDFS input path: " + path);
} else {
LOG.info(" - HBase table name: " + table);
}
LOG.info(" - output path: " + outputPath);
LOG.info(" - mapping file path: " + mappingPath);
if (cmdline.hasOption(BEGIN)) {
beginDate = cmdline.getOptionValue(BEGIN);
LOG.info(" - begin date: " + beginDate);
}
if (cmdline.hasOption(END)) {
endDate = cmdline.getOptionValue(END);
LOG.info(" - end date: " + endDate);
}
if (!fs.exists(mappingPath)) {
throw new Exception("mappingPath doesn't exist: " + mappingPath);
}
Configuration conf;
if (isHdfs) {
conf = getConf();
} else {
conf = HBaseConfiguration.create(getConf());
conf.set("hbase.zookeeper.quorum", "bespinrm.umiacs.umd.edu");
}
Job job = Job.getInstance(conf, InvertAnchorText.class.getSimpleName() +
(isHdfs ? ":HDFS:" + path : ":HBase:" + table));
job.setJarByClass(InvertAnchorText.class);
- job.getConfiguration().set("UriMappingClass", UriMapping.class.getCanonicalName());
+ job.getConfiguration().set("UriMappingClass", UrlMapping.class.getCanonicalName());
// Put the mapping file in the distributed cache so each map worker will have it.
job.addCacheFile(mappingPath.toUri());
int numReducers = cmdline.hasOption(NUM_REDUCERS) ?
Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 100;
job.setNumReduceTasks(numReducers);
if (isHdfs) { // HDFS input
FileInputFormat.setInputPaths(job, new Path(path));
job.setInputFormatClass(ArcInputFormat.class);
// set map (key,value) output format
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(InvertAnchorTextHdfsMapper.class);
} else { // HBase input
Scan scan = new Scan();
scan.addFamily("c".getBytes());
// Very conservative settings because a single row might not fit in memory
// if we have many captured version of a URL.
scan.setCaching(1); // Controls the number of rows to pre-fetch
scan.setBatch(10); // Controls the number of columns to fetch on a per row basis
scan.setCacheBlocks(false); // Don't set to true for MR jobs
scan.setMaxVersions(); // We want all versions
TableMapReduceUtil.initTableMapperJob(
table, // input HBase table name
scan, // Scan instance to control CF and attribute selection
InvertAnchorTextHBaseMapper.class, // mapper
IntWritable.class, // mapper output key
Text.class, // mapper output value
job);
job.setOutputFormatClass(TextOutputFormat.class); // set output format
}
FileOutputFormat.setOutputPath(job, new Path(outputPath));
// Delete the output directory if it exists already.
Path outputDir = new Path(outputPath);
fs.delete(outputDir, true);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
Counters counters = job.getCounters();
int numRecords = (int) counters.findCounter(MyCounters.RECORDS).getValue();
int numLinks = (int) counters.findCounter(MyCounters.LINKS).getValue();
LOG.info("Read " + numRecords + " records.");
LOG.info("Extracts " + numLinks + " links.");
return 0;
}
/**
* Dispatches command-line arguments to the tool via the {@code ToolRunner}.
*/
public static void main(String[] args) throws Exception {
ToolRunner.run(new InvertAnchorText(), args);
}
}
\ No newline at end of file
diff --git a/src/main/java/org/warcbase/analysis/graph/PrefixMapping.java b/src/main/java/org/warcbase/analysis/graph/PrefixMapping.java
index 7d8de31..902dab8 100644
--- a/src/main/java/org/warcbase/analysis/graph/PrefixMapping.java
+++ b/src/main/java/org/warcbase/analysis/graph/PrefixMapping.java
@@ -1,93 +1,93 @@
package org.warcbase.analysis.graph;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
-import org.warcbase.data.UriMapping;
+import org.warcbase.data.UrlMapping;
import au.com.bytecode.opencsv.CSVReader;
public class PrefixMapping {
public class PrefixNode {
int id;
String url;
int startPos;
int endPos;
public PrefixNode(int id, String url, int startPos, int endPos) {
this.id = id;
this.url = url;
this.startPos = startPos;
this.endPos = endPos;
}
public int getId() {
return id;
}
public String getUrl() {
return url;
}
public int getStartPos() {
return startPos;
}
public int getEndPos() {
return endPos;
}
}
- public static ArrayList<PrefixNode> loadPrefix(String prefixFile, UriMapping map)
+ public static ArrayList<PrefixNode> loadPrefix(String prefixFile, UrlMapping map)
throws IOException {
PrefixMapping instance = new PrefixMapping();
final Comparator<PrefixNode> comparator = new Comparator<PrefixNode>() {
@Override
public int compare(PrefixNode n1, PrefixNode n2) {
if (n1.startPos > n2.startPos) {
return 1;
} else if (n1.startPos == n2.startPos) {
return 0;
} else {
return -1;
}
}
};
ArrayList<PrefixNode> prefixes = new ArrayList<PrefixNode>();
CSVReader reader = new CSVReader(new FileReader(prefixFile), ',');
reader.readNext();
String line;
String[] record = null;
while ((record = reader.readNext()) != null) {
int id = Integer.valueOf(record[0]);
String url = record[1];
List<String> results = map.prefixSearch(url);
int[] boundary = map.getIdRange(results.get(0), results.get(results.size() - 1));
PrefixNode node = instance.new PrefixNode(id, url, boundary[0], boundary[1]);
prefixes.add(node);
}
Collections.sort(prefixes, comparator);
reader.close();
return prefixes;
}
public int getPrefixId(int id, ArrayList prefixes) {
int start = 0, end = prefixes.size() - 1;
int mid;
while (start <= end) {
mid = (start + end) / 2;
if (prefixes.get(mid).getStartPos() <= id && prefixes.get(mid).getEndPos() >= id) {
return prefixes.get(mid).getId();
} else if (prefixes.get(mid).getStartPos() > id) {
end = mid - 1;
} else {
start = mid + 1;
}
}
return -1;
}
}
\ No newline at end of file
diff --git a/src/main/java/org/warcbase/data/UriMapping.java b/src/main/java/org/warcbase/data/UrlMapping.java
similarity index 95%
rename from src/main/java/org/warcbase/data/UriMapping.java
rename to src/main/java/org/warcbase/data/UrlMapping.java
index c90ceff..c29721e 100644
--- a/src/main/java/org/warcbase/data/UriMapping.java
+++ b/src/main/java/org/warcbase/data/UrlMapping.java
@@ -1,237 +1,237 @@
package org.warcbase.data;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.log4j.Logger;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.warcbase.ingest.IngestFiles;
-public class UriMapping {
- private static final Logger LOG = Logger.getLogger(UriMapping.class);
+public class UrlMapping {
+ private static final Logger LOG = Logger.getLogger(UrlMapping.class);
private FST<Long> fst;
- public UriMapping(FST fst) {
+ public UrlMapping(FST fst) {
this.fst = fst;
}
- public UriMapping() {
+ public UrlMapping() {
}
- public UriMapping(String outputFileName) {
+ public UrlMapping(String outputFileName) {
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
File outputFile = new File(outputFileName);
try {
this.fst = FST.read(outputFile, outputs);
} catch (IOException e) {
LOG.error("Build FST Failed!");
e.printStackTrace();
}
}
public void loadMapping(String outputFileName) {
- UriMapping tmp = new UriMapping(outputFileName);
+ UrlMapping tmp = new UrlMapping(outputFileName);
this.fst = tmp.fst;
}
public FST getFst() {
return fst;
}
public int getID(String url) {
Long id = null;
try {
id = Util.get(fst, new BytesRef(url));
} catch (IOException e) {
// Log error, but assume that URL doesn't exist.
LOG.error("Error fetching " + url);
e.printStackTrace();
return -1;
}
return id == null ? -1 : id.intValue();
}
public String getUrl(int id) {
BytesRef scratchBytes = new BytesRef();
IntsRef key = null;
try {
key = Util.getByOutput(fst, id);
} catch (IOException e) {
LOG.error("Error id " + id);
e.printStackTrace();
return null;
}
if (key == null) {
return null;
}
return Util.toBytesRef(key, scratchBytes).utf8ToString();
}
public List<String> prefixSearch(String prefix) {
if (prefix == null || prefix.length() == 0 ) {
return new ArrayList<String>();
}
List strResults = null;
try {
// descend to the arc of the prefix string
Arc arc = fst.getFirstArc(new Arc());
BytesReader fstReader = fst.getBytesReader();
BytesRef bref = new BytesRef(prefix);
for (int i = 0; i < bref.length; i++) {
Arc retArc = fst.findTargetArc(bref.bytes[i + bref.offset] & 0xFF, arc, arc, fstReader);
if (retArc == null) { // no matched prefix
return new ArrayList();
}
}
// collect all substrings started from the arc of prefix string.
List result = new ArrayList();
BytesRef newPrefixBref = new BytesRef(prefix.substring(0, prefix.length() - 1));
collect(result, fstReader, newPrefixBref, arc);
// convert BytesRef results to String results
strResults = new ArrayList();
Iterator iter = result.iterator();
while (iter.hasNext()) {
strResults.add(iter.next().utf8ToString());
}
} catch (IOException e) {
LOG.error("Error: " + e);
e.printStackTrace();
return new ArrayList();
}
return strResults;
}
public int[] getIdRange(String first, String last){
if (first == null || last == null) {
return null;
}
Long startId = null, endId = null;
try {
startId = Util.get(fst, new BytesRef(first));
endId = Util.get(fst, new BytesRef(last));
if (startId == null || endId == null) {
return null;
}
} catch (IOException e) {
LOG.error("Error: " + e);
e.printStackTrace();
return null;
}
return new int[] { (int) startId.longValue(), (int) endId.longValue() };
}
private boolean collect(List res, BytesReader fstReader,
BytesRef output, Arc arc) throws IOException {
if (output.length == output.bytes.length) {
output.bytes = ArrayUtil.grow(output.bytes);
}
assert output.offset == 0;
output.bytes[output.length++] = (byte) arc.label;
fst.readFirstTargetArc(arc, arc, fstReader);
while (true) {
if (arc.label == FST.END_LABEL) {
res.add(BytesRef.deepCopyOf(output));
} else {
int save = output.length;
if (collect(res, fstReader, output, new Arc().copyFrom(arc))) {
return true;
}
output.length = save;
}
if (arc.isLast()) {
break;
}
fst.readNextArc(arc, fstReader);
}
return false;
}
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
final String DATA = "data";
final String ID = "getId";
final String URL = "getUrl";
final String PREFIX = "getPrefix";
Options options = new Options();
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("FST data file").create(DATA));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("get id").create(ID));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("get url").create(URL));
options.addOption(OptionBuilder.withArgName("path").hasArg()
.withDescription("get prefix").create(PREFIX));
CommandLine cmdline = null;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: "
+ exp.getMessage());
System.exit(-1);
}
if (!cmdline.hasOption(DATA) || (!cmdline.hasOption(ID)
&& !cmdline.hasOption(URL) && !cmdline.hasOption(PREFIX))) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp(IngestFiles.class.getCanonicalName(), options);
System.exit(-1);
}
String filePath = cmdline.getOptionValue(DATA);
- UriMapping map = new UriMapping(filePath);
+ UrlMapping map = new UrlMapping(filePath);
map.loadMapping(filePath);
if (cmdline.hasOption(ID)) {
String url = cmdline.getOptionValue(ID);
System.out.println(map.getID(url));
}
if (cmdline.hasOption(URL)) {
int id = Integer.parseInt(cmdline.getOptionValue(URL));
System.out.println(map.getUrl(id));
}
if (cmdline.hasOption(PREFIX)) {
String prefix = cmdline.getOptionValue(PREFIX);
List urls = map.prefixSearch(prefix);
for (String s : urls) {
System.out.println(s);
}
}
}
}
diff --git a/src/main/java/org/warcbase/data/UriMappingBuilder.java b/src/main/java/org/warcbase/data/UrlMappingBuilder.java
similarity index 98%
rename from src/main/java/org/warcbase/data/UriMappingBuilder.java
rename to src/main/java/org/warcbase/data/UrlMappingBuilder.java
index 3b7f298..a2a8455 100644
--- a/src/main/java/org/warcbase/data/UriMappingBuilder.java
+++ b/src/main/java/org/warcbase/data/UrlMappingBuilder.java
@@ -1,91 +1,91 @@
package org.warcbase.data;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
-public class UriMappingBuilder {
+public class UrlMappingBuilder {
private static void readUrlFromFile(File f, List urls) throws IOException {
String contents = FileUtils.readFileToString(f);
String[] lines = contents.split("\\n");
for (String line : lines) {
// This need to modify according to your input file
if (!line.equals("")) { // non-empty string
String url = line.split("\\s+")[0];
urls.add(url);
}
}
}
private static List readUrlFromFolder(String folderName) throws IOException {
File folder = new File(folderName);
List urls = new ArrayList();
if (folder.isDirectory()) {
for (File file : folder.listFiles()) {
readUrlFromFile(file, urls);
}
} else {
readUrlFromFile(folder, urls);
}
Collections.sort(urls); // sort String according to url alphabetical order
return urls;
}
public static void main(String[] args) throws IOException {
String inputFileName = new String();
String outputFileName = new String();
if (args.length > 0) { // read file name from main arguments
inputFileName = args[0];
outputFileName = args[1];
}
List inputValues = null;
try {
// input strings must be sorted in Unicode order
inputValues = readUrlFromFolder(inputFileName); // read data
} catch (IOException e) {
e.printStackTrace();
}
// Be Careful about the file size
long size = inputValues.size();
List outputValues = new ArrayList(); // create the mapping id
for (long i = 1; i <= size; i++) {
outputValues.add(i);
}
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder builder = new Builder(INPUT_TYPE.BYTE1, outputs);
BytesRef scratchBytes = new BytesRef();
IntsRef scratchInts = new IntsRef();
for (int i = 0; i < size; i++) {
scratchBytes.copyChars((String) inputValues.get(i));
try {
// Mapping!
builder.add(Util.toIntsRef(scratchBytes, scratchInts), (Long) outputValues.get(i));
} catch (UnsupportedOperationException e) {
System.out.println("Duplicate Url:" + inputValues.get(i));
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
FST fst = builder.finish();
// Save FST to file
File outputFile = new File(outputFileName);
fst.save(outputFile);
}
}
diff --git a/src/main/java/org/warcbase/data/UriMappingMapReduceBuilder.java b/src/main/java/org/warcbase/data/UrlMappingMapReduceBuilder.java
similarity index 94%
rename from src/main/java/org/warcbase/data/UriMappingMapReduceBuilder.java
rename to src/main/java/org/warcbase/data/UrlMappingMapReduceBuilder.java
index e57a757..5980460 100644
--- a/src/main/java/org/warcbase/data/UriMappingMapReduceBuilder.java
+++ b/src/main/java/org/warcbase/data/UrlMappingMapReduceBuilder.java
@@ -1,224 +1,224 @@
package org.warcbase.data;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.jwat.arc.ArcRecordBase;
import org.warcbase.mapreduce.ArcInputFormat;
-public class UriMappingMapReduceBuilder extends Configured implements Tool {
- private static final Logger LOG = Logger.getLogger(UriMappingMapReduceBuilder.class);
+public class UrlMappingMapReduceBuilder extends Configured implements Tool {
+ private static final Logger LOG = Logger.getLogger(UrlMappingMapReduceBuilder.class);
private static enum Records {
TOTAL, RECORD_COUNT
};
public static class UriMappingBuilderMapper extends
Mapper {
public static final Text KEY = new Text();
public static final Text VALUE = new Text();
public void map(LongWritable key, ArcRecordBase record, Context context) throws IOException,
InterruptedException {
context.getCounter(Records.TOTAL).increment(1);
String url = record.getUrlStr();
String type = record.getContentTypeStr();
if (!type.equals("text/html")) {
return;
}
KEY.set(url);
context.write(KEY, VALUE);
}
}
public static class UriMappingBuilderReducer extends
Reducer {
public static List urls = new ArrayList();
private static String path;
// read PATH environment
@Override
public void setup(Context context) {
Configuration conf = context.getConfiguration();
path = conf.get("PATH");
}
@Override
public void reduce(Text key, Iterable values, Context context)
throws IOException, InterruptedException {
context.getCounter(Records.RECORD_COUNT).increment(1);
urls.add(key.toString());
}
@Override
public void cleanup(Context context) throws IOException {
long size = urls.size();
LongList outputValues = new LongArrayList(); // create the mapping id
for (long i = 1; i <= size; i++) {
outputValues.add(i);
}
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder builder = new Builder(INPUT_TYPE.BYTE1, outputs);
BytesRef scratchBytes = new BytesRef();
IntsRef scratchInts = new IntsRef();
for (int i = 0; i < size; i++) {
scratchBytes.copyChars((String) urls.get(i));
try {
builder.add(Util.toIntsRef(scratchBytes, scratchInts), (Long) outputValues.get(i));
} catch (UnsupportedOperationException e) {
LOG.error("Duplicate URL:" + urls.get(i));
} catch (IOException e) {
LOG.error(e.getMessage());
e.printStackTrace();
}
}
FST fst = builder.finish();
LOG.info("PATH: " + path);
// Delete the output directory if it exists already.
Path outputDir = new Path(path);
FileSystem.get(context.getConfiguration()).delete(outputDir, true);
// Save FST to file
FileSystem fs = FileSystem.get(context.getConfiguration());
Path fstPath = new Path(path);
OutputStream fStream = fs.create(fstPath);
OutputStreamDataOutput fstStream = new OutputStreamDataOutput(fStream);
boolean success = false;
try {
fst.save(fstStream);
success = true;
} finally {
if (success) {
IOUtils.close(fstStream);
} else {
IOUtils.closeWhileHandlingException(fstStream);
}
}
}
}
- public UriMappingMapReduceBuilder() {}
+ public UrlMappingMapReduceBuilder() {}
private static final String INPUT = "input";
private static final String OUTPUT = "output";
/**
* Runs this tool.
*/
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
Options options = new Options();
options.addOption(OptionBuilder.withArgName("path")
.hasArg().withDescription("input path").create(INPUT));
options.addOption(OptionBuilder.withArgName("path")
.hasArg().withDescription("output path").create(OUTPUT));
CommandLine cmdline;
CommandLineParser parser = new GnuParser();
try {
cmdline = parser.parse(options, args);
} catch (ParseException exp) {
System.err.println("Error parsing command line: " + exp.getMessage());
return -1;
}
if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
System.out.println("args: " + Arrays.toString(args));
HelpFormatter formatter = new HelpFormatter();
formatter.setWidth(120);
formatter.printHelp(this.getClass().getName(), options);
ToolRunner.printGenericCommandUsage(System.out);
return -1;
}
String inputPath = cmdline.getOptionValue(INPUT);
String outputPath = cmdline.getOptionValue(OUTPUT);
LOG.info("- input path: " + inputPath);
LOG.info("- output path: " + outputPath);
Configuration conf = getConf();
conf.set("PATH", outputPath);
conf.set("mapreduce.reduce.java.opts", "-Xmx5120m");
- Job job = Job.getInstance(conf, UriMappingMapReduceBuilder.class.getSimpleName());
- job.setJarByClass(UriMappingMapReduceBuilder.class);
+ Job job = Job.getInstance(conf, UrlMappingMapReduceBuilder.class.getSimpleName());
+ job.setJarByClass(UrlMappingMapReduceBuilder.class);
job.getConfiguration().set("UriMappingBuilderClass",
- UriMappingMapReduceBuilder.class.getCanonicalName());
+ UrlMappingMapReduceBuilder.class.getCanonicalName());
FileInputFormat.setInputPaths(job, new Path(inputPath));
job.setInputFormatClass(ArcInputFormat.class);
job.setOutputFormatClass(NullOutputFormat.class); // no output
// set map (key,value) output format
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setMapperClass(UriMappingBuilderMapper.class);
job.setReducerClass(UriMappingBuilderReducer.class);
// all the keys are shuffled to a single reducer
job.setNumReduceTasks(1);
long startTime = System.currentTimeMillis();
job.waitForCompletion(true);
LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
Counters counters = job.getCounters();
int numRecords = (int) counters.findCounter(Records.TOTAL).getValue();
int numUrls = (int) counters.findCounter(Records.RECORD_COUNT).getValue();
LOG.info("Read " + numRecords + " records.");
LOG.info("Encountered " + numUrls + " unique urls.");
return 0;
}
public static void main(String[] args) throws Exception {
- ToolRunner.run(new UriMappingMapReduceBuilder(), args);
+ ToolRunner.run(new UrlMappingMapReduceBuilder(), args);
}
}
diff --git a/src/test/java/org/warcbase/data/UriMappingTest.java b/src/test/java/org/warcbase/data/UriMappingTest.java
index 4414853..ad3e867 100644
--- a/src/test/java/org/warcbase/data/UriMappingTest.java
+++ b/src/test/java/org/warcbase/data/UriMappingTest.java
@@ -1,136 +1,136 @@
package org.warcbase.data;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.util.List;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.INPUT_TYPE;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
import org.junit.Before;
import org.junit.Test;
// This class aims to test the PrefixSearch functionality.
public class UriMappingTest {
- private UriMapping map;
+ private UrlMapping map;
@Before
public void setUp() throws Exception {
String inputValues[] = { "cat", "catch", "cut", "doga", "dogb", "dogs" };
long outputValues[] = { 1, 2, 3, 4, 5, 6 };
PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
Builder builder = new Builder(INPUT_TYPE.BYTE1, outputs);
BytesRef scratchBytes = new BytesRef();
IntsRef scratchInts = new IntsRef();
for (int i = 0; i < inputValues.length; i++) {
scratchBytes.copyChars(inputValues[i]);
builder.add(Util.toIntsRef(scratchBytes, scratchInts), outputValues[i]);
}
FST fst = builder.finish();
- map = new UriMapping(fst);
+ map = new UrlMapping(fst);
}
@Test
public void testGetIds() {
assertEquals(-1, map.getID("apple"));
assertEquals(1, map.getID("cat"));
assertEquals(2, map.getID("catch"));
assertEquals(3, map.getID("cut"));
assertEquals(-1, map.getID("cuttery"));
assertEquals(4, map.getID("doga"));
assertEquals(5, map.getID("dogb"));
assertEquals(6, map.getID("dogs"));
assertEquals(-1, map.getID("dogz"));
}
@Test
public void testGetUrls() {
assertEquals(null, map.getUrl(0));
assertEquals("cat", map.getUrl(1));
assertEquals("catch", map.getUrl(2));
assertEquals("cut", map.getUrl(3));
assertEquals("doga", map.getUrl(4));
assertEquals("dogb", map.getUrl(5));
assertEquals("dogs", map.getUrl(6));
assertEquals(null, map.getUrl(7));
}
@Test
public void testPrefixSearch() {
List results;
results = map.prefixSearch("cut");
assertEquals(1, results.size());
assertEquals("cut", results.get(0));
results = map.prefixSearch("dog");
assertEquals(3, results.size());
assertEquals("doga", results.get(0));
assertEquals("dogb", results.get(1));
assertEquals("dogs", results.get(2));
results = map.prefixSearch("");
assertEquals(0, results.size());
results = map.prefixSearch(null);
assertEquals(0, results.size());
results = map.prefixSearch("dad");
assertEquals(0, results.size());
}
@Test
public void testGetIdRange() throws IOException{
int[] range;
range = map.getIdRange("doga", "dogs");
assertEquals(4, range[0]);
assertEquals(6, range[1]);
assertEquals("doga", map.getUrl(range[0]));
assertEquals("dogs", map.getUrl(range[1]));
range = map.getIdRange("doga", "dogb");
assertEquals(4, range[0]);
assertEquals(5, range[1]);
assertEquals("doga", map.getUrl(range[0]));
assertEquals("dogb", map.getUrl(range[1]));
range = map.getIdRange("dogs", "dogs");
assertEquals(6, range[0]);
assertEquals(6, range[1]);
assertEquals("dogs", map.getUrl(range[0]));
assertEquals("dogs", map.getUrl(range[1]));
// If either one of the bounds is invalid, expect null
range = map.getIdRange("dog", "dogx");
assertEquals(null, range);
range = map.getIdRange("doga", "dogx");
assertEquals(null, range);
range = map.getIdRange("dog", "dogs");
assertEquals(null, range);
range = map.getIdRange("", "dogs");
assertEquals(null, range);
range = map.getIdRange("", "");
assertEquals(null, range);
range = map.getIdRange(null, "");
assertEquals(null, range);
range = map.getIdRange(null, null);
assertEquals(null, range);
range = map.getIdRange(null, null);
assertEquals(null, range);
}
}