diff --git a/pom.xml b/pom.xml index 152ed5f..94cf72e 100644 --- a/pom.xml +++ b/pom.xml @@ -1,30 +1,31 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> <groupId>org.warcbase</groupId> <artifactId>warcbase</artifactId> <packaging>pom</packaging> <version>0.1.0-SNAPSHOT</version> <name>Warcbase</name> <description>An open-source platform for managing web archives built on Hadoop and HBase</description> <url>http://warcbase.org/</url> <licenses> <license> <name>The Apache Software License, Version 2.0</name> <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> <distribution>repo</distribution> </license> </licenses> <scm> <connection>scm:git:git@github.com:lintool/warcbase.git</connection> <developerConnection>scm:git:git@github.com:lintool/warcbase.git</developerConnection> <url>git@github.com:lintool/warcbase.git</url> </scm> <modules> <module>warcbase-core</module> + <module>warcbase-hbase</module> </modules> </project> diff --git a/warcbase-hbase/pom.xml b/warcbase-hbase/pom.xml index f33d240..f367b80 100644 --- a/warcbase-hbase/pom.xml +++ b/warcbase-hbase/pom.xml @@ -1,539 +1,327 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <parent> <groupId>org.warcbase</groupId> <artifactId>warcbase</artifactId> <version>0.1.0-SNAPSHOT</version> </parent> <modelVersion>4.0.0</modelVersion> <groupId>org.warcbase</groupId> - <artifactId>warcbase-core</artifactId> + <artifactId>warcbase-hbase</artifactId> <packaging>jar</packaging> <version>0.1.0-SNAPSHOT</version> <name>Warcbase</name> <description>An open-source platform for managing web archives built on Hadoop and HBase</description> <url>http://warcbase.org/</url> <licenses> 
<license> <name>The Apache Software License, Version 2.0</name> <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> <distribution>repo</distribution> </license> </licenses> <scm> <connection>scm:git:git@github.com:lintool/warcbase.git</connection> <developerConnection>scm:git:git@github.com:lintool/warcbase.git</developerConnection> <url>git@github.com:lintool/warcbase.git</url> </scm> <developers> <developer> <id>lintool</id> <name>Jimmy Lin</name> <email>jimmylin@umd.edu</email> </developer> <developer> <id>milad621</id> <name>Milad Gholami</name> <email>mgholami@cs.umd.edu</email> </developer> <developer> <id>jeffyRao</id> <name>Jinfeng Rao</name> <email>jinfeng@cs.umd.edu</email> </developer> </developers> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <jettyVersion>8.1.12.v20130726</jettyVersion> <hadoop.version>2.6.0-cdh5.4.1</hadoop.version> <hbase.version>1.0.0-cdh5.4.1</hbase.version> <zookeeper.version>3.4.5-cdh5.4.1</zookeeper.version> <spark.version>1.3.0-cdh5.4.1</spark.version> <scala.version>2.10.4</scala.version> </properties> <build> <plugins> <plugin> <artifactId>maven-clean-plugin</artifactId> <version>2.6.1</version> <configuration> <filesets> <fileset> <directory>src/main/solr/lib</directory> <followSymlinks>false</followSymlinks> </fileset> </filesets> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <version>3.2</version> <configuration> <source>1.7</source> <target>1.7</target> </configuration> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-shade-plugin</artifactId> <version>2.3</version> <executions> <execution> <phase>package</phase> <goals> <goal>shade</goal> </goals> <configuration> <!-- http://mail-archives.apache.org/mod_mbox/lucene-java-user/201308.mbox/%3CWC20130822094206.310452@isped.u-bordeaux2.fr%3E --> 
<transformers><transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer"> <resource>META-INF/services/org.apache.lucene.codecs.Codec</resource></transformer></transformers> <!-- This fixes the issue "Invalid signature file digest for Manifest main attributes" cf. http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html --> <filters> <filter> <artifact>*:*</artifact> <excludes> <exclude>META-INF/*.SF</exclude> <exclude>META-INF/*.DSA</exclude> <exclude>META-INF/*.RSA</exclude> </excludes> </filter> </filters> <!-- this will create both a normal thin jar and also a fatjar --> <shadedArtifactAttached>true</shadedArtifactAttached> <shadedClassifierName>fatjar</shadedClassifierName> <artifactSet> <excludes> <exclude>org.apache.hadoop:*</exclude> </excludes> </artifactSet> </configuration> </execution> </executions> </plugin> <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-dependency-plugin</artifactId> <version>2.4</version> <executions> <execution> <id>copy</id> <phase>package</phase> <goals> <goal>copy-dependencies</goal> </goals> <configuration> <outputDirectory> src/main/solr/lib </outputDirectory> </configuration> </execution> </executions> </plugin> <plugin> <groupId>org.codehaus.mojo</groupId> <artifactId>appassembler-maven-plugin</artifactId> <version>1.9</version> <configuration> <extraJvmArguments>-Xms512M -Xmx24576M</extraJvmArguments> <programs> <program> <mainClass>org.warcbase.WarcbaseAdmin</mainClass> <name>WarcbaseAdmin</name> </program> <program> <mainClass>org.warcbase.data.UrlMappingBuilder</mainClass> <name>UrlMappingBuilder</name> </program> <program> <mainClass>org.warcbase.data.UrlMapping</mainClass> <name>UrlMapping</name> </program> <program> <mainClass>org.warcbase.data.ExtractLinks</mainClass> <name>ExtractLinks</name> </program> <program> <mainClass>org.warcbase.data.ExtractSiteLinks</mainClass> <name>ExtractSiteLinks</name> </program> <program> 
<mainClass>org.warcbase.ingest.IngestFiles</mainClass> <name>IngestFiles</name> </program> <program> <mainClass>org.warcbase.ingest.SearchForUrl</mainClass> <name>SearchForUrl</name> </program> <program> <mainClass>org.warcbase.browser.WarcBrowser</mainClass> <name>WarcBrowser</name> </program> <program> <mainClass>org.warcbase.analysis.DetectDuplicates</mainClass> <name>DetectDuplicates</name> </program> <program> <mainClass>org.warcbase.browser.SeleniumBrowser</mainClass> <name>SeleniumBrowser</name> </program> </programs> </configuration> </plugin> <!-- for Scala --> <plugin> <groupId>org.scala-tools</groupId> <artifactId>maven-scala-plugin</artifactId> <version>2.15.2</version> <executions> <execution> <phase>process-resources</phase> <goals> <goal>add-source</goal> <goal>compile</goal> </goals> </execution> <execution> <id>scala-test-compile</id> <phase>process-test-resources</phase> <goals> <goal>testCompile</goal> </goals> </execution> </executions> <configuration> <scalaVersion>${scala.version}</scalaVersion> <sendJavaToScalac>true</sendJavaToScalac> <args> <arg>-target:jvm-1.7</arg> <arg>-g:vars</arg> <arg>-deprecation</arg> <arg>-dependencyfile</arg> <arg>${project.build.directory}/.scala_dependencies</arg> </args> </configuration> </plugin> </plugins> </build> <repositories> <repository> <id>maven</id> - <url>http://repo.maven.apache.org/maven2/</url> + <url>https://repo.maven.apache.org/maven2/</url> </repository> <repository> <id>cloudera</id> <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> </repository> <repository> <id>internetarchive</id> <name>Internet Archive Maven Repository</name> <url>http://builds.archive.org:8080/maven2</url> </repository> </repositories> <dependencies> <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>4.12</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.scalatest</groupId> - <artifactId>scalatest_2.10</artifactId> - <version>2.2.4</version> - <scope>test</scope> - </dependency> - 
<dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> - <version>1.8</version> - </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - <version>2.4</version> - </dependency> - <dependency> - <groupId>org.jsoup</groupId> - <artifactId>jsoup</artifactId> - <version>1.7.3</version> - </dependency> - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - <version>14.0.1</version> - <!-- downgrade for Hadoop WARC indexer, see also https://issues.apache.org/jira/browse/HADOOP-10961 --> - </dependency> - <dependency> - <groupId>tl.lin</groupId> - <artifactId>lintools-datatypes</artifactId> - <version>1.0.0</version> + <groupId>org.warcbase</groupId> + <artifactId>warcbase-core</artifactId> + <version>0.1.0-SNAPSHOT</version> </dependency> - <!-- Begin: Hadoop-related dependencies --> <dependency> <groupId>org.apache.hbase</groupId> <artifactId>hbase-client</artifactId> <version>${hbase.version}</version> <exclusions> <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion> </exclusions> </dependency> <dependency> <groupId>org.apache.hbase</groupId> <artifactId>hbase-server</artifactId> <version>${hbase.version}</version> <exclusions> <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion> <exclusion><groupId>org.mortbay.jetty</groupId><artifactId>servlet-api-2.5</artifactId></exclusion> <exclusion><groupId>javax.servlet</groupId><artifactId>servlet-api</artifactId></exclusion> <exclusion><groupId>asm</groupId><artifactId>asm</artifactId></exclusion> </exclusions> </dependency> - <!-- See http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH-Version-and-Packaging-Information/cdhvd_hadoop_api_dependencies.html --> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-client</artifactId> - <version>${hadoop.version}</version> - <exclusions> 
- <exclusion><groupId>javax.servlet</groupId><artifactId>servlet-api</artifactId></exclusion> - </exclusions> - </dependency> - <dependency> <groupId>org.apache.zookeeper</groupId> <artifactId>zookeeper</artifactId> <version>${zookeeper.version}</version> </dependency> - <!-- End: Hadoop-related dependencies --> - - <dependency> - <groupId>org.netpreserve.openwayback</groupId> - <artifactId>openwayback-core</artifactId> - <version>2.0.0.BETA.2</version> - <exclusions> - <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion> - <exclusion><groupId>ch.qos.logback</groupId><artifactId>logback-classic</artifactId></exclusion> - <exclusion><groupId>org.netpreserve.openwayback</groupId><artifactId>openwayback-cdx-server</artifactId></exclusion> - <exclusion><groupId>org.netpreserve.openwayback</groupId><artifactId>openwayback-access-control-core</artifactId></exclusion> - <!-- swap in our own latest versions --> - <exclusion><groupId>it.unimi.dsi</groupId><artifactId>dsiutils</artifactId></exclusion> - <exclusion><groupId>fastutil</groupId><artifactId>fastutil</artifactId></exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.netpreserve.commons</groupId> - <artifactId>webarchive-commons</artifactId> - <version>1.1.4</version> - <exclusions> - <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion> - <exclusion><groupId>commons-lang</groupId><artifactId>commons-lang</artifactId></exclusion> - <exclusion><groupId>fastutil</groupId><artifactId>fastutil</artifactId></exclusion> - </exclusions> - </dependency> - - <dependency> - <groupId>it.unimi.dsi</groupId> - <artifactId>dsiutils</artifactId> - <version>2.2.0</version> - <exclusions> - <exclusion><groupId>ch.qos.logback</groupId><artifactId>logback-classic</artifactId></exclusion> - <exclusion><groupId>commons-lang</groupId><artifactId>commons-lang</artifactId></exclusion> - </exclusions> - </dependency> - <dependency> - 
<groupId>it.unimi.dsi</groupId> - <artifactId>fastutil</artifactId> - <version>6.5.15</version> - <exclusions> - <exclusion><groupId>commons-lang</groupId><artifactId>commons-lang</artifactId></exclusion> - </exclusions> - </dependency> - - <dependency> - <groupId>org.eclipse.jetty</groupId> - <artifactId>jetty-server</artifactId> - <version>${jettyVersion}</version> - </dependency> - <dependency> - <groupId>org.eclipse.jetty</groupId> - <artifactId>jetty-webapp</artifactId> - <version>${jettyVersion}</version> - <optional>true</optional> - </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-log4j12</artifactId> - <version>1.6.4</version> - </dependency> - - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - <version>3.0</version> - </dependency> - <dependency> - <groupId>commons-cli</groupId> - <artifactId>commons-cli</artifactId> - <version>1.2</version> - </dependency> - - <dependency> - <groupId>net.sf.opencsv</groupId> - <artifactId>opencsv</artifactId> - <version>2.3</version> - </dependency> - - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-core</artifactId> - <version>1.9</version> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parsers</artifactId> - <version>1.9</version> - </dependency> - <dependency> - <groupId>org.antlr</groupId> - <artifactId>antlr</artifactId> - <version>3.5.2</version> - </dependency> - - <dependency> - <groupId>org.seleniumhq.selenium</groupId> - <artifactId>selenium-java</artifactId> - <version>2.42.2</version> - <exclusions> - <exclusion><groupId>org.seleniumhq.selenium</groupId><artifactId>selenium-htmlunit-driver</artifactId></exclusion> - <exclusion><groupId>org.seleniumhq.selenium</groupId><artifactId>selenium-ie-driver</artifactId></exclusion> - <exclusion><groupId>org.webbitserver</groupId><artifactId>webbit</artifactId></exclusion> - </exclusions> - </dependency> - - <dependency> - 
<groupId>org.scala-lang</groupId> - <artifactId>scala-library</artifactId> - <version>2.10.4</version> - </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-core_2.10</artifactId> - <version>${spark.version}</version> + <groupId>uk.bl.wa.discovery</groupId> + <artifactId>warc-hadoop-indexer</artifactId> + <version>2.2.0-BETA-5</version> <exclusions> + <exclusion><groupId>asm</groupId><artifactId>asm</artifactId></exclusion> <exclusion><groupId>com.typesafe</groupId><artifactId>config</artifactId></exclusion> - <exclusion><groupId>org.xerial.snappy</groupId><artifactId>snappy-java</artifactId> - </exclusion> </exclusions> </dependency> - <dependency> - <groupId>org.apache.spark</groupId> - <artifactId>spark-graphx_2.10</artifactId> - <version>${spark.version}</version> - </dependency> - - <dependency> - <groupId>com.chuusai</groupId> - <artifactId>shapeless_2.10.4</artifactId> - <version>2.0.0</version> - </dependency> - <dependency> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-core</artifactId> - <version>2.7.2</version> - </dependency> - <dependency> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-databind</artifactId> - <version>2.7.2</version> - </dependency> - <dependency> - <groupId>org.json4s</groupId> - <artifactId>json4s-jackson_2.10</artifactId> - <version>3.2.10</version> - <!-- see this issue: http://stackoverflow.com/questions/32400061/spark-streaming-json4s-jackson-dependency-problems --> - </dependency> - - <dependency> - <groupId>com.typesafe</groupId> - <artifactId>config</artifactId> - <version>1.2.1</version> - </dependency> - - <dependency> - <groupId>org.xerial.snappy</groupId> - <artifactId>snappy-java</artifactId> - <version>1.0.5</version> - </dependency> - - <dependency> - <groupId>edu.stanford.nlp</groupId> - <artifactId>stanford-corenlp</artifactId> - <version>3.4.1</version> - </dependency> - - <dependency> - <groupId>com.syncthemall</groupId> - 
<artifactId>boilerpipe</artifactId> - <version>1.2.2</version> - </dependency> - - <dependency> - <groupId>xerces</groupId> - <artifactId>xercesImpl</artifactId> - <version>2.11.0</version> - </dependency> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>4.7.2</version> </dependency> <dependency> <groupId>org.apache.solr</groupId> <artifactId>solr-core</artifactId> <version>4.7.2</version> <exclusions> <exclusion><artifactId>slf4j-api</artifactId><groupId>org.slf4j</groupId></exclusion> - <exclusion><artifactId>org.apache.hadoop</artifactId><groupId>hadoop-annotations</groupId></exclusion> - <exclusion><artifactId>org.apache.hadoop</artifactId><groupId>hadoop-common</groupId></exclusion> - <exclusion><artifactId>org.apache.hadoop</artifactId><groupId>hadoop-hdfs</groupId></exclusion> + <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-annotations</artifactId></exclusion> + <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-common</artifactId></exclusion> + <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-hdfs</artifactId></exclusion> <exclusion><groupId>com.typesafe</groupId><artifactId>config</artifactId></exclusion> </exclusions> </dependency> <dependency> - <groupId>uk.bl.wa.discovery</groupId> - <artifactId>warc-hadoop-indexer</artifactId> - <version>2.2.0-BETA-5</version> + <groupId>org.seleniumhq.selenium</groupId> + <artifactId>selenium-java</artifactId> + <version>2.42.2</version> <exclusions> - <exclusion><groupId>asm</groupId><artifactId>asm</artifactId></exclusion> - <exclusion><groupId>com.typesafe</groupId><artifactId>config</artifactId></exclusion> + <exclusion><groupId>org.seleniumhq.selenium</groupId><artifactId>selenium-htmlunit-driver</artifactId></exclusion> + <exclusion><groupId>org.seleniumhq.selenium</groupId><artifactId>selenium-ie-driver</artifactId></exclusion> + <exclusion><groupId>org.webbitserver</groupId><artifactId>webbit</artifactId></exclusion> </exclusions> </dependency> + </dependencies> </project>