diff --git a/pom.xml b/pom.xml index 94cf72e..22e30f0 100644 --- a/pom.xml +++ b/pom.xml @@ -1,31 +1,41 @@ 4.0.0 org.warcbase warcbase pom 0.1.0-SNAPSHOT Warcbase - An open-source platform for managing web archives built on Hadoop and HBase + An open-source platform for managing and analyzing web archives http://warcbase.org/ + + UTF-8 + UTF-8 + 2.10.5 + 2.6.0-cdh5.4.1 + 1.3.0-cdh5.4.1 + 1.0.0-cdh5.4.1 + 3.4.5-cdh5.4.1 + + The Apache Software License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt repo scm:git:git@github.com:lintool/warcbase.git scm:git:git@github.com:lintool/warcbase.git git@github.com:lintool/warcbase.git warcbase-core warcbase-hbase diff --git a/warcbase-core/pom.xml b/warcbase-core/pom.xml index db54b2c..5e38055 100644 --- a/warcbase-core/pom.xml +++ b/warcbase-core/pom.xml @@ -1,447 +1,228 @@ org.warcbase warcbase 0.1.0-SNAPSHOT 4.0.0 org.warcbase warcbase-core jar 0.1.0-SNAPSHOT - Warcbase - An open-source platform for managing web archives built on Hadoop and HBase + Warcbase (Core) + An open-source platform for managing and analyzing web archives http://warcbase.org/ The Apache Software License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt repo scm:git:git@github.com:lintool/warcbase.git scm:git:git@github.com:lintool/warcbase.git git@github.com:lintool/warcbase.git - - - lintool - Jimmy Lin - jimmylin@umd.edu - - - milad621 - Milad Gholami - mgholami@cs.umd.edu - - - jeffyRao - Jinfeng Rao - jinfeng@cs.umd.edu - - - - - UTF-8 - UTF-8 - 8.1.12.v20130726 - 2.6.0-cdh5.4.1 - 1.0.0-cdh5.4.1 - 3.4.5-cdh5.4.1 - 1.3.0-cdh5.4.1 - 2.10.4 - - - - - org.apache.maven.plugins maven-compiler-plugin 3.2 1.7 1.7 org.apache.maven.plugins maven-shade-plugin 2.3 package shade - - - - - META-INF/services/org.apache.lucene.codecs.Codec - + + + META-INF/services/org.apache.lucene.codecs.Codec + + *:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA true fatjar org.apache.hadoop:* + org.apache.spark:* - - - - org.codehaus.mojo - appassembler-maven-plugin - 1.9 - - -Xms512M -Xmx24576M - - - org.warcbase.WarcbaseAdmin - WarcbaseAdmin - - - org.warcbase.data.UrlMappingBuilder - UrlMappingBuilder - - - org.warcbase.data.UrlMapping - UrlMapping - - - org.warcbase.data.ExtractLinks - ExtractLinks - - - org.warcbase.data.ExtractSiteLinks - ExtractSiteLinks - - - org.warcbase.ingest.IngestFiles - IngestFiles - - - org.warcbase.ingest.SearchForUrl - SearchForUrl - - - org.warcbase.browser.WarcBrowser - WarcBrowser - - - org.warcbase.analysis.DetectDuplicates - DetectDuplicates - - - org.warcbase.browser.SeleniumBrowser - SeleniumBrowser - - - - org.scala-tools maven-scala-plugin 2.15.2 process-resources add-source compile scala-test-compile process-test-resources testCompile ${scala.version} true -target:jvm-1.7 -g:vars -deprecation -dependencyfile ${project.build.directory}/.scala_dependencies maven http://repo.maven.apache.org/maven2/ cloudera https://repository.cloudera.com/artifactory/cloudera-repos/ internetarchive Internet Archive Maven Repository http://builds.archive.org:8080/maven2 junit junit 4.12 test org.scalatest scalatest_2.10 - 2.2.4 + 2.2.5 test - org.jsoup - jsoup - 1.7.3 + org.scala-lang + scala-library + ${scala.version} + + + com.chuusai + shapeless_2.10.5 + 2.0.0 + + + + org.apache.spark + spark-core_2.10 + ${spark.version} + + + org.apache.spark + spark-graphx_2.10 + ${spark.version} com.google.guava guava 14.0.1 - + - tl.lin - lintools-datatypes - 1.0.0 + org.xerial.snappy + snappy-java + 1.0.5 - - - org.apache.hadoop - hadoop-client - ${hadoop.version} - - javax.servletservlet-api - + org.jsoup + jsoup + 1.7.3 - - org.netpreserve.openwayback openwayback-core 2.0.0.BETA.2 - - org.apache.hadoophadoop-core - ch.qos.logbacklogback-classic - org.netpreserve.openwaybackopenwayback-cdx-server - org.netpreserve.openwaybackopenwayback-access-control-core - - it.unimi.dsidsiutils - fastutilfastutil - org.netpreserve.commons webarchive-commons 1.1.4 - - org.apache.hadoophadoop-core - commons-langcommons-lang - fastutilfastutil - - - - - org.apache.commons - commons-lang3 - 3.0 - - - commons-codec - commons-codec - 1.8 - - - commons-io - commons-io - 2.4 - - - commons-cli - commons-cli - 1.2 - org.apache.tika tika-core 1.9 org.apache.tika tika-parsers 1.9 - org.antlr - antlr - 3.5.2 - - - - org.scala-lang - scala-library - 2.10.4 - - - org.apache.spark - spark-core_2.10 - ${spark.version} - - com.typesafeconfig - org.xerial.snappysnappy-java - - - - - org.apache.spark - spark-graphx_2.10 - ${spark.version} - - - - com.chuusai - shapeless_2.10.4 - 2.0.0 - - - com.fasterxml.jackson.core - jackson-core - 2.7.2 - - - com.fasterxml.jackson.core - jackson-databind - 2.7.2 - - - org.json4s - json4s-jackson_2.10 - 3.2.10 - - - - - com.typesafe - config - 1.2.1 - - - - - - edu.stanford.nlp - stanford-corenlp - 3.4.1 - - - - com.syncthemall - boilerpipe - 1.2.2 - - - - xerces - xercesImpl - 2.11.0 + tl.lin + lintools-datatypes + 1.0.0 - diff --git a/warcbase-hbase/pom.xml b/warcbase-hbase/pom.xml index f367b80..b2b25e4 100644 --- a/warcbase-hbase/pom.xml +++ b/warcbase-hbase/pom.xml @@ -1,327 +1,250 @@ org.warcbase warcbase 0.1.0-SNAPSHOT 4.0.0 org.warcbase warcbase-hbase jar 0.1.0-SNAPSHOT - Warcbase - An open-source platform for managing web archives built on Hadoop and HBase + Warcbase (HBase) + An open-source platform for managing and analyzing web archives http://warcbase.org/ The Apache Software License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt repo scm:git:git@github.com:lintool/warcbase.git scm:git:git@github.com:lintool/warcbase.git git@github.com:lintool/warcbase.git - - - lintool - Jimmy Lin - jimmylin@umd.edu - - - milad621 - Milad Gholami - mgholami@cs.umd.edu - - - jeffyRao - Jinfeng Rao - jinfeng@cs.umd.edu - - - - - UTF-8 - UTF-8 - 8.1.12.v20130726 - 2.6.0-cdh5.4.1 - 1.0.0-cdh5.4.1 - 3.4.5-cdh5.4.1 - 1.3.0-cdh5.4.1 - 2.10.4 - - - - - maven-clean-plugin - 2.6.1 - - - - src/main/solr/lib - false - - - - - org.apache.maven.plugins maven-compiler-plugin 3.2 1.7 1.7 org.apache.maven.plugins maven-shade-plugin 2.3 package shade - - - - - META-INF/services/org.apache.lucene.codecs.Codec - + + + META-INF/services/org.apache.lucene.codecs.Codec + + + *:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA true fatjar org.apache.hadoop:* - - org.apache.maven.plugins - maven-dependency-plugin - 2.4 - - + + org.apache.maven.plugins + maven-dependency-plugin + 2.4 + + copy package copy-dependencies src/main/solr/lib - - - + + + org.codehaus.mojo appassembler-maven-plugin 1.9 -Xms512M -Xmx24576M org.warcbase.WarcbaseAdmin WarcbaseAdmin org.warcbase.data.UrlMappingBuilder UrlMappingBuilder org.warcbase.data.UrlMapping UrlMapping org.warcbase.data.ExtractLinks ExtractLinks org.warcbase.data.ExtractSiteLinks ExtractSiteLinks org.warcbase.ingest.IngestFiles IngestFiles org.warcbase.ingest.SearchForUrl SearchForUrl org.warcbase.browser.WarcBrowser WarcBrowser org.warcbase.analysis.DetectDuplicates DetectDuplicates org.warcbase.browser.SeleniumBrowser SeleniumBrowser org.scala-tools maven-scala-plugin 2.15.2 process-resources add-source compile scala-test-compile process-test-resources testCompile ${scala.version} true -target:jvm-1.7 -g:vars -deprecation -dependencyfile ${project.build.directory}/.scala_dependencies maven http://repo.maven.apache.org/maven2/ cloudera https://repository.cloudera.com/artifactory/cloudera-repos/ internetarchive Internet Archive Maven Repository http://builds.archive.org:8080/maven2 org.warcbase warcbase-core 0.1.0-SNAPSHOT org.apache.hbase hbase-client ${hbase.version} - - org.apache.hadoophadoop-core - org.apache.hbase hbase-server ${hbase.version} - - org.apache.hadoophadoop-core - org.mortbay.jettyservlet-api-2.5 - javax.servletservlet-api - asmasm - - - - - org.apache.zookeeper - zookeeper - ${zookeeper.version} uk.bl.wa.discovery warc-hadoop-indexer 2.2.0-BETA-5 - - asmasm - com.typesafeconfig - org.apache.lucene lucene-core 4.7.2 org.apache.solr solr-core 4.7.2 - - slf4j-apiorg.slf4j - org.apache.hadoophadoop-annotations - org.apache.hadoophadoop-common - org.apache.hadoophadoop-hdfs - com.typesafeconfig - org.seleniumhq.selenium selenium-java 2.42.2 - - org.seleniumhq.seleniumselenium-htmlunit-driver - org.seleniumhq.seleniumselenium-ie-driver - org.webbitserverwebbit - -