diff --git a/pom.xml b/pom.xml index 4d56742..6505e1a 100644 --- a/pom.xml +++ b/pom.xml @@ -1,528 +1,533 @@ 4.0.0 org.warcbase warcbase jar 0.1.0-SNAPSHOT Warcbase An open-source platform for managing web archives built on Hadoop and HBase http://warcbase.org/ The Apache Software License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt repo scm:git:git@github.com:lintool/warcbase.git scm:git:git@github.com:lintool/warcbase.git git@github.com:lintool/warcbase.git lintool Jimmy Lin jimmylin@umd.edu milad621 Milad Gholami mgholami@cs.umd.edu jeffyRao Jinfeng Rao jinfeng@cs.umd.edu org.sonatype.oss oss-parent 7 UTF-8 UTF-8 8.1.12.v20130726 2.6.0-cdh5.4.1 1.0.0-cdh5.4.1 3.4.5-cdh5.4.1 1.3.0-cdh5.4.1 2.10.4 maven-clean-plugin 2.6.1 src/main/solr/lib false org.apache.maven.plugins maven-compiler-plugin 3.2 1.7 1.7 org.apache.maven.plugins maven-shade-plugin 2.3 package shade META-INF/services/org.apache.lucene.codecs.Codec *:* META-INF/*.SF META-INF/*.DSA META-INF/*.RSA true fatjar org.apache.hadoop:* org.apache.maven.plugins maven-dependency-plugin 2.4 copy package copy-dependencies src/main/solr/lib org.codehaus.mojo appassembler-maven-plugin 1.9 -Xms512M -Xmx24576M org.warcbase.WarcbaseAdmin WarcbaseAdmin org.warcbase.data.UrlMappingBuilder UrlMappingBuilder org.warcbase.data.UrlMapping UrlMapping org.warcbase.data.ExtractLinks ExtractLinks org.warcbase.data.ExtractSiteLinks ExtractSiteLinks org.warcbase.ingest.IngestFiles IngestFiles org.warcbase.ingest.SearchForUrl SearchForUrl org.warcbase.browser.WarcBrowser WarcBrowser org.warcbase.analysis.DetectDuplicates DetectDuplicates org.warcbase.browser.SeleniumBrowser SeleniumBrowser org.scala-tools maven-scala-plugin 2.15.2 process-resources add-source compile scala-test-compile process-test-resources testCompile ${scala.version} true -target:jvm-1.7 -g:vars -deprecation -dependencyfile ${project.build.directory}/.scala_dependencies maven http://repo.maven.apache.org/maven2/ cloudera 
https://repository.cloudera.com/artifactory/cloudera-repos/ internetarchive Internet Archive Maven Repository http://builds.archive.org:8080/maven2 junit junit 4.12 test org.scalatest scalatest_2.10 2.2.4 test commons-codec commons-codec 1.8 commons-io commons-io 2.4 org.jsoup jsoup 1.7.3 com.google.guava guava 14.0.1 tl.lin lintools-datatypes 1.0.0 org.apache.hbase hbase-client ${hbase.version} org.apache.hadoophadoop-core org.apache.hbase hbase-server ${hbase.version} org.apache.hadoophadoop-core org.mortbay.jettyservlet-api-2.5 javax.servletservlet-api asmasm org.apache.hadoop hadoop-client ${hadoop.version} javax.servletservlet-api org.apache.zookeeper zookeeper ${zookeeper.version} org.netpreserve.openwayback openwayback-core 2.0.0.BETA.2 org.apache.hadoophadoop-core ch.qos.logbacklogback-classic org.netpreserve.openwaybackopenwayback-cdx-server org.netpreserve.openwaybackopenwayback-access-control-core it.unimi.dsidsiutils fastutilfastutil org.netpreserve.commons webarchive-commons 1.1.4 org.apache.hadoophadoop-core commons-langcommons-lang fastutilfastutil it.unimi.dsi dsiutils 2.2.0 ch.qos.logbacklogback-classic commons-langcommons-lang it.unimi.dsi fastutil 6.5.15 commons-langcommons-lang org.eclipse.jetty jetty-server ${jettyVersion} org.eclipse.jetty jetty-webapp ${jettyVersion} true org.slf4j slf4j-log4j12 1.6.4 org.apache.commons commons-lang3 3.0 commons-cli commons-cli 1.2 net.sf.opencsv opencsv 2.3 org.apache.tika tika-core 1.9 org.apache.tika tika-parsers 1.9 org.antlr antlr 3.5.2 org.seleniumhq.selenium selenium-java 2.42.2 org.seleniumhq.seleniumselenium-htmlunit-driver org.seleniumhq.seleniumselenium-ie-driver org.webbitserverwebbit org.scala-lang scala-library 2.10.4 org.apache.spark spark-core_2.10 ${spark.version} com.typesafeconfig org.xerial.snappysnappy-java com.chuusai shapeless_2.10.4 2.0.0 com.fasterxml.jackson.core jackson-core 2.6.3 - com.fasterxml.jackson.core jackson-databind 2.6.3 + + org.json4s + json4s-jackson_2.10 + 3.2.10 + + 
com.typesafe config 1.2.1 org.xerial.snappy snappy-java 1.0.5 edu.stanford.nlp stanford-corenlp 3.4.1 com.syncthemall boilerpipe 1.2.2 xerces xercesImpl 2.11.0 org.apache.lucene lucene-core 4.7.2 org.apache.solr solr-core 4.7.2 slf4j-apiorg.slf4j org.apache.hadoophadoop-annotations org.apache.hadoophadoop-common org.apache.hadoophadoop-hdfs com.typesafeconfig uk.bl.wa.discovery warc-hadoop-indexer 2.2.0-BETA-5 asmasm com.typesafeconfig diff --git a/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala b/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala index 1b54346..fabee06 100644 --- a/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala +++ b/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala @@ -1,45 +1,50 @@ /* * Warcbase: an open-source platform for managing web archives * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
package org.warcbase.spark.matchbox

import org.apache.hadoop.io.LongWritable
import org.apache.spark.{SerializableWritable, SparkContext}
import org.apache.spark.rdd.RDD
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.warcbase.io.GenericArchiveRecordWritable.ArchiveFormat
import org.warcbase.io.{GenericArchiveRecordWritable, WarcRecordWritable, ArcRecordWritable}
import org.warcbase.mapreduce.{WacGenericInputFormat, WacWarcInputFormat, WacArcInputFormat}
import org.warcbase.spark.archive.io.{WarcRecord, ArcRecord, ArchiveRecord, GenericArchiveRecord}

/**
 * Factory methods that load web-archive records and tweets into Spark RDDs.
 */
object RecordLoader {

  /**
   * Loads ARC records from `path`.
   *
   * @param path location handed to `SparkContext.newAPIHadoopFile`
   * @param sc   Spark context used to read the input
   * @return one [[ArchiveRecord]] (an `ArcRecord`) per record produced by `WacArcInputFormat`
   */
  def loadArc(path: String, sc: SparkContext): RDD[ArchiveRecord] = {
    sc.newAPIHadoopFile(path, classOf[WacArcInputFormat], classOf[LongWritable], classOf[ArcRecordWritable])
      .map(r => new ArcRecord(new SerializableWritable(r._2)))
  }

  /**
   * Loads WARC records from `path`, keeping only those whose `WARC-Type` header is `response`.
   *
   * @param path location handed to `SparkContext.newAPIHadoopFile`
   * @param sc   Spark context used to read the input
   * @return one [[ArchiveRecord]] (a `WarcRecord`) per retained record
   */
  def loadWarc(path: String, sc: SparkContext): RDD[ArchiveRecord] = {
    sc.newAPIHadoopFile(path, classOf[WacWarcInputFormat], classOf[LongWritable], classOf[WarcRecordWritable])
      // Literal on the left: a record without a WARC-Type header is dropped instead of
      // raising a NullPointerException.
      .filter(r => "response" == r._2.getRecord.getHeader.getHeaderValue("WARC-Type"))
      .map(r => new WarcRecord(new SerializableWritable(r._2)))
  }

  /**
   * Loads a mix of ARC and WARC records from `path`. Every ARC record is kept; a WARC record
   * is kept only when its `WARC-Type` header is `response`.
   *
   * @param path location handed to `SparkContext.newAPIHadoopFile`
   * @param sc   Spark context used to read the input
   * @return one [[ArchiveRecord]] (a `GenericArchiveRecord`) per retained record
   */
  def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] = {
    sc.newAPIHadoopFile(path, classOf[WacGenericInputFormat], classOf[LongWritable], classOf[GenericArchiveRecordWritable])
      .filter { r =>
        val record = r._2
        (record.getFormat == ArchiveFormat.ARC) ||
          ((record.getFormat == ArchiveFormat.WARC) &&
            // Null-safe comparison, see loadWarc.
            "response" == record.getRecord.getHeader.getHeaderValue("WARC-Type"))
      }
      .map(r => new GenericArchiveRecord(new SerializableWritable(r._2)))
  }

  /**
   * Loads line-delimited JSON from `path` and parses each retained line into a `JValue`.
   * Lines starting with `{"delete":` are dropped, as are blank lines, which would otherwise
   * make `parse` fail.
   *
   * @param path location handed to `SparkContext.textFile`
   * @param sc   Spark context used to read the input
   * @return one parsed `JValue` per retained line
   */
  def loadTweets(path: String, sc: SparkContext): RDD[JValue] =
    sc.textFile(path)
      .filter(line => line.trim.nonEmpty && !line.startsWith("{\"delete\":"))
      .map(line => parse(line))
}
package org.warcbase.spark.matchbox

import org.json4s.JsonAST._

/**
 * Convenience accessors for tweets represented as json4s `JValue`s.
 *
 * Bring the enrichment into scope with `import org.warcbase.spark.matchbox.TweetUtils._`.
 */
object TweetUtils {

  /**
   * Adds field accessors to a tweet `JValue`. Each accessor looks up a fixed JSON path and
   * extracts it with json4s; extraction of a missing or differently typed field fails with
   * the exception json4s raises for it (no default is substituted here).
   *
   * @param tweet the parsed tweet JSON
   */
  implicit class JsonTweet(tweet: JValue) {
    // Required implicitly by every `extract` call below.
    implicit lazy val formats = org.json4s.DefaultFormats

    /** Value of the top-level `id_str` field. */
    def id(): String = (tweet \ "id_str").extract[String]

    /** Value of the top-level `created_at` field, as the raw string. */
    def createdAt(): String = (tweet \ "created_at").extract[String]

    /** Value of the top-level `text` field. */
    def text(): String = (tweet \ "text").extract[String]

    /** Value of the top-level `lang` field. */
    def lang: String = (tweet \ "lang").extract[String]

    /** Value of `user.screen_name`. */
    def username(): String = (tweet \ "user" \ "screen_name").extract[String]

    /**
     * Value of the boolean `user.verified` field.
     *
     * This previously compared `user.screen_name` with the string "false", which reported
     * whether the account's handle was literally "false" rather than its verification status.
     */
    def isVerifiedUser(): Boolean = (tweet \ "user" \ "verified").extract[Boolean]

    /** Value of `user.followers_count`. */
    def followerCount: Int = (tweet \ "user" \ "followers_count").extract[Int]

    /** Value of `user.friends_count`. */
    def friendCount: Int = (tweet \ "user" \ "friends_count").extract[Int]
  }
}