diff --git a/pom.xml b/pom.xml
index 4d56742..6505e1a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,528 +1,533 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.warcbase</groupId>
   <artifactId>warcbase</artifactId>
   <packaging>jar</packaging>
   <version>0.1.0-SNAPSHOT</version>
   <name>Warcbase</name>
   <description>An open-source platform for managing web archives built on Hadoop and HBase</description>
   <url>http://warcbase.org/</url>
 
   <licenses>
     <license>
       <name>The Apache Software License, Version 2.0</name>
       <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
       <distribution>repo</distribution>
     </license>
   </licenses>
 
   <scm>
     <connection>scm:git:git@github.com:lintool/warcbase.git</connection>
     <developerConnection>scm:git:git@github.com:lintool/warcbase.git</developerConnection>
     <url>git@github.com:lintool/warcbase.git</url>
   </scm>
 
   <developers>
     <developer>
       <id>lintool</id>
       <name>Jimmy Lin</name>
       <email>jimmylin@umd.edu</email>
     </developer>
     <developer>
       <id>milad621</id>
       <name>Milad Gholami</name>
       <email>mgholami@cs.umd.edu</email>
     </developer>
     <developer>
       <id>jeffyRao</id>
       <name>Jinfeng Rao</name>
       <email>jinfeng@cs.umd.edu</email>
     </developer>
   </developers>
 
   <parent>
     <groupId>org.sonatype.oss</groupId>
     <artifactId>oss-parent</artifactId>
     <version>7</version>
   </parent>
 
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
     <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
     <jettyVersion>8.1.12.v20130726</jettyVersion>
     <hadoop.version>2.6.0-cdh5.4.1</hadoop.version>
     <hbase.version>1.0.0-cdh5.4.1</hbase.version>
     <zookeeper.version>3.4.5-cdh5.4.1</zookeeper.version>
     <spark.version>1.3.0-cdh5.4.1</spark.version>
     <scala.version>2.10.4</scala.version>
   </properties>
 
   <build>
     <plugins>
 
  <plugin>
     <artifactId>maven-clean-plugin</artifactId>
     <version>2.6.1</version>
     <configuration>
       <filesets>
         <fileset>
           <directory>src/main/solr/lib</directory>
           <followSymlinks>false</followSymlinks>
         </fileset>
       </filesets>
     </configuration>
   </plugin>
 
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-compiler-plugin</artifactId>
         <version>3.2</version>
         <configuration>
           <source>1.7</source>
           <target>1.7</target>
         </configuration>
       </plugin>
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-shade-plugin</artifactId>
         <version>2.3</version>
         <executions>
           <execution>
             <phase>package</phase>
             <goals>
               <goal>shade</goal>
             </goals>
             <configuration>
 
 <!--
 http://mail-archives.apache.org/mod_mbox/lucene-java-user/201308.mbox/%3CWC20130822094206.310452@isped.u-bordeaux2.fr%3E
 -->
 
 <transformers><transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
     <resource>META-INF/services/org.apache.lucene.codecs.Codec</resource></transformer></transformers>
 
 
               <!-- This fixes the issue "Invalid signature file digest for Manifest main attributes"
                    cf. http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html -->
               <filters>
                 <filter>
                   <artifact>*:*</artifact>
                   <excludes>
                     <exclude>META-INF/*.SF</exclude>
                     <exclude>META-INF/*.DSA</exclude>
                     <exclude>META-INF/*.RSA</exclude>
                   </excludes>
                 </filter>
               </filters>
               <!-- this will create both a normal thin jar and also a fatjar -->
               <shadedArtifactAttached>true</shadedArtifactAttached>
               <shadedClassifierName>fatjar</shadedClassifierName>
               <artifactSet>
                 <excludes>
                   <exclude>org.apache.hadoop:*</exclude>
                 </excludes>
               </artifactSet>
             </configuration>
           </execution>
         </executions>
       </plugin>
 
 <plugin>
     <groupId>org.apache.maven.plugins</groupId>
     <artifactId>maven-dependency-plugin</artifactId>
     <version>2.4</version>
     <executions>
         <execution>
             <id>copy</id>
             <phase>package</phase>
             <goals>
                 <goal>copy-dependencies</goal>
             </goals>
             <configuration>
                 <outputDirectory>
                     src/main/solr/lib
                 </outputDirectory>
             </configuration>
         </execution>
     </executions>
 </plugin>
 
       <plugin>
         <groupId>org.codehaus.mojo</groupId>
         <artifactId>appassembler-maven-plugin</artifactId>
         <version>1.9</version>
         <configuration>
           <extraJvmArguments>-Xms512M -Xmx24576M</extraJvmArguments>
           <programs>
             <program>
               <mainClass>org.warcbase.WarcbaseAdmin</mainClass>
               <name>WarcbaseAdmin</name>
             </program>
             <program>
               <mainClass>org.warcbase.data.UrlMappingBuilder</mainClass>
               <name>UrlMappingBuilder</name>
             </program>
             <program>
               <mainClass>org.warcbase.data.UrlMapping</mainClass>
               <name>UrlMapping</name>
             </program>
             <program>
               <mainClass>org.warcbase.data.ExtractLinks</mainClass>
               <name>ExtractLinks</name>
             </program>
             <program>
               <mainClass>org.warcbase.data.ExtractSiteLinks</mainClass>
               <name>ExtractSiteLinks</name>
             </program>
             <program>
               <mainClass>org.warcbase.ingest.IngestFiles</mainClass>
               <name>IngestFiles</name>
             </program>
             <program>
               <mainClass>org.warcbase.ingest.SearchForUrl</mainClass>
               <name>SearchForUrl</name>
             </program>
             <program>
               <mainClass>org.warcbase.browser.WarcBrowser</mainClass>
               <name>WarcBrowser</name>
             </program>
             <program>
               <mainClass>org.warcbase.analysis.DetectDuplicates</mainClass>
               <name>DetectDuplicates</name>
             </program>
             <program>
               <mainClass>org.warcbase.browser.SeleniumBrowser</mainClass>
               <name>SeleniumBrowser</name>
             </program>
           </programs>
         </configuration>
       </plugin>
       <!-- for Scala -->
       <plugin>
         <groupId>org.scala-tools</groupId>
         <artifactId>maven-scala-plugin</artifactId>
         <version>2.15.2</version>
         <executions>
           <execution>
             <phase>process-resources</phase>
             <goals>
               <goal>add-source</goal>
               <goal>compile</goal>
             </goals>
           </execution>
           <execution>
             <id>scala-test-compile</id>
             <phase>process-test-resources</phase>
             <goals>
               <goal>testCompile</goal>
             </goals>
           </execution>
         </executions>
         <configuration>
           <scalaVersion>${scala.version}</scalaVersion>
           <sendJavaToScalac>true</sendJavaToScalac>
           <args>
             <arg>-target:jvm-1.7</arg>
             <arg>-g:vars</arg>
             <arg>-deprecation</arg>
             <arg>-dependencyfile</arg>
             <arg>${project.build.directory}/.scala_dependencies</arg>
           </args>
         </configuration>
       </plugin>
     </plugins>
   </build>
 
   <repositories>
     <repository>
       <id>maven</id>
       <url>http://repo.maven.apache.org/maven2/</url>
     </repository>
     <repository>
       <id>cloudera</id>
       <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
     </repository>
     <repository>
       <id>internetarchive</id>
       <name>Internet Archive Maven Repository</name>
       <url>http://builds.archive.org:8080/maven2</url>
     </repository>
   </repositories>
 
   <dependencies>
     <dependency>
       <groupId>junit</groupId>
       <artifactId>junit</artifactId>
       <version>4.12</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>org.scalatest</groupId>
       <artifactId>scalatest_2.10</artifactId>
       <version>2.2.4</version>
       <scope>test</scope>
     </dependency>
     <dependency>
       <groupId>commons-codec</groupId>
       <artifactId>commons-codec</artifactId>
       <version>1.8</version>
     </dependency>
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
       <version>2.4</version>
     </dependency>
     <dependency>
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
       <version>1.7.3</version>
     </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
       <version>14.0.1</version> <!-- downgrade for Hadoop WARC indexer -->
     </dependency>
     <dependency>
       <groupId>tl.lin</groupId>
       <artifactId>lintools-datatypes</artifactId>
       <version>1.0.0</version>
     </dependency>
 
     <!-- Begin: Hadoop-related dependencies -->
     <dependency>
       <groupId>org.apache.hbase</groupId>
       <artifactId>hbase-client</artifactId>
       <version>${hbase.version}</version>
       <exclusions>
         <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion>
       </exclusions>
     </dependency>
     <dependency>
       <groupId>org.apache.hbase</groupId>
       <artifactId>hbase-server</artifactId>
       <version>${hbase.version}</version>
       <exclusions>
         <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion>
         <exclusion><groupId>org.mortbay.jetty</groupId><artifactId>servlet-api-2.5</artifactId></exclusion>
         <exclusion><groupId>javax.servlet</groupId><artifactId>servlet-api</artifactId></exclusion>
         <exclusion><groupId>asm</groupId><artifactId>asm</artifactId></exclusion>
       </exclusions>
     </dependency>
 
     <!-- See http://www.cloudera.com/content/cloudera-content/cloudera-docs/CDH5/latest/CDH-Version-and-Packaging-Information/cdhvd_hadoop_api_dependencies.html -->
     <dependency>
       <groupId>org.apache.hadoop</groupId>
       <artifactId>hadoop-client</artifactId>
       <version>${hadoop.version}</version>
       <exclusions>
         <exclusion><groupId>javax.servlet</groupId><artifactId>servlet-api</artifactId></exclusion>
       </exclusions>
     </dependency>
 
     <dependency>
       <groupId>org.apache.zookeeper</groupId>
       <artifactId>zookeeper</artifactId>
       <version>${zookeeper.version}</version>
     </dependency>
 
     <!-- End: Hadoop-related dependencies -->
 
     <dependency>
       <groupId>org.netpreserve.openwayback</groupId>
       <artifactId>openwayback-core</artifactId>
       <version>2.0.0.BETA.2</version>
       <exclusions>
         <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion>
         <exclusion><groupId>ch.qos.logback</groupId><artifactId>logback-classic</artifactId></exclusion>
         <exclusion><groupId>org.netpreserve.openwayback</groupId><artifactId>openwayback-cdx-server</artifactId></exclusion>
         <exclusion><groupId>org.netpreserve.openwayback</groupId><artifactId>openwayback-access-control-core</artifactId></exclusion>
         <!-- swap in our own latest versions -->
         <exclusion><groupId>it.unimi.dsi</groupId><artifactId>dsiutils</artifactId></exclusion>
         <exclusion><groupId>fastutil</groupId><artifactId>fastutil</artifactId></exclusion>
       </exclusions>
     </dependency>
     <dependency>
       <groupId>org.netpreserve.commons</groupId>
       <artifactId>webarchive-commons</artifactId>
       <version>1.1.4</version>
       <exclusions>
         <exclusion><groupId>org.apache.hadoop</groupId><artifactId>hadoop-core</artifactId></exclusion>
         <exclusion><groupId>commons-lang</groupId><artifactId>commons-lang</artifactId></exclusion>
         <exclusion><groupId>fastutil</groupId><artifactId>fastutil</artifactId></exclusion>
       </exclusions>
     </dependency>
 
     <dependency>
       <groupId>it.unimi.dsi</groupId>
       <artifactId>dsiutils</artifactId>
       <version>2.2.0</version>
       <exclusions>
         <exclusion><groupId>ch.qos.logback</groupId><artifactId>logback-classic</artifactId></exclusion>
         <exclusion><groupId>commons-lang</groupId><artifactId>commons-lang</artifactId></exclusion>
       </exclusions>
     </dependency>
     <dependency>
       <groupId>it.unimi.dsi</groupId>
       <artifactId>fastutil</artifactId>
       <version>6.5.15</version>
       <exclusions>
         <exclusion><groupId>commons-lang</groupId><artifactId>commons-lang</artifactId></exclusion>
       </exclusions>
     </dependency>
 
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-server</artifactId>
       <version>${jettyVersion}</version>
     </dependency>
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-webapp</artifactId>
       <version>${jettyVersion}</version>
       <optional>true</optional>
     </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>
       <artifactId>slf4j-log4j12</artifactId>
       <version>1.6.4</version>
     </dependency>
 
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-lang3</artifactId>
       <version>3.0</version>
     </dependency>
     <dependency>
       <groupId>commons-cli</groupId>
       <artifactId>commons-cli</artifactId>
       <version>1.2</version>
     </dependency>
 
     <dependency>
       <groupId>net.sf.opencsv</groupId>
       <artifactId>opencsv</artifactId>
       <version>2.3</version>
     </dependency>
 
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-core</artifactId>
       <version>1.9</version>
     </dependency>
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-parsers</artifactId>
       <version>1.9</version>
     </dependency>
 
     <dependency>
       <groupId>org.antlr</groupId>
       <artifactId>antlr</artifactId>
       <version>3.5.2</version>
     </dependency>
 
     <dependency>
       <groupId>org.seleniumhq.selenium</groupId>
       <artifactId>selenium-java</artifactId>
       <version>2.42.2</version>
       <exclusions>
         <exclusion><groupId>org.seleniumhq.selenium</groupId><artifactId>selenium-htmlunit-driver</artifactId></exclusion>
         <exclusion><groupId>org.seleniumhq.selenium</groupId><artifactId>selenium-ie-driver</artifactId></exclusion>
         <exclusion><groupId>org.webbitserver</groupId><artifactId>webbit</artifactId></exclusion>
       </exclusions>
     </dependency>
 
     <dependency>
       <groupId>org.scala-lang</groupId>
       <artifactId>scala-library</artifactId>
       <version>2.10.4</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
       <artifactId>spark-core_2.10</artifactId>
       <version>${spark.version}</version>
       <exclusions>
         <exclusion><groupId>com.typesafe</groupId><artifactId>config</artifactId></exclusion>
         <exclusion><groupId>org.xerial.snappy</groupId><artifactId>snappy-java</artifactId>
       </exclusion>
       </exclusions>
     </dependency>
 
     <dependency>
       <groupId>com.chuusai</groupId>
       <artifactId>shapeless_2.10.4</artifactId>
       <version>2.0.0</version>
     </dependency>
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-core</artifactId>
       <version>2.6.3</version>
     </dependency>
-
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>
       <version>2.6.3</version>
     </dependency>
+    <dependency>
+      <groupId>org.json4s</groupId>
+      <artifactId>json4s-jackson_2.10</artifactId>
+      <version>3.2.10</version>
+      <!-- see this issue: http://stackoverflow.com/questions/32400061/spark-streaming-json4s-jackson-dependency-problems -->
+    </dependency>
 
     <dependency>
       <groupId>com.typesafe</groupId>
       <artifactId>config</artifactId>
       <version>1.2.1</version>
     </dependency>
 
     <dependency>
       <groupId>org.xerial.snappy</groupId>
       <artifactId>snappy-java</artifactId>
       <version>1.0.5</version>
     </dependency>
 
     <dependency>
       <groupId>edu.stanford.nlp</groupId>
       <artifactId>stanford-corenlp</artifactId>
       <version>3.4.1</version>
     </dependency>
 
     <dependency>
       <groupId>com.syncthemall</groupId>
       <artifactId>boilerpipe</artifactId>
       <version>1.2.2</version>
     </dependency>
 
     <dependency>
       <groupId>xerces</groupId>
       <artifactId>xercesImpl</artifactId>
       <version>2.11.0</version>
     </dependency>
 
     <dependency>
       <groupId>org.apache.lucene</groupId>
       <artifactId>lucene-core</artifactId>
       <version>4.7.2</version>
     </dependency>
     <dependency>
       <groupId>org.apache.solr</groupId>
       <artifactId>solr-core</artifactId>
       <version>4.7.2</version>
       <exclusions>
         <exclusion><artifactId>slf4j-api</artifactId><groupId>org.slf4j</groupId></exclusion>
         <exclusion><artifactId>org.apache.hadoop</artifactId><groupId>hadoop-annotations</groupId></exclusion>
         <exclusion><artifactId>org.apache.hadoop</artifactId><groupId>hadoop-common</groupId></exclusion>
         <exclusion><artifactId>org.apache.hadoop</artifactId><groupId>hadoop-hdfs</groupId></exclusion>
         <exclusion><groupId>com.typesafe</groupId><artifactId>config</artifactId></exclusion>
       </exclusions>
     </dependency>
 
     <dependency>
       <groupId>uk.bl.wa.discovery</groupId>
       <artifactId>warc-hadoop-indexer</artifactId>
       <version>2.2.0-BETA-5</version>
       <exclusions>
         <exclusion><groupId>asm</groupId><artifactId>asm</artifactId></exclusion>
         <exclusion><groupId>com.typesafe</groupId><artifactId>config</artifactId></exclusion>
       </exclusions>
     </dependency>
 
   </dependencies>
 </project>
diff --git a/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala b/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala
index 1b54346..fabee06 100644
--- a/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala
+++ b/src/main/scala/org/warcbase/spark/matchbox/RecordLoader.scala
@@ -1,45 +1,50 @@
 /*
  * Warcbase: an open-source platform for managing web archives
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.warcbase.spark.matchbox
 
 import org.apache.hadoop.io.LongWritable
 import org.apache.spark.{SerializableWritable, SparkContext}
 import org.apache.spark.rdd.RDD
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
 import org.warcbase.io.GenericArchiveRecordWritable.ArchiveFormat
 import org.warcbase.io.{GenericArchiveRecordWritable, WarcRecordWritable, ArcRecordWritable}
 import org.warcbase.mapreduce.{WacGenericInputFormat, WacWarcInputFormat, WacArcInputFormat}
 import org.warcbase.spark.archive.io.{WarcRecord, ArcRecord, ArchiveRecord, GenericArchiveRecord}
 
 object RecordLoader {
   def loadArc(path: String, sc: SparkContext): RDD[ArchiveRecord] = {
     sc.newAPIHadoopFile(path, classOf[WacArcInputFormat], classOf[LongWritable], classOf[ArcRecordWritable])
       .map(r => new ArcRecord(new SerializableWritable(r._2)))
   }
 
   def loadWarc(path: String, sc: SparkContext): RDD[ArchiveRecord] = {
     sc.newAPIHadoopFile(path, classOf[WacWarcInputFormat], classOf[LongWritable], classOf[WarcRecordWritable])
       .filter(r => r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))
       .map(r => new WarcRecord(new SerializableWritable(r._2)))
   }
 
   def loadArchives(path: String, sc: SparkContext): RDD[ArchiveRecord] = {
     sc.newAPIHadoopFile(path, classOf[WacGenericInputFormat], classOf[LongWritable], classOf[GenericArchiveRecordWritable])
       .filter(r => (r._2.getFormat == ArchiveFormat.ARC) ||
         ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response")))
       .map(r => new GenericArchiveRecord(new SerializableWritable(r._2)))
   }
+
+  def loadTweets(path: String, sc: SparkContext): RDD[JValue] =
+    sc.textFile(path).filter(line => !line.startsWith("{\"delete\":")).map(line => parse(line))
 }
diff --git a/src/main/scala/org/warcbase/spark/matchbox/TweetUtils.scala b/src/main/scala/org/warcbase/spark/matchbox/TweetUtils.scala
new file mode 100644
index 0000000..1b9d14e
--- /dev/null
+++ b/src/main/scala/org/warcbase/spark/matchbox/TweetUtils.scala
@@ -0,0 +1,20 @@
+package org.warcbase.spark.matchbox
+
+import org.json4s.JsonAST._
+
+object TweetUtils {
+  implicit class JsonTweet(tweet: JValue) {
+    implicit lazy val formats = org.json4s.DefaultFormats
+
+    def id(): String = (tweet \ "id_str").extract[String]
+    def createdAt(): String = (tweet \ "created_at").extract[String]
+    def text(): String = (tweet \ "text").extract[String]
+    def lang: String = (tweet \ "lang").extract[String]
+
+    def username(): String = (tweet \ "user" \ "screen_name").extract[String]
+    def isVerifiedUser(): Boolean = (tweet \ "user" \ "screen_name").extract[String] == "false"
+
+    def followerCount: Int = (tweet \ "user" \ "followers_count").extract[Int]
+    def friendCount: Int = (tweet \ "user" \ "friends_count").extract[Int]
+  }
+}
\ No newline at end of file