diff --git a/pom.xml b/pom.xml
index 6505e1a..17f9822 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,533 +1,538 @@
     <zookeeper.version>3.4.5-cdh5.4.1</zookeeper.version>
-    <spark.version>1.3.0-cdh5.4.1</spark.version>
+    <spark.version>1.3.0-cdh5.4.1</spark.version>
     <scala.version>2.10.4</scala.version>
 
     <dependency>
       <groupId>com.google.guava</groupId>
       <artifactId>guava</artifactId>
-      <version>14.0.1</version>
+      <version>19.0</version>
     </dependency>
 
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-graphx_2.10</artifactId>
+      <version>${spark.version}</version>
+    </dependency>
 
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-core</artifactId>
-      <version>2.6.3</version>
+      <version>2.7.2</version>
     </dependency>
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>
-      <version>2.6.3</version>
+      <version>2.7.2</version>
     </dependency>
diff --git a/src/main/scala/org/warcbase/spark/matchbox/ExtractGraph.scala b/src/main/scala/org/warcbase/spark/matchbox/ExtractGraph.scala
new file mode 100644
index 0000000..a14cbaf
--- /dev/null
+++ b/src/main/scala/org/warcbase/spark/matchbox/ExtractGraph.scala
@@ -0,0 +1,89 @@
+/*
+ * Warcbase: an open-source platform for managing web archives
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.warcbase.spark.matchbox
+
+import org.apache.spark.graphx._
+import org.apache.spark.rdd.RDD
+import org.warcbase.spark.archive.io.ArchiveRecord
+import org.warcbase.spark.rdd.RecordRDD._
+import org.warcbase.spark.utils.JsonUtil
+
+/**
+ *
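+ * A minimal usage sketch from spark-shell (`records` is assumed to be an RDD[ArchiveRecord],
+ * e.g. as produced by this package's RecordLoader; importing ExtractGraph._ brings the
+ * writeAsJson implicit into scope):
+ *
+ *   import org.warcbase.spark.matchbox.ExtractGraph._
+ *   val graph = ExtractGraph(records)
+ *   graph.writeAsJson("nodes.partjson", "links.partjson")
+ *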
+ * e.g. when done:
+ * $ cat nodes.partjson/part-* > nodes.json && cat links.partjson/part-* > links.json
+ * $ jq -c -n --slurpfile nodes nodes.json --slurpfile links links.json '{nodes: $nodes, links: $links}' > graph.json
+ *
+ */
+
+object ExtractGraph {
+  def pageHash(url: String): VertexId = {
+    url.hashCode.toLong
+  }
+
+  case class VertexData(domain: String, pageRank: Double, inDegree: Int, outDegree: Int)
+  case class EdgeData(date: String, src: String, dst: String)
+
+  def apply(records: RDD[ArchiveRecord], dynamic: Boolean = false,
+            tolerance: Double = 0.001, numIter: Int = 3): Graph[VertexData, EdgeData] = {
+    val vertices: RDD[(VertexId, VertexData)] = records.keepValidPages()
+      .flatMap(r => ExtractLinks(r.getUrl, r.getContentString))
+      .flatMap(r => List(ExtractTopLevelDomain(r._1).replaceAll("^\\s*www\\.", ""), ExtractTopLevelDomain(r._2).replaceAll("^\\s*www\\.", "")))
+      .distinct
+      .map(r => (pageHash(r), VertexData(r, 0.0, 0, 0)))
+
+    val edges: RDD[Edge[EdgeData]] = records.keepValidPages()
+      .map(r => (r.getCrawldate, ExtractLinks(r.getUrl, r.getContentString)))
+      .flatMap(r => r._2.map(f => (r._1, ExtractTopLevelDomain(f._1).replaceAll("^\\s*www\\.", ""), ExtractTopLevelDomain(f._2).replaceAll("^\\s*www\\.", ""))))
+      .filter(r => r._2 != "" && r._3 != "")
+      .map(r => Edge(pageHash(r._2), pageHash(r._3), EdgeData(r._1, r._2, r._3)))
+
+    val graph = Graph(vertices, edges)
+
+    val graphInOut = graph.outerJoinVertices(graph.inDegrees) {
+      case (vid, rv, inDegOpt) => VertexData(rv.domain, rv.pageRank, inDegOpt.getOrElse(0), rv.outDegree)
+    }.outerJoinVertices(graph.outDegrees) {
+      case (vid, rv, outDegOpt) => VertexData(rv.domain, rv.pageRank, rv.inDegree, outDegOpt.getOrElse(0))
+    }
+
+    if (dynamic) {
+      graphInOut.outerJoinVertices(graph.pageRank(tolerance).vertices) {
+        case (vid, rv, pageRankOpt) => VertexData(rv.domain, pageRankOpt.getOrElse(0.0), rv.inDegree, rv.outDegree)
+      }
+    } else {
+      graphInOut.outerJoinVertices(graph.staticPageRank(numIter).vertices) {
+        case (vid, rv, pageRankOpt) => VertexData(rv.domain, pageRankOpt.getOrElse(0.0), rv.inDegree, rv.outDegree)
+      }
+    }
+  }
+
+  implicit class GraphWriter(graph: Graph[VertexData, EdgeData]) {
+    def writeAsJson(verticesPath: String, edgesPath: String) = {
+      // Combine edges of a given (date, src, dst) combination into single record with count value.
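+      // countItems() is assumed here (from this package's RecordRDD helpers) to yield an RDD of
+      // (item, count) pairs, so r._1 is the Edge[EdgeData] and r._2 its occurrence count.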
+      val edgesCounted = graph.edges.countItems().map {
+        r => Map("date" -> r._1.attr.date,
+          "src" -> r._1.attr.src,
+          "dst" -> r._1.attr.dst,
+          "count" -> r._2)
+      }
+
+      edgesCounted.map(r => JsonUtil.toJson(r)).saveAsTextFile(edgesPath)
+      graph.vertices.map(r => JsonUtil.toJson(r._2)).saveAsTextFile(verticesPath)
+    }
+  }
+}
+
diff --git a/src/main/scala/org/warcbase/spark/matchbox/NERCombinedJson.scala b/src/main/scala/org/warcbase/spark/matchbox/NERCombinedJson.scala
index c7421e2..180012a 100644
--- a/src/main/scala/org/warcbase/spark/matchbox/NERCombinedJson.scala
+++ b/src/main/scala/org/warcbase/spark/matchbox/NERCombinedJson.scala
@@ -1,149 +1,129 @@
 package org.warcbase.spark.matchbox
 
 import java.io.BufferedReader
 import java.io.BufferedWriter
 import java.io.InputStreamReader
 import java.io.OutputStreamWriter
 
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs._
-import org.apache.spark.rdd.RDD
 import org.apache.spark.SparkContext
-import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
-import com.fasterxml.jackson.module.scala.DefaultScalaModule
-import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
-
+import org.warcbase.spark.utils.JsonUtil
 
 import scala.collection.mutable.MutableList
 import scala.util.Random
 
 /**
   * Classifies records using NER and stores results as JSON
   */
 class NERCombinedJson extends Serializable {
 
   def combineKeyCountLists (l1: List[(String, Int)], l2: List[(String, Int)]): List[(String, Int)] = {
     (l1 ++ l2).groupBy(_._1 ).map {
       case (key, tuples) => (key, tuples.map( _._2).sum)
     }.toList
   }
 
   /** Combines directory of part-files containing one JSON array per line
     * into a single file containing a single JSON array of arrays.
     *
     * @param srcDir name of directory holding files, also name that will
     *               be given to JSON file.
     */
   def partDirToFile(srcDir: String): Unit = {
     val hadoopConfig = new Configuration()
     val hdfs = FileSystem.get(hadoopConfig)
     val rnd = new Random
     val srcPath = new Path(srcDir)
     val tmpFile = rnd.alphanumeric.take(8).mkString + ".almostjson"
     val tmpPath = new Path(tmpFile)
 
     // Merge part-files into single file
     FileUtil.copyMerge(hdfs, srcPath, hdfs, tmpPath, false, hadoopConfig, null)
 
     // Read file of JSON arrays, write into single JSON array of arrays
     val fsInStream = hdfs.open(tmpPath)
     val inFile = new BufferedReader(new InputStreamReader(fsInStream))
     hdfs.delete(srcPath, true)  // Don't need part-files anymore
     val fsOutStream = hdfs.create(srcPath, true)  // path was dir of part-files,
                                                   // now is a file of JSON
     val outFile = new BufferedWriter(new OutputStreamWriter(fsOutStream))
 
     outFile.write("[")
     val line = inFile.readLine()
     if (line != null) outFile.write(line)
     Iterator.continually(inFile.readLine()).takeWhile(_ != null).foreach(s => {outFile.write(", " + s)})
     outFile.write("]")
     outFile.close()
     inFile.close()
 
     hdfs.delete(tmpPath, false)
   }
 
   /** Do NER classification on input path, output JSON.
     *
     * @param iNerClassifierFile path of classifier file
     * @param inputFile path of file with tuples (date: String, url: String, content: String)
     *                  from which to extract entities
     * @param outputFile path of output file (e.g., "entities.json")
     * @param sc Spark context object
     */
   def classify(iNerClassifierFile: String, inputFile: String, outputFile: String, sc: SparkContext) {
     val out = sc.textFile(inputFile)
       .mapPartitions(iter => {
         NER3Classifier.apply(iNerClassifierFile)
         iter.map(line => {
           val ind1 = line.indexOf(",")
           val ind2 = line.indexOf(",", ind1 + 1)
           (line.substring(1, ind1), line.substring(ind1 + 1, ind2), line.substring(ind2 + 1, line.length - 1))
         })
         .map(r => {
           val classifiedJson = NER3Classifier.classify(r._3)
-          val jUtl = new JsonUtil
-          val classifiedMap = jUtl.fromJson[Map[String,List[String]]](classifiedJson)
+          //val jUtl = new JsonUtil
+          //val classifiedMap = JsonUtil.fromJson[Map[String,List[String]]](classifiedJson)
+          val classifiedMap = JsonUtil.fromJson(classifiedJson)
           val classifiedMapCountTuples: Map[String, List[(String, Int)]] = classifiedMap.map {
-            case (nerType, entityList) => (nerType, entityList.groupBy(identity).mapValues(_.size).toList)
+            case (nerType, entityList: List[String]) => (nerType, entityList.groupBy(identity).mapValues(_.size).toList)
           }
           ((r._1, r._2), classifiedMapCountTuples)
         })
       })
       .reduceByKey( (a, b) => (a ++ b).keySet.map(r => (r, combineKeyCountLists(a(r), b(r)))).toMap)
       .mapPartitions(iter => {
-        val jUtl = new JsonUtil
         iter.map(r => {
           val nerRec = new NerRecord(r._1._1, r._1._2)
           r._2.foreach(entityMap => {
             // e.g., entityMap = "PERSON" -> List(("Jack", 1), ("Diane", 3))
             val ec = new EntityCounts(entityMap._1)
             entityMap._2.foreach(e => {
               ec.entities += new Entity(e._1, e._2)
             })
             nerRec.ner += ec
           })
-          jUtl.toJson(nerRec)
+          JsonUtil.toJson(nerRec)
         })
       })
       .saveAsTextFile(outputFile)
 
     partDirToFile(outputFile)
   }
 
   class Entity(iEntity: String, iFreq: Int) {
     var entity: String = iEntity
     var freq: Int = iFreq
   }
 
   class EntityCounts(iNerType: String) {
     var nerType: String = iNerType
     var entities = MutableList[Entity]()
   }
 
   class NerRecord(recDate: String, recDomain: String) {
     var date = recDate
     var domain = recDomain
     var ner = MutableList[EntityCounts]()
   }
 }
 
-class JsonUtil extends Serializable {
-  val mapper = new ObjectMapper() with ScalaObjectMapper
-  mapper.registerModule(DefaultScalaModule)
-  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
-
-  def toJson(value: Map[Symbol, Any]): String = {
-    toJson(value map { case (k,v) => k.name -> v})
-  }
-  def toJson(value: Any): String = {
-    mapper.writeValueAsString(value)
-  }
-
-  def fromJson[T](json: String)(implicit m : Manifest[T]): T = {
-    mapper.readValue[T](json)
-  }
-}
diff --git a/src/main/scala/org/warcbase/spark/utils/JsonUtil.scala b/src/main/scala/org/warcbase/spark/utils/JsonUtil.scala
new file mode 100644
index 0000000..cf8b0d9
--- /dev/null
+++ b/src/main/scala/org/warcbase/spark/utils/JsonUtil.scala
@@ -0,0 +1,22 @@
+package org.warcbase.spark.utils
+
+import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper}
+import com.fasterxml.jackson.module.scala.DefaultScalaModule
+
+object JsonUtil extends Serializable {
+  val mapper = new ObjectMapper()
+  mapper.registerModule(DefaultScalaModule)
+  mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
+
+  def toJson(value: Map[Symbol, Any]): String = {
+    toJson(value map { case (k,v) => k.name -> v})
+  }
+
+  def toJson(value: Any): String = {
+    mapper.writeValueAsString(value)
+  }
+
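+  // Deserializes a JSON object into an untyped Scala Map. Callers expecting more specific value
+  // types (e.g. the Map[String, List[String]] handled in NERCombinedJson.classify) narrow them
+  // by pattern matching on the values.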
+  def fromJson(json: String): Map[String, Any] = {
+    mapper.readValue(json, classOf[Map[String, Any]])
+  }
+}