diff --git a/pom.xml b/pom.xml
index 136e0d5..ec3448c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,337 +1,359 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 
-  <modelVersion>4.0.0</modelVersion>
-  <groupId>org.warcbase</groupId>
-  <artifactId>warcbase</artifactId>
-  <packaging>jar</packaging>
-  <version>0.1.0-SNAPSHOT</version>
-  <name>warcbase</name>
-  <description>WARC + HBase</description>
-  <url>http://warcbase.org/</url>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>org.warcbase</groupId>
+    <artifactId>warcbase</artifactId>
+    <packaging>jar</packaging>
+    <version>0.1.0-SNAPSHOT</version>
+    <name>warcbase</name>
+    <description>WARC + HBase</description>
+    <url>http://warcbase.org/</url>
 
-  <licenses>
-    <license>
-      <name>The Apache Software License, Version 2.0</name>
-      <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
-      <distribution>repo</distribution>
-    </license>
-  </licenses>
+    <licenses>
+        <license>
+            <name>The Apache Software License, Version 2.0</name>
+            <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
+            <distribution>repo</distribution>
+        </license>
+    </licenses>
 
-  <scm>
-    <connection>scm:git:git@github.com:lintool/warcbase.git</connection>
-    <developerConnection>scm:git:git@github.com:lintool/warcbase.git</developerConnection>
-    <url>git@github.com:lintool/warcbase.git</url>
-  </scm>
+    <scm>
+        <connection>scm:git:git@github.com:lintool/warcbase.git</connection>
+        <developerConnection>scm:git:git@github.com:lintool/warcbase.git</developerConnection>
+        <url>git@github.com:lintool/warcbase.git</url>
+    </scm>
 
-  <developers>
-    <developer>
-      <id>lintool</id>
-      <name>Jimmy Lin</name>
-      <email>jimmylin@umd.edu</email>
-    </developer>
-    <developer>
-      <id>milad621</id>
-      <name>Milad Gholami</name>
-      <email>mgholami@cs.umd.edu</email>
-    </developer>
-  </developers>
+    <developers>
+        <developer>
+            <id>lintool</id>
+            <name>Jimmy Lin</name>
+            <email>jimmylin@umd.edu</email>
+        </developer>
+        <developer>
+            <id>milad621</id>
+            <name>Milad Gholami</name>
+            <email>mgholami@cs.umd.edu</email>
+        </developer>
+    </developers>
 
-  <parent>
-    <groupId>org.sonatype.oss</groupId>
-    <artifactId>oss-parent</artifactId>
-    <version>7</version>
-  </parent>
+    <parent>
+        <groupId>org.sonatype.oss</groupId>
+        <artifactId>oss-parent</artifactId>
+        <version>7</version>
+    </parent>
 
-  <properties>
-    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
-    <jettyVersion>8.1.12.v20130726</jettyVersion>
-    <hadoop.version>2.0.0-mr1-cdh4.3.0</hadoop.version>
-    <hadoop.version2>2.0.0-cdh4.3.0</hadoop.version2>
-    <hbase.version>0.94.6-cdh4.3.0</hbase.version>
-    <zookeeper.version>3.4.5-cdh4.3.0</zookeeper.version>
-    <jwat.version>1.0.0</jwat.version>
-  </properties>
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+        <jettyVersion>8.1.12.v20130726</jettyVersion>
+        <hadoop.version>2.0.0-mr1-cdh4.3.0</hadoop.version>
+        <hadoop.version2>2.0.0-cdh4.3.0</hadoop.version2>
+        <hbase.version>0.94.6-cdh4.3.0</hbase.version>
+        <zookeeper.version>3.4.5-cdh4.3.0</zookeeper.version>
+        <jwat.version>1.0.0</jwat.version>
 
-  <build>
-    <plugins>
-          <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-shade-plugin</artifactId>
-            <version>2.1</version>
-            <executions>
-              <execution>
-                <phase>package</phase>
-                <goals>
-                  <goal>shade</goal>
-                </goals>
-                <configuration>
 
-<!-- This fits the issue "Invalid signature file digest for Manifest main attributes"
-  cf. http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html
- -->
-          <filters>
-            <filter>
-              <artifact>*:*</artifact>
-              <excludes>
-                <exclude>META-INF/*.SF</exclude>
-                <exclude>META-INF/*.DSA</exclude>
-                <exclude>META-INF/*.RSA</exclude>
-              </excludes>
-            </filter>
-          </filters>
+        <cdh.version>cdh4.5.0</cdh.version> <!-- NOTE(review): hadoop/hbase/zookeeper versions above still pin cdh4.3.0; confirm the mix is intended -->
+        <pig.version>0.11.0-${cdh.version}</pig.version> <!-- resolves to 0.11.0-cdh4.5.0 -->
+    </properties>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-shade-plugin</artifactId>
+                <version>2.1</version>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
 
-                  <!-- this will create both a normal thin jar and also a fatjar -->
-                  <shadedArtifactAttached>true</shadedArtifactAttached>
-                  <shadedClassifierName>fatjar</shadedClassifierName>
-                  <artifactSet>
-                    <excludes>
-                      <exclude>org.apache.hadoop:*</exclude>
-                    </excludes>
-                  </artifactSet>
+                            <!-- This fixes the issue "Invalid signature file digest for Manifest main attributes"
+                              by excluding the META-INF signature files of every bundled artifact from the shaded jar,
+                              cf. http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html -->
+                            <filters>
+                                <filter>
+                                    <artifact>*:*</artifact>
+                                    <excludes>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
+                                    </excludes>
+                                </filter>
+                            </filters>
+
+                            <!-- this will create both a normal thin jar and also a fatjar -->
+                            <shadedArtifactAttached>true</shadedArtifactAttached>
+                            <shadedClassifierName>fatjar</shadedClassifierName>
+                            <artifactSet>
+                                <excludes>
+                                    <exclude>org.apache.hadoop:*</exclude>
+                                </excludes>
+                            </artifactSet>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.codehaus.mojo</groupId>
+                <artifactId>appassembler-maven-plugin</artifactId>
+                <version>1.3.1</version>
+                <configuration>
+                    <programs>
+                        <program>
+                            <mainClass>org.warcbase.ingest.IngestFiles</mainClass>
+                            <name>IngestFiles</name>
+                        </program>
+                        <program>
+                            <mainClass>org.warcbase.ingest.SearchForUri</mainClass>
+                            <name>SearchForUri</name>
+                        </program>
+                        <program>
+                            <mainClass>org.warcbase.browser.WarcBrowser</mainClass>
+                            <name>WarcBrowser</name>
+                        </program>
+                        <program>
+                            <mainClass>org.warcbase.analysis.CountRowTypes</mainClass>
+                            <name>CountRowTypes</name>
+                        </program>
+                        <program>
+                            <mainClass>org.warcbase.analysis.DetectDuplicates</mainClass>
+                            <name>DetectDuplicates</name>
+                        </program>
+                    </programs>
                 </configuration>
-              </execution>
-            </executions>
-          </plugin>
-      <plugin>
-        <groupId>org.codehaus.mojo</groupId>
-        <artifactId>appassembler-maven-plugin</artifactId>
-        <version>1.3.1</version>
-        <configuration>
-          <programs>
-            <program>
-              <mainClass>org.warcbase.ingest.IngestFiles</mainClass>
-              <name>IngestFiles</name>
-            </program>
-            <program>
-              <mainClass>org.warcbase.ingest.SearchForUri</mainClass>
-              <name>SearchForUri</name>
-            </program>
-            <program>
-              <mainClass>org.warcbase.browser.WarcBrowser</mainClass>
-              <name>WarcBrowser</name>
-            </program>
-            <program>
-              <mainClass>org.warcbase.analysis.CountRowTypes</mainClass>
-              <name>CountRowTypes</name>
-            </program>
-            <program>
-              <mainClass>org.warcbase.analysis.DetectDuplicates</mainClass>
-              <name>DetectDuplicates</name>
-            </program>
-          </programs>
-        </configuration>
-      </plugin>
-      <!-- <plugin>
-        <groupId>org.eclipse.jetty</groupId>
-        <artifactId>jetty-maven-plugin</artifactId>
-        <version>${jettyVersion}</version>
-      </plugin> -->
-    </plugins>
-  </build>
+            </plugin>
+            <!-- <plugin>
+              <groupId>org.eclipse.jetty</groupId>
+              <artifactId>jetty-maven-plugin</artifactId>
+              <version>${jettyVersion}</version>
+            </plugin> -->
+        </plugins>
+    </build>
+
+    <repositories>
+        <repository>
+            <id>internetarchive</id>
+            <name>Internet Archive Maven Repository</name>
+            <url>https://builds.archive.org/maven2</url> <!-- HTTPS: Maven 3.8.1+ blocks plain-HTTP repositories -->
+        </repository>
+        <repository>
+            <id>cloudera</id>
+            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
+        </repository>
+    </repositories>
+
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.11</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.pig</groupId>
+            <artifactId>pig</artifactId>
+            <version>${pig.version}</version> <!-- must match the pigunit test dependency, which uses the same property -->
+        </dependency>
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>14.0.1</version>
+        </dependency>
+        <dependency>
+            <groupId>tl.lin</groupId>
+            <artifactId>lintools-datatypes</artifactId>
+            <version>0.9.2</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hbase</groupId>
+            <artifactId>hbase</artifactId>
+            <version>${hbase.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>slf4j-api</groupId>
+                    <artifactId>slf4j-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-api-1.4.3</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>jsp-api</groupId>
+                    <artifactId>jsp-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jsp-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jsp-api-2.1</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>servlet-api-2.5</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>servlet-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>servlet-api</groupId>
+                    <artifactId>servlet-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty-util</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jsp-2.1</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-core</artifactId>
+            <version>${hadoop.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>javax.servlet</groupId>
+                    <artifactId>servlet-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty-util</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jsp-2.1</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hadoop</groupId>
+            <artifactId>hadoop-common</artifactId>
+            <version>${hadoop.version2}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>javax.servlet</groupId>
+                    <artifactId>servlet-api</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jetty-util</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.mortbay.jetty</groupId>
+                    <artifactId>jsp-2.1</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.zookeeper</groupId>
+            <artifactId>zookeeper</artifactId>
+            <version>${zookeeper.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.archive.heritrix</groupId>
+            <artifactId>heritrix-commons</artifactId>
+            <version>3.1.2-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>org.archive.wayback</groupId>
+            <artifactId>wayback-core</artifactId>
+            <version>1.7.0</version>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.jetty</groupId>
+            <artifactId>jetty-server</artifactId>
+            <version>${jettyVersion}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.eclipse.jetty</groupId>
+            <artifactId>jetty-webapp</artifactId>
+            <version>${jettyVersion}</version>
+            <optional>true</optional>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-log4j12</artifactId>
+            <version>1.6.4</version>
+        </dependency>
+        <dependency>
+            <!-- jsoup HTML parser library @ http://jsoup.org/ -->
+            <groupId>org.jsoup</groupId>
+            <artifactId>jsoup</artifactId>
+            <version>1.7.2</version>
+        </dependency>
+        <!--<dependency>
+              <groupId>org.eclipse.jetty.orbit</groupId>
+              <artifactId>javax.servlet</artifactId>
+              <version>3.0.0.v201112011016</version>
+              <scope>provided</scope>
+            </dependency>-->
+        <dependency>
+            <groupId>org.jwat</groupId>
+            <artifactId>jwat-common</artifactId>
+            <version>${jwat.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jwat</groupId>
+            <artifactId>jwat-gzip</artifactId>
+            <version>${jwat.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jwat</groupId>
+            <artifactId>jwat-arc</artifactId>
+            <version>${jwat.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jwat</groupId>
+            <artifactId>jwat-warc</artifactId>
+            <version>${jwat.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.jwat</groupId>
+            <artifactId>jwat-tools</artifactId>
+            <version>0.5.6-SNAPSHOT</version>
+        </dependency>
+
+
+        <dependency>
+            <groupId>org.apache.pig</groupId>
+            <artifactId>pigunit</artifactId>
+            <version>${pig.version}</version>
+            <scope>test</scope>
+        </dependency>
 
-  <repositories>
-    <repository>
-      <id>internetarchive</id>
-      <name>Internet Archive Maven Repository</name>
-      <url>http://builds.archive.org:8080/maven2</url>
-    </repository>
-    <repository>
-      <id>cloudera</id>
-      <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
-    </repository>
-  </repositories>
+        <!-- ANTLR is declared explicitly (test scope only) because the pig artifact does not bundle this parser -->
+        <dependency>
+            <groupId>org.antlr</groupId>
+            <artifactId>antlr</artifactId>
+            <version>3.5.1</version>
+            <scope>test</scope>
+        </dependency>
 
-  <dependencies>
-    <dependency>
-      <groupId>junit</groupId>
-      <artifactId>junit</artifactId>
-      <version>4.11</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.pig</groupId>
-      <artifactId>pig</artifactId>
-      <version>0.11.0-cdh4.4.0</version>
-    </dependency>
-    <dependency>
-      <groupId>com.google.guava</groupId>
-      <artifactId>guava</artifactId>
-      <version>14.0.1</version>
-    </dependency>
-    <dependency>
-      <groupId>tl.lin</groupId>
-      <artifactId>lintools-datatypes</artifactId>
-      <version>0.9.2</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hbase</groupId>
-      <artifactId>hbase</artifactId>
-      <version>${hbase.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>slf4j-api</groupId>
-          <artifactId>slf4j-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.slf4j</groupId>
-          <artifactId>slf4j-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.slf4j</groupId>
-          <artifactId>slf4j-api-1.4.3</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>jsp-api</groupId>
-          <artifactId>jsp-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jsp-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jsp-api-2.1</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>servlet-api-2.5</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>servlet-api</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty-util</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jsp-2.1</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-core</artifactId>
-      <version>${hadoop.version}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty-util</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jsp-2.1</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.hadoop</groupId>
-      <artifactId>hadoop-common</artifactId>
-      <version>${hadoop.version2}</version>
-      <exclusions>
-        <exclusion>
-          <groupId>javax.servlet</groupId>
-          <artifactId>servlet-api</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jetty-util</artifactId>
-        </exclusion>
-        <exclusion>
-          <groupId>org.mortbay.jetty</groupId>
-          <artifactId>jsp-2.1</artifactId>
-        </exclusion>
-      </exclusions>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.zookeeper</groupId>
-      <artifactId>zookeeper</artifactId>
-      <version>${zookeeper.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.archive.heritrix</groupId>
-      <artifactId>heritrix-commons</artifactId>
-      <version>3.1.2-SNAPSHOT</version>
-    </dependency>
-    <dependency>
-      <groupId>org.archive.wayback</groupId>
-      <artifactId>wayback-core</artifactId>
-      <version>1.7.0</version>
-    </dependency>
-    <dependency>
-      <groupId>org.eclipse.jetty</groupId>
-      <artifactId>jetty-server</artifactId>
-      <version>${jettyVersion}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.eclipse.jetty</groupId>
-      <artifactId>jetty-webapp</artifactId>
-      <version>${jettyVersion}</version>
-      <optional>true</optional>
-    </dependency>
-    <dependency>
-      <groupId>org.slf4j</groupId>
-      <artifactId>slf4j-log4j12</artifactId>
-      <version>1.6.4</version>                        
-    </dependency>
-    <dependency>
-  <!-- jsoup HTML parser library @ http://jsoup.org/ -->
-  <groupId>org.jsoup</groupId>
-  <artifactId>jsoup</artifactId>
-  <version>1.7.2</version>
-</dependency>
-<!--<dependency>
-      <groupId>org.eclipse.jetty.orbit</groupId>
-      <artifactId>javax.servlet</artifactId>
-      <version>3.0.0.v201112011016</version>
-      <scope>provided</scope>
-    </dependency>-->
-     <dependency>
-    <groupId>org.jwat</groupId>
-    <artifactId>jwat-common</artifactId>
-    <version>${jwat.version}</version>
-  </dependency>
-  <dependency>
-    <groupId>org.jwat</groupId>
-    <artifactId>jwat-gzip</artifactId>
-    <version>${jwat.version}</version>
-  </dependency>
-  <dependency>
-    <groupId>org.jwat</groupId>
-    <artifactId>jwat-arc</artifactId>
-    <version>${jwat.version}</version>
-  </dependency>
-  <dependency>
-    <groupId>org.jwat</groupId>
-    <artifactId>jwat-warc</artifactId>
-    <version>${jwat.version}</version>
-  </dependency>
-  <dependency>
-    <groupId>org.jwat</groupId>
-    <artifactId>jwat-tools</artifactId>
-    <version>0.5.6-SNAPSHOT</version>
-  </dependency>
-  </dependencies>
+    </dependencies>
 </project>
diff --git a/src/test/java/org/warcbase/pig/TestArcLoaderPig.java b/src/test/java/org/warcbase/pig/TestArcLoaderPig.java
new file mode 100644
index 0000000..0be1ca7
--- /dev/null
+++ b/src/test/java/org/warcbase/pig/TestArcLoaderPig.java
@@ -0,0 +1,80 @@
+package org.warcbase.pig;
+
+import com.google.common.io.Files;
+import com.google.common.io.Resources;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.Iterator;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: alan
+ */
+public class TestArcLoaderPig {
+
+    private static final Log LOG = LogFactory.getLog(TestArcLoaderPig.class);
+    private File tempDir;
+
+    @Test
+    public void testCountLinks() throws Exception {
+
+        String arcTestDataFile = Resources.getResource("arc/example.arc.gz").getPath();
+        //arcTestDataFile = "/home/alan/Documents/SCAPE/hadoop-hackathon-vienna/web/172-3-20131012143440-00001-prepc2.arc.gz";
+
+        String pigFile = Resources.getResource("scripts/TestCountLinks.pig").getPath();
+        String location = tempDir.getPath().replaceAll("\\\\", "/");  // make it work on windows
+
+        PigTest test = new PigTest(pigFile, new String[]{
+                "testArcFolder=" + arcTestDataFile,
+                "experimentfolder=" + location});
+
+        Iterator<Tuple> parses = test.getAlias("a");
+
+        while (parses.hasNext()) {
+            System.out.println("date + count in arc file: " + parses.next());
+        }
+
+    }
+
+    @Test
+    public void testArcLoader() throws Exception {
+
+        String arcTestDataFile = Resources.getResource("arc/example.arc.gz").getPath();
+        //arcTestDataFile = "/home/alan/Documents/SCAPE/hadoop-hackathon-vienna/web/172-3-20131012143440-00001-prepc2.arc.gz";
+
+        String pigFile = Resources.getResource("scripts/TestArcLoader.pig").getPath();
+        String location = tempDir.getPath().replaceAll("\\\\", "/");  // make it work on windows
+
+        PigTest test = new PigTest(pigFile, new String[]{
+                "testArcFolder=" + arcTestDataFile,
+                "experimentfolder=" + location});
+
+        Iterator<Tuple> parses = test.getAlias("c");
+
+        while (parses.hasNext()) {
+            System.out.println("date + count in arc file: " + parses.next());
+        }
+
+    }
+
+    @Before
+    public void setUp() throws Exception {
+        // create a random file location
+        tempDir = Files.createTempDir();
+        LOG.info("Output can be found in " + tempDir.getPath());
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        // cleanup
+        //  FileUtils.deleteRecursive(tempDir);
+    }
+
+}
diff --git a/src/test/resources/arc/example.arc.gz b/src/test/resources/arc/example.arc.gz
new file mode 100644
index 0000000..6498580
Binary files /dev/null and b/src/test/resources/arc/example.arc.gz differ
diff --git a/src/test/resources/scripts/TestArcLoader.pig b/src/test/resources/scripts/TestArcLoader.pig
new file mode 100644
index 0000000..b5f1f3f
--- /dev/null
+++ b/src/test/resources/scripts/TestArcLoader.pig
@@ -0,0 +1,15 @@
+-- Counts the records of an ARC file per date prefix (the first 8 characters of the date field).
+-- Parameters: testArcFolder (input handed to load), experimentfolder (directory for stored output).
+--register 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';
+
+DEFINE ArcLoader org.warcbase.pig.ArcLoader();
+
+raw = load '$testArcFolder' using ArcLoader as (url: chararray, date:chararray, mime:chararray, content:chararray);
+-- write the loaded records out unchanged
+store raw into '$experimentfolder/raw' using PigStorage();
+-- keep the first 8 characters of each date (presumably YYYYMMDD; TODO confirm), then count records per value
+a = foreach raw generate SUBSTRING(date, 0, 8) as date;
+b = group a by date;
+c = foreach b generate group, COUNT(a);
+-- c holds one (date prefix, record count) pair per distinct prefix
+store c into '$experimentfolder/c' using PigStorage();
\ No newline at end of file
diff --git a/src/test/resources/scripts/TestCountLinks.pig b/src/test/resources/scripts/TestCountLinks.pig
new file mode 100644
index 0000000..ea8c732
--- /dev/null
+++ b/src/test/resources/scripts/TestCountLinks.pig
@@ -0,0 +1,11 @@
+-- Applies the ExtractLinks UDF to the content of every record of an ARC file and flattens the result.
+-- Parameters: testArcFolder (input handed to load), experimentfolder (directory for stored output).
+--register 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';
+
+DEFINE ArcLoader org.warcbase.pig.ArcLoader();
+
+raw = load '$testArcFolder' using ArcLoader as (url: chararray, date:chararray, mime:chararray, content:chararray);
+-- one output row per element returned by ExtractLinks for a record
+a = foreach raw generate FLATTEN(org.warcbase.pig.piggybank.ExtractLinks(content));
+
+store a into '$experimentfolder/a';
\ No newline at end of file