diff --git a/pom.xml b/pom.xml index 136e0d5..ec3448c 100644 --- a/pom.xml +++ b/pom.xml @@ -1,337 +1,359 @@ -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - <modelVersion>4.0.0</modelVersion> - <groupId>org.warcbase</groupId> - <artifactId>warcbase</artifactId> - <packaging>jar</packaging> - <version>0.1.0-SNAPSHOT</version> - <name>warcbase</name> - <description>WARC + HBase</description> - <url>http://warcbase.org/</url> + <modelVersion>4.0.0</modelVersion> + <groupId>org.warcbase</groupId> + <artifactId>warcbase</artifactId> + <packaging>jar</packaging> + <version>0.1.0-SNAPSHOT</version> + <name>warcbase</name> + <description>WARC + HBase</description> + <url>http://warcbase.org/</url> - <licenses> - <license> - <name>The Apache Software License, Version 2.0</name> - <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> - <distribution>repo</distribution> - </license> - </licenses> + <licenses> + <license> + <name>The Apache Software License, Version 2.0</name> + <url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> + <distribution>repo</distribution> + </license> + </licenses> - <scm> - <connection>scm:git:git@github.com:lintool/warcbase.git</connection> - <developerConnection>scm:git:git@github.com:lintool/warcbase.git</developerConnection> - <url>git@github.com:lintool/warcbase.git</url> - </scm> + <scm> + <connection>scm:git:git@github.com:lintool/warcbase.git</connection> + <developerConnection>scm:git:git@github.com:lintool/warcbase.git</developerConnection> + <url>git@github.com:lintool/warcbase.git</url> + </scm> - <developers> - <developer> - <id>lintool</id> - <name>Jimmy Lin</name> - 
<email>jimmylin@umd.edu</email> - </developer> - <developer> - <id>milad621</id> - <name>Milad Gholami</name> - <email>mgholami@cs.umd.edu</email> - </developer> - </developers> + <developers> + <developer> + <id>lintool</id> + <name>Jimmy Lin</name> + <email>jimmylin@umd.edu</email> + </developer> + <developer> + <id>milad621</id> + <name>Milad Gholami</name> + <email>mgholami@cs.umd.edu</email> + </developer> + </developers> - <parent> - <groupId>org.sonatype.oss</groupId> - <artifactId>oss-parent</artifactId> - <version>7</version> - </parent> + <parent> + <groupId>org.sonatype.oss</groupId> + <artifactId>oss-parent</artifactId> + <version>7</version> + </parent> - <properties> - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> - <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> - <jettyVersion>8.1.12.v20130726</jettyVersion> - <hadoop.version>2.0.0-mr1-cdh4.3.0</hadoop.version> - <hadoop.version2>2.0.0-cdh4.3.0</hadoop.version2> - <hbase.version>0.94.6-cdh4.3.0</hbase.version> - <zookeeper.version>3.4.5-cdh4.3.0</zookeeper.version> - <jwat.version>1.0.0</jwat.version> - </properties> + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> + <jettyVersion>8.1.12.v20130726</jettyVersion> + <hadoop.version>2.0.0-mr1-cdh4.3.0</hadoop.version> + <hadoop.version2>2.0.0-cdh4.3.0</hadoop.version2> + <hbase.version>0.94.6-cdh4.3.0</hbase.version> + <zookeeper.version>3.4.5-cdh4.3.0</zookeeper.version> + <jwat.version>1.0.0</jwat.version> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-shade-plugin</artifactId> - <version>2.1</version> - <executions> - <execution> - <phase>package</phase> - <goals> - <goal>shade</goal> - </goals> - <configuration> -<!-- This fits the issue "Invalid signature file digest for Manifest main attributes" - cf. 
http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html - --> - <filters> - <filter> - <artifact>*:*</artifact> - <excludes> - <exclude>META-INF/*.SF</exclude> - <exclude>META-INF/*.DSA</exclude> - <exclude>META-INF/*.RSA</exclude> - </excludes> - </filter> - </filters> + <cdh.version>cdh4.5.0</cdh.version> + <pig.version>0.11.0-${cdh.version}</pig.version> + </properties> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.1</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> - <!-- this will create both a normal thin jar and also a fatjar --> - <shadedArtifactAttached>true</shadedArtifactAttached> - <shadedClassifierName>fatjar</shadedClassifierName> - <artifactSet> - <excludes> - <exclude>org.apache.hadoop:*</exclude> - </excludes> - </artifactSet> + <!-- This fixes the issue "Invalid signature file digest for Manifest main attributes" + cf. 
http://zhentao-li.blogspot.com/2012/06/maven-shade-plugin-invalid-signature.html + --> + <filters> + <filter> + <artifact>*:*</artifact> + <excludes> + <exclude>META-INF/*.SF</exclude> + <exclude>META-INF/*.DSA</exclude> + <exclude>META-INF/*.RSA</exclude> + </excludes> + </filter> + </filters> + + <!-- this will create both a normal thin jar and also a fatjar --> + <shadedArtifactAttached>true</shadedArtifactAttached> + <shadedClassifierName>fatjar</shadedClassifierName> + <artifactSet> + <excludes> + <exclude>org.apache.hadoop:*</exclude> + </excludes> + </artifactSet> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>appassembler-maven-plugin</artifactId> + <version>1.3.1</version> + <configuration> + <programs> + <program> + <mainClass>org.warcbase.ingest.IngestFiles</mainClass> + <name>IngestFiles</name> + </program> + <program> + <mainClass>org.warcbase.ingest.SearchForUri</mainClass> + <name>SearchForUri</name> + </program> + <program> + <mainClass>org.warcbase.browser.WarcBrowser</mainClass> + <name>WarcBrowser</name> + </program> + <program> + <mainClass>org.warcbase.analysis.CountRowTypes</mainClass> + <name>CountRowTypes</name> + </program> + <program> + <mainClass>org.warcbase.analysis.DetectDuplicates</mainClass> + <name>DetectDuplicates</name> + </program> + </programs> </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.codehaus.mojo</groupId> - <artifactId>appassembler-maven-plugin</artifactId> - <version>1.3.1</version> - <configuration> - <programs> - <program> - <mainClass>org.warcbase.ingest.IngestFiles</mainClass> - <name>IngestFiles</name> - </program> - <program> - <mainClass>org.warcbase.ingest.SearchForUri</mainClass> - <name>SearchForUri</name> - </program> - <program> - <mainClass>org.warcbase.browser.WarcBrowser</mainClass> - <name>WarcBrowser</name> - </program> - <program> - 
<mainClass>org.warcbase.analysis.CountRowTypes</mainClass> - <name>CountRowTypes</name> - </program> - <program> - <mainClass>org.warcbase.analysis.DetectDuplicates</mainClass> - <name>DetectDuplicates</name> - </program> - </programs> - </configuration> - </plugin> - <!-- <plugin> - <groupId>org.eclipse.jetty</groupId> - <artifactId>jetty-maven-plugin</artifactId> - <version>${jettyVersion}</version> - </plugin> --> - </plugins> - </build> + </plugin> + <!-- <plugin> + <groupId>org.eclipse.jetty</groupId> + <artifactId>jetty-maven-plugin</artifactId> + <version>${jettyVersion}</version> + </plugin> --> + </plugins> + </build> + + <repositories> + <repository> + <id>internetarchive</id> + <name>Internet Archive Maven Repository</name> + <url>http://builds.archive.org:8080/maven2</url> + </repository> + <repository> + <id>cloudera</id> + <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> + </repository> + </repositories> + + <dependencies> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.11</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.pig</groupId> + <artifactId>pig</artifactId> + <version>${pig.version}</version> <!-- same property as pigunit, so both resolve to one CDH build --> + </dependency> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>14.0.1</version> + </dependency> + <dependency> + <groupId>tl.lin</groupId> + <artifactId>lintools-datatypes</artifactId> + <version>0.9.2</version> + </dependency> + <dependency> + <groupId>org.apache.hbase</groupId> + <artifactId>hbase</artifactId> + <version>${hbase.version}</version> + <exclusions> + <exclusion> + <groupId>slf4j-api</groupId> + <artifactId>slf4j-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api-1.4.3</artifactId> + </exclusion> + <exclusion> + <groupId>jsp-api</groupId> + 
<artifactId>jsp-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jsp-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jsp-api-2.1</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>servlet-api-2.5</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>servlet-api</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jsp-2.1</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-core</artifactId> + <version>${hadoop.version}</version> + <exclusions> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jsp-2.1</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>${hadoop.version2}</version> + <exclusions> + <exclusion> + <groupId>javax.servlet</groupId> + <artifactId>servlet-api</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty</artifactId> + </exclusion> + <exclusion> + <groupId>org.mortbay.jetty</groupId> + <artifactId>jetty-util</artifactId> + </exclusion> + <exclusion> + 
<groupId>org.mortbay.jetty</groupId> + <artifactId>jsp-2.1</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.apache.zookeeper</groupId> + <artifactId>zookeeper</artifactId> + <version>${zookeeper.version}</version> + </dependency> + <dependency> + <groupId>org.archive.heritrix</groupId> + <artifactId>heritrix-commons</artifactId> + <version>3.1.2-SNAPSHOT</version> + </dependency> + <dependency> + <groupId>org.archive.wayback</groupId> + <artifactId>wayback-core</artifactId> + <version>1.7.0</version> + </dependency> + <dependency> + <groupId>org.eclipse.jetty</groupId> + <artifactId>jetty-server</artifactId> + <version>${jettyVersion}</version> + </dependency> + <dependency> + <groupId>org.eclipse.jetty</groupId> + <artifactId>jetty-webapp</artifactId> + <version>${jettyVersion}</version> + <optional>true</optional> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <version>1.6.4</version> + </dependency> + <dependency> + <!-- jsoup HTML parser library @ http://jsoup.org/ --> + <groupId>org.jsoup</groupId> + <artifactId>jsoup</artifactId> + <version>1.7.2</version> + </dependency> + <!--<dependency> + <groupId>org.eclipse.jetty.orbit</groupId> + <artifactId>javax.servlet</artifactId> + <version>3.0.0.v201112011016</version> + <scope>provided</scope> + </dependency>--> + <dependency> + <groupId>org.jwat</groupId> + <artifactId>jwat-common</artifactId> + <version>${jwat.version}</version> + </dependency> + <dependency> + <groupId>org.jwat</groupId> + <artifactId>jwat-gzip</artifactId> + <version>${jwat.version}</version> + </dependency> + <dependency> + <groupId>org.jwat</groupId> + <artifactId>jwat-arc</artifactId> + <version>${jwat.version}</version> + </dependency> + <dependency> + <groupId>org.jwat</groupId> + <artifactId>jwat-warc</artifactId> + <version>${jwat.version}</version> + </dependency> + <dependency> + <groupId>org.jwat</groupId> + 
<artifactId>jwat-tools</artifactId> + <version>0.5.6-SNAPSHOT</version> + </dependency> + + + <dependency> + <groupId>org.apache.pig</groupId> + <artifactId>pigunit</artifactId> + <version>${pig.version}</version> + <scope>test</scope> + </dependency> - <repositories> - <repository> - <id>internetarchive</id> - <name>Internet Archive Maven Repository</name> - <url>http://builds.archive.org:8080/maven2</url> - </repository> - <repository> - <id>cloudera</id> - <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url> - </repository> - </repositories> + <!-- This is here as pig has not bundled this parser --> + <dependency> + <groupId>org.antlr</groupId> + <artifactId>antlr</artifactId> + <version>3.5.1</version> + <scope>test</scope> + </dependency> - <dependencies> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>4.11</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.pig</groupId> - <artifactId>pig</artifactId> - <version>0.11.0-cdh4.4.0</version> - </dependency> - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - <version>14.0.1</version> - </dependency> - <dependency> - <groupId>tl.lin</groupId> - <artifactId>lintools-datatypes</artifactId> - <version>0.9.2</version> - </dependency> - <dependency> - <groupId>org.apache.hbase</groupId> - <artifactId>hbase</artifactId> - <version>${hbase.version}</version> - <exclusions> - <exclusion> - <groupId>slf4j-api</groupId> - <artifactId>slf4j-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api-1.4.3</artifactId> - </exclusion> - <exclusion> - <groupId>jsp-api</groupId> - <artifactId>jsp-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jsp-api</artifactId> - </exclusion> - <exclusion> - 
<groupId>org.mortbay.jetty</groupId> - <artifactId>jsp-api-2.1</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>servlet-api-2.5</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>servlet-api</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty-util</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jsp-2.1</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-core</artifactId> - <version>${hadoop.version}</version> - <exclusions> - <exclusion> - <groupId>javax.servlet</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty-util</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jsp-2.1</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-common</artifactId> - <version>${hadoop.version2}</version> - <exclusions> - <exclusion> - <groupId>javax.servlet</groupId> - <artifactId>servlet-api</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jetty-util</artifactId> - </exclusion> - <exclusion> - <groupId>org.mortbay.jetty</groupId> - <artifactId>jsp-2.1</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.apache.zookeeper</groupId> - 
<artifactId>zookeeper</artifactId> - <version>${zookeeper.version}</version> - </dependency> - <dependency> - <groupId>org.archive.heritrix</groupId> - <artifactId>heritrix-commons</artifactId> - <version>3.1.2-SNAPSHOT</version> - </dependency> - <dependency> - <groupId>org.archive.wayback</groupId> - <artifactId>wayback-core</artifactId> - <version>1.7.0</version> - </dependency> - <dependency> - <groupId>org.eclipse.jetty</groupId> - <artifactId>jetty-server</artifactId> - <version>${jettyVersion}</version> - </dependency> - <dependency> - <groupId>org.eclipse.jetty</groupId> - <artifactId>jetty-webapp</artifactId> - <version>${jettyVersion}</version> - <optional>true</optional> - </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-log4j12</artifactId> - <version>1.6.4</version> - </dependency> - <dependency> - <!-- jsoup HTML parser library @ http://jsoup.org/ --> - <groupId>org.jsoup</groupId> - <artifactId>jsoup</artifactId> - <version>1.7.2</version> -</dependency> -<!--<dependency> - <groupId>org.eclipse.jetty.orbit</groupId> - <artifactId>javax.servlet</artifactId> - <version>3.0.0.v201112011016</version> - <scope>provided</scope> - </dependency>--> - <dependency> - <groupId>org.jwat</groupId> - <artifactId>jwat-common</artifactId> - <version>${jwat.version}</version> - </dependency> - <dependency> - <groupId>org.jwat</groupId> - <artifactId>jwat-gzip</artifactId> - <version>${jwat.version}</version> - </dependency> - <dependency> - <groupId>org.jwat</groupId> - <artifactId>jwat-arc</artifactId> - <version>${jwat.version}</version> - </dependency> - <dependency> - <groupId>org.jwat</groupId> - <artifactId>jwat-warc</artifactId> - <version>${jwat.version}</version> - </dependency> - <dependency> - <groupId>org.jwat</groupId> - <artifactId>jwat-tools</artifactId> - <version>0.5.6-SNAPSHOT</version> - </dependency> - </dependencies> + </dependencies> </project> diff --git a/src/test/java/org/warcbase/pig/TestArcLoaderPig.java 
b/src/test/java/org/warcbase/pig/TestArcLoaderPig.java
new file mode 100644
index 0000000..0be1ca7
--- /dev/null
+++ b/src/test/java/org/warcbase/pig/TestArcLoaderPig.java
@@ -0,0 +1,103 @@
+package org.warcbase.pig;
+
+import com.google.common.io.Files;
+import com.google.common.io.Resources;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.pigunit.PigTest;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.Iterator;
+
+/**
+ * PigUnit smoke tests: run the test Pig scripts against the bundled sample
+ * ARC file and print the tuples of one alias. Nothing is asserted about the
+ * tuples themselves; a test fails only if running the script throws.
+ */
+public class TestArcLoaderPig {
+
+    private static final Log LOG = LogFactory.getLog(TestArcLoaderPig.class);
+
+    /** Scratch directory handed to the scripts as "experimentfolder"; created anew for each test. */
+    private File tempDir;
+
+    /** Runs scripts/TestCountLinks.pig and prints every tuple of alias "a". */
+    @Test
+    public void testCountLinks() throws Exception {
+
+        String arcTestDataFile = Resources.getResource("arc/example.arc.gz").getPath();
+
+        String pigFile = Resources.getResource("scripts/TestCountLinks.pig").getPath();
+        String location = tempDir.getPath().replaceAll("\\\\", "/"); // make it work on windows
+
+        PigTest test = new PigTest(pigFile, new String[]{
+                "testArcFolder=" + arcTestDataFile,
+                "experimentfolder=" + location});
+
+        Iterator<Tuple> parses = test.getAlias("a");
+
+        while (parses.hasNext()) {
+            System.out.println("extracted link in arc file: " + parses.next());
+        }
+
+    }
+
+    /** Runs scripts/TestArcLoader.pig and prints every tuple of alias "c". */
+    @Test
+    public void testArcLoader() throws Exception {
+
+        String arcTestDataFile = Resources.getResource("arc/example.arc.gz").getPath();
+
+        String pigFile = Resources.getResource("scripts/TestArcLoader.pig").getPath();
+        String location = tempDir.getPath().replaceAll("\\\\", "/"); // make it work on windows
+
+        PigTest test = new PigTest(pigFile, new String[]{
+                "testArcFolder=" + arcTestDataFile,
+                "experimentfolder=" + location});
+
+        Iterator<Tuple> parses = test.getAlias("c");
+
+        while (parses.hasNext()) {
+            System.out.println("date + count in arc file: " + parses.next());
+        }
+
+    }
+
+    /** Creates the per-test scratch directory. */
+    @Before
+    public void setUp() throws Exception {
+        // create a random file location
+        tempDir = Files.createTempDir();
+        LOG.info("Pig output is written to temporary directory " + tempDir.getPath());
+    }
+
+    /** Deletes the scratch directory so repeated runs do not accumulate temp folders. */
+    @After
+    public void tearDown() throws Exception {
+        if (tempDir != null) {
+            deleteRecursively(tempDir);
+            tempDir = null;
+        }
+    }
+
+    /**
+     * Deletes a file or a whole directory tree. Failures are logged, not thrown,
+     * so that a cleanup problem never masks the outcome of the test itself.
+     */
+    private static void deleteRecursively(File file) {
+        File[] children = file.listFiles(); // null unless 'file' is a readable directory
+        if (children != null) {
+            for (File child : children) {
+                deleteRecursively(child);
+            }
+        }
+        if (!file.delete()) {
+            LOG.warn("Could not delete " + file.getPath());
+        }
+    }
+
+}
diff --git a/src/test/resources/arc/example.arc.gz b/src/test/resources/arc/example.arc.gz
new file mode 100644
index 0000000..6498580
Binary files /dev/null and b/src/test/resources/arc/example.arc.gz differ
diff --git a/src/test/resources/scripts/TestArcLoader.pig b/src/test/resources/scripts/TestArcLoader.pig
new file mode 100644
index 0000000..b5f1f3f
--- /dev/null
+++ b/src/test/resources/scripts/TestArcLoader.pig
@@ -0,0 +1,15 @@
+-- Simple word count example to tally up dates when pages are crawled
+
+--register 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';
+
+DEFINE ArcLoader org.warcbase.pig.ArcLoader();
+
+raw = load '$testArcFolder' using ArcLoader as (url: chararray, date:chararray, mime:chararray, content:chararray);
+
+store raw into '$experimentfolder/raw' using PigStorage();
+
+a = foreach raw generate SUBSTRING(date, 0, 8) as date;
+b = group a by date;
+c = foreach b generate group, COUNT(a);
+
+store c into '$experimentfolder/c' using PigStorage();
\ No newline at end of file
diff --git a/src/test/resources/scripts/TestCountLinks.pig b/src/test/resources/scripts/TestCountLinks.pig
new file mode 100644
index 0000000..ea8c732
--- /dev/null
+++ b/src/test/resources/scripts/TestCountLinks.pig
@@ -0,0 +1,11 @@
+-- Extract the links found in the content of every record of an ARC file
+
+--register 'target/warcbase-0.1.0-SNAPSHOT-fatjar.jar';
+
+DEFINE ArcLoader org.warcbase.pig.ArcLoader();
+
+raw = load '$testArcFolder' using ArcLoader as (url: chararray, date:chararray, mime:chararray, content:chararray);
+
+a = foreach raw generate FLATTEN(org.warcbase.pig.piggybank.ExtractLinks(content));
+
+store a into '$experimentfolder/a';
\ No newline at end of file