Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F70171956
CountArcCrawlDates.java
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, Jul 5, 14:26
Size
1 KB
Mime Type
text/x-c
Expires
Sun, Jul 7, 14:26 (2 d)
Engine
blob
Format
Raw Data
Handle
18798007
Attached To
R1473 warcbase
CountArcCrawlDates.java
View Options
package
org.warcbase.analysis
;
import
java.io.IOException
;
import
java.util.Arrays
;
import
org.apache.hadoop.io.IntWritable
;
import
org.apache.hadoop.io.LongWritable
;
import
org.apache.hadoop.io.Text
;
import
org.apache.hadoop.mapreduce.Mapper
;
import
org.apache.hadoop.util.Tool
;
import
org.apache.hadoop.util.ToolRunner
;
import
org.apache.log4j.Logger
;
import
org.jwat.arc.ArcRecordBase
;
/**
 * Command-line entry point and mapper for counting ARC records per crawl day.
 *
 * <p>The mapper emits one <code>(yyyyMMdd-prefix, 1)</code> pair per record and tracks
 * how many records were seen and how many had an unusable archive date via the
 * {@link Records} counters. Job setup is delegated to <code>ArcCounter</code>, which is
 * constructed with the mapper class and run through <code>ToolRunner</code>.
 */
public class CountArcCrawlDates {
  private static final Logger LOG = Logger.getLogger(CountArcCrawlDates.class);

  /** Counters reported by the mapper. */
  private static enum Records {
    /** Every record passed to the mapper. */
    TOTAL,
    /** Records whose archive date string was missing or shorter than a full day prefix. */
    ERROR
  }

  /**
   * Maps each ARC record to the leading eight characters of its archive date string
   * (the crawl day) with a count of one.
   */
  public static class MyMapper
      extends Mapper<LongWritable, ArcRecordBase, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);

    /** Number of leading characters of the archive date string kept as the crawl day. */
    private static final int DAY_PREFIX_LENGTH = 8;

    /**
     * Emits <code>(crawl day, 1)</code> for the record, or bumps the ERROR counter when
     * the archive date string cannot supply a full day prefix.
     *
     * @param key input key supplied by the framework; not used
     * @param record the ARC record whose archive date string is read
     * @param context used to update counters and emit output
     * @throws IOException if writing the output pair fails
     * @throws InterruptedException if writing the output pair is interrupted
     */
    @Override
    public void map(LongWritable key, ArcRecordBase record, Context context)
        throws IOException, InterruptedException {
      context.getCounter(Records.TOTAL).increment(1);

      // Get the crawl day (ignore hour, minute, second)
      String date = record.getArchiveDateStr();

      // A missing date string is treated like a too-short one: counted as an error
      // instead of letting a NullPointerException fail the whole map task.
      if (date == null || date.length() < DAY_PREFIX_LENGTH) {
        context.getCounter(Records.ERROR).increment(1);
        return;
      }

      context.write(new Text(date.substring(0, DAY_PREFIX_LENGTH)), ONE);
    }
  }

  /**
   * Dispatches command-line arguments to the tool via the <code>ToolRunner</code>.
   *
   * @param args command-line arguments, passed through unchanged to the tool
   * @throws Exception if the tool run fails
   */
  public static void main(String[] args) throws Exception {
    LOG.info("Running " + CountArcCrawlDates.class.getCanonicalName() + " with args "
        + Arrays.toString(args));

    Tool tool = new ArcCounter(MyMapper.class);
    ToolRunner.run(tool, args);
  }
}
Event Timeline
Log In to Comment