Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F64065642
ExtractEntitiesTest.scala
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Fri, May 24, 08:27
Size
1 KB
Mime Type
text/x-c++
Expires
Sun, May 26, 08:27 (2 d)
Engine
blob
Format
Raw Data
Handle
17851077
Attached To
R1473 warcbase
ExtractEntitiesTest.scala
View Options
package org.warcbase.spark.matchbox
import java.io.File
import com.google.common.io.{Files, Resources}
import org.apache.commons.io.FileUtils
import org.apache.commons.logging.LogFactory
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
/**
 * Runs `ExtractEntities.extractFromScrapeText` on the bundled `ner/example.txt`
 * resource with a local Spark context and checks one of the produced records.
 *
 * A new SparkContext and a new temporary output directory are created before
 * every test and released again afterwards.
 */
@RunWith(classOf[JUnitRunner])
class ExtractEntitiesTest extends FunSuite with BeforeAndAfter {
  private val LOG = LogFactory.getLog(classOf[ExtractEntitiesTest])
  // Classpath resource used as input for the extraction.
  private val scrapePath = Resources.getResource("ner/example.txt").getPath
  private val master = "local[4]"
  private val appName = "example-spark"
  // Both are (re)assigned in `before`; they stay null until it has run.
  private var sc: SparkContext = _
  private var tempDir: File = _

  before {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
    sc = new SparkContext(conf)
    tempDir = Files.createTempDir()
    LOG.info("Output can be found in " + tempDir.getPath)
  }

  test("extract entities") {
    // Output goes below the per-test temp directory so `after` can remove it.
    val outputPath = tempDir.getPath + "/scrapeTextEntities"
    // The third of the first three returned records is the one under test.
    val e = ExtractEntities.extractFromScrapeText(scrapePath, outputPath, sc).take(3).last
    assert(e._1 == "20080430")
    assert(e._2 == "http://www.archive.org/robots.txt")
    assert(e._3 == "{PERSON=[]ORGANIZATION=[]LOCATION=[Teoma]O=[]}")
  }

  after {
    try {
      // Guard against a fixture that was never initialised, and announce the
      // deletion before it happens so the message is present even on failure.
      Option(tempDir).foreach { dir =>
        LOG.info("Removing tmp files in " + dir.getPath)
        FileUtils.deleteDirectory(dir)
      }
    } finally {
      // Always stop the context, even when deleting the directory failed;
      // otherwise it would leak into subsequent tests running in this JVM.
      Option(sc).foreach(_.stop())
      sc = null
    }
  }
}
Event Timeline
Log In to Comment