/*
 * Warcbase: an open-source platform for managing web archives
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.warcbase.spark.matchbox

import java.io.IOException

import org.jsoup.Jsoup
import org.jsoup.select.Elements

import scala.collection.mutable

// ---------------------------------------------------------------------------
// warcbase-core/src/main/scala/org/warcbase/spark/matchbox/ExtractLinks.scala
// ---------------------------------------------------------------------------

/**
  * UDF for extracting links from a webpage given the HTML content (using Jsoup).
  */
object ExtractLinks {
  /**
    * Extracts every `a[href]` link from `html`.
    *
    * @param src  the src link; it is copied unchanged into each result tuple.
    * @param html the content from which links are to be extracted.
    * @param base an optional base URI; when non-empty it is set on each link
    *             before its `abs:href` attribute is read.
    * @return a sequence of (source, target, anchortext). The sequence is empty
    *         when `src` is null or when `html` is null or empty.
    * @throws IOException wrapping any exception raised while parsing the
    *                     document or walking its links.
    */
  def apply(src: String, html: String, base: String = ""): Seq[(String, String, String)] = {
    // Basic input checking: unusable input yields an empty result, not an exception.
    if (src == null || html == null || html.isEmpty) {
      Seq.empty
    } else {
      try {
        // Local accumulator only; an immutable List is what gets returned.
        val output = mutable.ListBuffer[(String, String, String)]()

        // The base URI does not change while iterating, so test it once.
        // A null base is treated the same as the empty default.
        val hasBase = base != null && base.nonEmpty

        val doc = Jsoup.parse(html)
        val links: Elements = doc.select("a[href]")
        val it = links.iterator()
        while (it.hasNext) {
          val link = it.next()
          if (hasBase) link.setBaseUri(base)
          val target = link.attr("abs:href")
          // Links whose absolute target comes back empty are skipped.
          if (target.nonEmpty) {
            output += ((src, target, link.text))
          }
        }
        output.toList
      } catch {
        case e: Exception =>
          throw new IOException("Caught exception processing input ", e)
      }
    }
  }
}

// ---------------------------------------------------------------------------
// warcbase-core/src/main/scala/org/warcbase/spark/matchbox/StringUtils.scala
// ---------------------------------------------------------------------------

/** String helpers exposed as implicit extension methods. */
object StringUtils {

  /** Adds `removePrefixWWW` to any `String` in scope of this implicit class. */
  implicit class WWWLink(s: String) {
    /**
      * Removes a leading `www.` (optionally preceded by whitespace) from the
      * wrapped string.
      *
      * @return the string without that prefix, or null when the wrapped
      *         string is null.
      */
    def removePrefixWWW(): String = {
      if (s == null) null
      else s.replaceAll("^\\s*www\\.", "")
    }
  }
}