websearch_external_collections_getter.py

# -*- coding: utf-8 -*-
## This file is part of Invenio.
## Copyright (C) 2006, 2007, 2008, 2009, 2010, 2011 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Module to download web pages using asyncore.
Example 1, downloading a set of webpages :
from websearch_external_collections_getter import *
urls = ['http://www.google.fr', 'http://linuxfr.org']
pagegetters = [HTTPAsyncPageGetter(url) for url in urls]
async_download(pagegetters)
for pagegetter in pagegetters:
if pagegetter.done:
print pagegetter.data
else:
print "Error downloading : " + pagegetter.uri
Example 2, downloading a set of webpages but with callback function.
def func(pagegetter, data, current_time):
print "OK (%f): " % current_time + pagegetter.uri + " - " + data
from websearch_external_collections_getter import *
urls = ['http://www.google.fr', 'http://linuxfr.org']
pagegetters = [HTTPAsyncPageGetter(url) for url in urls]
async_download(pagegetters, func, ['info1', 'info2'], 10)
"""
__revision__ = "$Id$"
import asyncore
import mimetools
import socket
import sys
import StringIO
import time
import urlparse
#from invenio.websearch_external_collections_config import CFG_EXTERNAL_COLLECTION_TIMEOUT
from invenio.config import CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT
CFG_EXTERNAL_COLLECTION_TIMEOUT = CFG_WEBSEARCH_EXTERNAL_COLLECTION_SEARCH_TIMEOUT
def async_download(pagegetter_list, finish_function=None, datastructure_list=None, timeout=15):
    """Download web pages asynchronously with a timeout.

    pagegetter_list : list of HTTPAsyncPageGetter objects
    finish_function : function called when a web page is downloaded;
        prototype: def func(pagegetter, datastructure, current_time)
    datastructure_list : list (same size as pagegetter_list) with information
        to pass as datastructure to the finish function.
    timeout : float, timeout in seconds."""
    time_start = time.time()
    finished_list = [False] * len(pagegetter_list)
    nb_remaining = 0
    check_redirected(pagegetter_list)
    for pagegetter in pagegetter_list:
        if pagegetter and not pagegetter.done:
            nb_remaining += 1
    while (time.time() - time_start < timeout) and nb_remaining > 0:
        if sys.hexversion < 0x2040000:
            # Python < 2.4: asyncore.loop() does not accept a count argument.
            asyncore.poll(0.01)
        else:
            asyncore.loop(0.01, True, None, 1)
        check_redirected(pagegetter_list)
        for i in range(len(pagegetter_list)):
            if pagegetter_list[i] and not finished_list[i] and pagegetter_list[i].done:
                nb_remaining -= 1
                if finish_function:
                    if datastructure_list:
                        datastructure = datastructure_list[i]
                    else:
                        datastructure = None
                    current_time = time.time() - time_start
                    finish_function(pagegetter_list[i], datastructure, current_time)
                finished_list[i] = True
    return finished_list
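# Implementation note (added): asyncore.loop(0.01, True, None, 1) runs a
# single poll pass (count=1) over asyncore's global channel map with a
# 0.01 s timeout, so the while loop above retains control between passes
# and can enforce the overall download timeout itself.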
class HTTPAsyncPageGetter(asyncore.dispatcher_with_send):
    """Class to download a web page using asyncore."""

    def __init__(self, uri):
        asyncore.dispatcher_with_send.__init__(self)
        self.uri = uri
        self.redirected = None
        self.status = None
        self.header = None
        self.done = False
        self.data = ""
        self.header_data = ""
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        self.request, self.host, self.port = build_request(self.uri)
        try:
            self.connect((self.host, self.port))
        except:
            # Any failure to even start the connection marks the page as done.
            self.done = True

    def handle_connect(self):
        """Handle the connection event by sending the request to the server."""
        try:
            self.send(self.request)
        except socket.error:
            # Do nothing: self.done stays False, so the caller will
            # report this page as a failed download.
            pass

    def handle_expt(self):
        """Handle an exception: close the socket and set done to True."""
        self.close()
        self.done = True

    def handle_read(self):
        """Handle a read event."""
        data = self.recv(1024)
        if not self.header:
            self.header_data += data
            (self.status, self.header, data) = decode_header(self.header_data)
            if self.header is None:
                # The header is not complete yet; wait for the next read
                # event instead of appending header fragments to the body.
                return
            if self.status is not None:
                if self.status[1] in ("301", "302"):
                    self.redirected = self.header["location"]
        self.data += data

    def handle_close(self):
        """Handle a close event."""
        self.done = True
        self.close()
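# Implementation note (added): creating an HTTPAsyncPageGetter registers its
# socket in asyncore's global channel map, so the asyncore.poll()/loop()
# calls in async_download drive handle_connect, handle_read and handle_close
# for every pending getter without further bookkeeping.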
def build_request(uri):
    """Build an HTTP request for a specific URL."""
    scheme, host, path, params, query, dummy = urlparse.urlparse(uri)
    assert scheme == "http", "only supports HTTP requests (uri = " + uri + ")"
    host, port = decode_host_port(host)
    path = encode_path(path, params, query)
    request = "GET %s HTTP/1.0\r\n" % (path) + \
        "User-Agent: Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/48 (like Gecko) Safari/48\r\n" + \
        "Accept: text/html, image/jpeg, image/png, text/*, image/*, */*\r\n" + \
        "Accept-Charset: utf-8, utf-8;q=0.5, *;q=0.5\r\n" + \
        "Host: %s\r\n" % (host) + \
        "Connection: close\r\n\r\n"
    return (request, host, port)
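# Illustration (added; the URL is hypothetical):
# build_request("http://example.org:8080/a?q=1") returns a request starting
# with "GET /a?q=1 HTTP/1.0\r\n" and containing a "Host: example.org" line,
# together with the host/port pair ("example.org", 8080).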
def decode_host_port(host):
    """Decode the host string into a (host, port) pair; the port defaults to 80."""
    try:
        host, port = host.split(":", 1)
        port = int(port)
    except (TypeError, ValueError):
        port = 80
    return (host, port)
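# Examples (added): decode_host_port("example.org") == ("example.org", 80)
#                   decode_host_port("example.org:8080") == ("example.org", 8080)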
def encode_path(path, params, query):
    """Bind the path, the params and the query into a single HTTP path."""
    if not path:
        path = "/"
    if params:
        path = path + ";" + params
    if query:
        path = path + "?" + query
    return path
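# Example (added): encode_path("", "", "q=test") == "/?q=test", since an
# empty path is normalised to "/" before params and query are appended.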
def decode_header(data):
    """Try to decode an HTTP header.

    If the header can be decoded, return (status, header, remaining_data);
    if it cannot, return (None, None, data).
    """
    i = data.find("\r\n\r\n")
    size = 4
    if i == -1:
        i = data.find("\n\n")
        size = 2
    if i == -1:
        return (None, None, data)
    # parse header
    header_fp = StringIO.StringIO(data[:i+size])
    # status line is "HTTP/version status message"
    status = header_fp.readline()
    status = status.split(" ", 2)
    # followed by an rfc822-style message header
    header = mimetools.Message(header_fp)
    # followed by a newline, and the payload (if any)
    data = data[i+size:]
    return (status, header, data)
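# Example (added, assuming a well-formed response): for
#     data = "HTTP/1.0 200 OK\r\nContent-Type: text/html\r\n\r\n<html>"
# decode_header returns a status list whose second element is the code
# ("200"), a mimetools.Message that exposes header["content-type"], and
# the remaining payload "<html>".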
def check_redirected(pagegetter_list):
    """Check whether a redirection occurred for any getter in pagegetter_list and follow it."""
    for i in range(len(pagegetter_list)):
        getter = pagegetter_list[i]
        if getter and getter.redirected is not None:
            if getter.redirected.startswith('http://'):
                getter = HTTPAsyncPageGetter(getter.redirected)
            else:
                # Only absolute HTTP redirections are followed.
                getter.done = True
            pagegetter_list[i] = getter
def fetch_url_content(urls, timeout=CFG_EXTERNAL_COLLECTION_TIMEOUT):
    """Given a list of URLs, return a list of their contents,
    using an optional custom timeout."""
    urls_content = []
    try:
        pagegetters_list = [HTTPAsyncPageGetter(url) for url in urls]
    except AssertionError:
        # At least one URL is not a plain HTTP URL; build_request asserts on it.
        return [None] * len(urls)
    async_download(pagegetters_list, None, None, timeout)
    for i in range(len(pagegetters_list)):
        if pagegetters_list[i].done:
            urls_content.append(pagegetters_list[i].data)
        else:
            urls_content.append(None)
    return urls_content
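# Minimal usage sketch (added; the URL is a placeholder and the output
# assumes the host is reachable over plain HTTP).
if __name__ == "__main__":
    contents = fetch_url_content(["http://www.example.org/"], timeout=10)
    for content in contents:
        if content is None:
            print "Download failed or timed out."
        else:
            print "Downloaded %d bytes." % len(content)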
