Page MenuHomec4science

bibharvest.wml
No OneTemporary

File Metadata

Created
Mon, Jul 29, 12:50

bibharvest.wml

## $Id$
## This file is part of the CERN Document Server Software (CDSware).
## Copyright (C) 2002, 2003, 2004, 2005 CERN.
##
## The CDSware is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## The CDSware is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDSware; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
## read config variables:
#include "config.wml"
#include "configbis.wml"
#include "cdswmllib.wml"
## start Python:
<protect>#!</protect><PYTHON>
<protect>## $Id$</protect>
<protect>## DO NOT EDIT THIS FILE! IT WAS AUTOMATICALLY GENERATED FROM CDSware WML SOURCES.</protect>
"""CDSware OAI harvestor."""
__version__ = "<: print generate_pretty_version_string('$Id$'); :>"
## okay, rest of the Python code goes below
#######
<protect>
try:
import httplib
import urllib
import sys
import re
import string
import getopt
import time
except ImportError, e:
print "Error: %s" % e
import sys
sys.exit(1)
# Human-readable labels for the HTTP status codes reported on stderr while
# harvesting.  The "000" entry is the label used for any status code that
# has no entry of its own.
http_response_status_code = {
    "000": "Unknown",
    "100": "Continue",
    "200": "OK",
    "302": "Redirect",
    "403": "Forbidden",
    "404": "Not Found",
    "500": "Error",
    "503": "Service Unavailable",
}
def http_param_resume(http_param_dict, resumptionToken):
    """Return the request parameters used to resume a harvest.

    Only the 'verb' entry of http_param_dict is carried over; it is
    paired with the given resumptionToken and every other entry of the
    original dictionary is left out.  The input dictionary is not
    modified.  Raises KeyError when http_param_dict has no 'verb'.
    """
    resumed = {}
    resumed['verb'] = http_param_dict['verb']
    resumed['resumptionToken'] = resumptionToken
    return resumed
def http_request_parameters(http_param_dict, method="POST"):
    """Assemble the HTTP request parameter string for the given method.

    http_param_dict maps OAI argument names to their values.  Entries
    with an empty name or an empty value are left out.  The remaining
    entries are URL-encoded and joined with '&', in sorted key order so
    that the result is deterministic.

    The same encoded form is returned for "GET" (to be appended to the
    script after '?') and for "POST" (to be sent as the request body).
    Any other method yields an empty string.
    """
    if method != "GET" and method != "POST":
        return ""
    keys = list(http_param_dict.keys())
    keys.sort()
    pairs = []
    for key in keys:
        # An empty argument would only make the server reject the
        # request, so it is dropped for GET exactly as it is for POST.
        if key and http_param_dict[key]:
            pairs.append((key, http_param_dict[key]))
    # Encoding is needed for GET as well: values such as resumption
    # tokens may contain '&', '=', '+' or spaces, which would otherwise
    # corrupt the query string.
    return urllib.urlencode(pairs)
def OAI_Session(server, script, http_param_dict, method="POST", output="", stylesheet=""):
    """Run a complete OAI harvesting session against server/script.

    The first request is built from http_param_dict.  As long as a
    response carries a non-empty resumptionToken element, the harvest
    is resumed with that token (keeping only the verb), after a pause
    of one second between requests.

    Each response is written to the file "<output>.<7-digit counter>"
    when output is given, and to standard output otherwise.  The
    stylesheet argument is accepted but not used here.
    """
    sys.stderr.write("Starting the harvesting session at %s" % time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
    sys.stderr.write("%s - %s\n" % (server, http_request_parameters(http_param_dict, method)))
    # Capture only the text content of the resumptionToken element.  A
    # greedy '>.*' would start at the first '>' of the line and, on a
    # response delivered as one line, swallow most of the document.
    rt_pattern = re.compile(r'<resumptionToken[^>]*>([^<]*)</resumptionToken>')
    i = 0
    while True:
        a = OAI_Request(server, script, http_request_parameters(http_param_dict, method), method)
        if output:
            write_file("%s.%07d" % (output, i), a)
        else:
            sys.stdout.write(a)
        rt_obj = rt_pattern.search(a)
        if rt_obj is None:
            break
        token = rt_obj.group(1).strip()
        if not token:
            # An empty resumptionToken element marks the last chunk of
            # the list; there is nothing left to resume.
            break
        i = i + 1
        time.sleep(1)
        http_param_dict = http_param_resume(http_param_dict, token)
def write_file(filename="harvest", a=""):
    """Write the string a to filename, replacing any existing content.

    The file is closed even when the write fails, so a failed write
    does not leak the file handle.
    """
    f = open(filename, "w")
    try:
        f.write(a)
    finally:
        f.close()
def help():
"Print out info"
print "\n bibharvest -fhimoprsuv baseURL\n"
print " -h print this help"
print " -V print version number"
print " -o<outputfilename> specify output file"
print " -v<verb> OAI verb to be executed"
print " -m<method> http method (default POST)"
print " -p<metadataPrefix> metadata format"
print " -i<identifier> OAI identifier"
print " -s<set> OAI set"
print " -r<resuptionToken> Resume previous harvest"
print " -f<from> from date (datestamp)"
print " -u<until> until date (datestamp)\n"
def OAI_Request(server, script, params, method="POST"):
"Handle OAi request"
</protect>
headers = {"Content-type":"application/x-www-form-urlencoded", "Accept":"text/xml", "From":"<ADMINEMAIL>", "User-Agent":"CDSware <VERSION>"}
<protect>
i = 0
while i < 10:
i = i + 1
conn = httplib.HTTPConnection(server)
if method == "GET":
</protect>
conn.putrequest(method,script + "?" + params)
conn.putheader("Content-type","application/x-www-form-urlencoded")
conn.putheader("Accept","text/xml")
conn.putheader("From","<ADMINEMAIL>")
conn.putheader("User-Agent","<CDSNAME>")
conn.endheaders()
<protect>
elif method == "POST":
conn.request("POST", script, params, headers)
response = conn.getresponse()
status = "%d" % response.status
if http_response_status_code.has_key(status):
sys.stderr.write("%s(%s) : %s : %s\n" % (status, http_response_status_code[status], response.reason, params))
else:
sys.stderr.write("%s(%s) : %s : %s\n" % (status, http_response_status_code['000'], response.reason, params))
if response.status == 200:
i = 10
data = response.read()
conn.close()
return data
elif response.status == 503:
sys.stderr.write("Retry in %d seconds...\n" % string.atoi(response.getheader("Retry-After","%d" % (i*i))))
time.sleep(string.atoi(response.getheader("Retry-After","%d" % (i*i))))
elif response.status == 302:
sys.stderr.write("Redirecting...\n")
server = response.getheader("Location").split("/")[2]
script = "/" + string.join(response.getheader("Location").split("/")[3:],"/")
else:
sys.stderr.write("Retry in 10 seconds...\n")
time.sleep(10)
sys.stderr.write("Harvesting interrupted (after 10 attempts) at %s: %s\n" % (time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())),params)
sys.exit(1)
def main():
"Main"
try:
opts, args = getopt.getopt(sys.argv[1:],"hVo:v:m:p:i:s:f:u:r:x:",
[
"help",
"version",
"output",
"verb",
"method",
"metadataPrefix",
"identifier",
"set",
"from",
"until",
"resumptionToken"
]
)
except getopt.error:
help()
sys.exit(1)
http_param_dict = {}
method = "POST"
output = ""
stylesheet = ""
# get options and arguments
for opt, opt_value in opts:
if opt == "-v":
http_param_dict['verb'] = opt_value
elif opt == "-m":
if opt_value == "GET" or opt_value == "POST":
method = opt_value
elif opt == "-p":
http_param_dict['metadataPrefix'] = opt_value
elif opt == "-i":
http_param_dict['identifier'] = opt_value
elif opt == "-s":
http_param_dict['set'] = opt_value
elif opt == "-f":
http_param_dict['from'] = opt_value
elif opt == "-u":
http_param_dict['until'] = opt_value
elif opt == "-r":
http_param_dict['resumptionToken'] = opt_value
elif opt == "-o":
output = opt_value
elif opt == "-x":
stylesheet = opt_value
elif opt in ["-V", "--version"]:
print __version__
sys.exit(0)
else:
help()
sys.exit()
if len(args) > 0:
server = args[0].split("/")[2]
script = "/" + string.join(args[0].split("/")[3:],"/")
OAI_Session(server, script, http_param_dict, method, output, stylesheet)
sys.stderr.write("Harvesting successfully completed at: %s\n\n" % time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
else:
help()
sys.exit()
if __name__ == "__main__":
    # Start the harvester only when the file is executed as a script.
    main()
</protect>

Event Timeline