Page MenuHomec4science

bibformat_bfx_engine.py
No OneTemporary

File Metadata

Created
Fri, May 10, 09:17

bibformat_bfx_engine.py

## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BFX formatting engine.
For API: see format_with_bfx() docstring below.
"""
__revision__ = "$Id$"
import re
from xml.dom import minidom, Node
from xml.sax import saxutils
from invenio.bibformat_engine import BibFormatObject, get_format_element, eval_format_element
from invenio.bibformat_bfx_engine_config import CFG_BIBFORMAT_BFX_LABEL_DEFINITIONS, CFG_BIBFORMAT_BFX_TEMPLATES_PATH
from invenio.bibformat_bfx_engine_config import CFG_BIBFORMAT_BFX_FORMAT_TEMPLATE_EXTENSION, CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE
from invenio.bibformat_bfx_engine_config import CFG_BIBFORMAT_BFX_ERROR_MESSAGES, CFG_BIBFORMAT_BFX_WARNING_MESSAGES
# Regular expression used to split a label address into its named parts:
#   parent - an optional run of lowercase letters/underscores, optionally followed by ':'
#   tag    - an optional run of word characters, digits, '_' or '?', after an optional '/'
#   code   - at most one word character, '_' or '?', after an optional '/'
#   reg    - everything that remains after an optional '#'
# Every part is optional, so the pattern matches any string (possibly with all groups empty).
address_pattern = r'(?P<parent>[a-z_]*):?/?(?P<tag>[0-9_?\w]*)/?(?P<code>[\w_?]?)#?(?P<reg>.*)'
def format_with_bfx(recIDs, out_file, template_name, preprocess=None):
    '''
    Format a set of records according to a BFX template.
    This is the main entry point to the BFX engine.
    @param recIDs a list of record IDs to format
    @param out_file an object to write in; this can be every object which has a 'write' method: file, req, StringIO
    @param template_name the file name of the BFX template without the path and the .bfx extension
    @param preprocess an optional function; every record is passed through this function for initial preprocessing before formatting
    @return None; nothing is written when the template cannot be loaded
    '''
    trans = MARCTranslator(CFG_BIBFORMAT_BFX_LABEL_DEFINITIONS)
    trans.set_record_ids(recIDs, preprocess)
    parser = BFXParser(trans)
    template_tree = parser.load_template(template_name)
    if template_tree is None:
        # load_template() has already reported why no start template was found;
        # walking None would only fail on its missing childNodes attribute.
        return None
    parser.walk(template_tree, out_file)
    return None
class BFXParser:
    '''
    A general-purpose parser for generating xml/xhtml/text output based on a template system.
    Must be initialised with a translator. A translator is like a blackbox that returns values, calls functions, etc...
    Works with every translator supporting the following simple interface:
    - is_defined(name)
    - get_value(name)
    - iterator(name)
    - call_function(func_name, list_of_parameters)
    Customized for MARC to XML conversion through the use of a MARCTranslator.
    Templates are strict XML files. They are built by combining any tags with the
    special BFX tags living in the http://cdsware.cern.ch/invenio/ namespace.
    Easily extensible by tags of your own: add the tag name to known_operators
    and define a matching ctl_<name>(node, out_file) method.
    Defined tags:
    - template: defines a template
    - template_ref: a reference to a template
    - loop structure
    - if, then, elif, else structure
    - text: output text
    - field: query translator for field 'name'
    - element: call external functions
    '''

    # Comparison attributes of <if>/<elif>, listed in the order in which they
    # take precedence when several of them are present on the same node.
    # Operands are compared as floats when both look numeric, else as strings.
    _COMPARISONS = (
        ('eq', lambda left, right: left == right),
        ('neq', lambda left, right: left != right),
        ('lt', lambda left, right: left < right),
        ('gt', lambda left, right: left > right),
        ('le', lambda left, right: left <= right),
        ('ge', lambda left, right: left >= right),
    )

    def __init__(self, translator):
        '''
        Create an instance of the BFXParser class. Initialize with a translator.
        The BFXparser makes queries to the translator for the values of certain names.
        For the communication it uses the following translator methods:
        - is_defined(name)
        - iterator(name)
        - get_value(name, [display_specifier])
        @param translator the translator used by the class instance
        '''
        self.translator = translator
        self.known_operators = ['style', 'format', 'template', 'template_ref', 'text', 'field',
                                'element', 'loop', 'if', 'then', 'else', 'elif']
        self.flags = {}      # store flags here; 'exec' selects execution (True) or test mode (False)
        self.templates = {}  # store templates and formats here, keyed by name
        # the name of the template from which the 'execution' starts;
        # this is usually a format or the only template found in a doc
        self.start_template_name = None

    def load_template(self, template_name):
        '''
        Load a BFX template file.
        A template file can have one of two forms:
        - it is a file with a single template. Root tag is 'template'.
          In an API call the single template element is 'executed'.
        - it is a 'style' file which contains exactly one format and zero or more templates.
          Root tag is 'style' with children 'format' and 'template'(s).
          In this case only the format code is 'executed'. Naturally, in it, it would have
          references to other templates in the document.
        @param template_name the name of the BFX template, the same as the name of the filename without the extension
        @return the DOM node of the start template, or None when no start template can be determined
        '''
        template_file_name = (CFG_BIBFORMAT_BFX_TEMPLATES_PATH + '/' + template_name + '.'
                              + CFG_BIBFORMAT_BFX_FORMAT_TEMPLATE_EXTENSION)
        doc = minidom.parse(template_file_name)
        # walk the document in test mode: this only registers templates and formats
        self.flags['exec'] = False
        self.walk(doc)
        if not self.start_template_name:
            print(CFG_BIBFORMAT_BFX_WARNING_MESSAGES['WRN_BFX_NO_FORMAT_FOUND'])
            # no format found: fall back to the template if there is exactly one
            if len(self.templates) > 1:
                print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TOO_MANY_TEMPLATES'])
                return None
            if not self.templates:
                print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_TEMPLATES_FOUND'])
                return None
            self.start_template_name = list(self.templates.keys())[0]
        start_template = self.templates[self.start_template_name]['node']
        self.flags['exec'] = True
        return start_template

    def parse_attribute(self, expression):
        '''
        A function to check if an expression is of the special form [!name:display].
        A short form for saying <bx:field name="name" display="tag">, used in element attributes.
        @param expression a string, usually taken from an attribute value
        @return if the string is special, parse it and return the corresponding (XML-escaped) value;
                else return the initial expression
        '''
        match = re.match(r'\[!(?P<tmp>[\w_.:]*)\]', expression)
        if not match:
            return expression
        parts = match.group('tmp').split(':')
        display = ''
        if len(parts) == 2:
            display = parts[1]
        return xml_escape(self.translator.get_value(parts[0], display))

    def walk(self, parent, out_file=None):
        '''
        Walk a template DOM tree.
        The main function in the parser. It is recursively called until all the nodes are processed.
        This function is used in two different ways:
        - for initial loading of the template (and validation)
        - for 'execution' of a format/template
        The different behaviour is achieved through the use of flags, which can be set to True or False.
        @param parent a node to process; in an API call this is the root node
        @param out_file an object to write to; must have a 'write' method; None disables output
        @return None
        '''
        for node in parent.childNodes:
            if node.nodeType == Node.TEXT_NODE:
                self._write(out_file, get_node_value(node).strip())
            if node.nodeType == Node.ELEMENT_NODE:
                if get_node_namespace(node) != CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE:
                    self._walk_output_element(node, out_file)
                else:
                    self._dispatch_operator(node, out_file)
        return None

    def _write(self, out_file, text):
        '''
        Write text to out_file unless no output object was given.
        '''
        if out_file is not None:
            out_file.write(text)

    def _walk_output_element(self, node, out_file):
        '''
        Copy an element outside the BFX namespace to the output and walk its subnodes.
        '''
        name = get_node_name(node)
        attributes = get_node_attributes(node)
        # attribute values may use the [!name:display] short form
        for key in list(attributes.keys()):
            attributes[key] = self.parse_attribute(attributes[key])
        if node_has_subelements(node):
            if out_file is not None:
                out_file.write(create_xml_element(name=name, attrs=attributes, element_type=xmlopen))
            self.walk(node, out_file)
            if out_file is not None:
                out_file.write(create_xml_element(name=name, element_type=xmlclose))
        elif out_file is not None:
            out_file.write(create_xml_element(name=name, attrs=attributes, element_type=xmlempty))

    def _dispatch_operator(self, node, out_file):
        '''
        Hand a BFX-namespace element over to its ctl_<localName> method.
        '''
        operator_name = node.localName
        if operator_name not in self.known_operators:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_INVALID_OPERATOR_NAME'] % (get_node_name(node),))
            return
        handler = getattr(self, 'ctl_' + operator_name, None)
        if handler is None:
            print('Note for programmer: you haven\'t implemented operator %s.' % (get_node_name(node),))
            return
        handler(node, out_file)

    def ctl_style(self, node, out_file):
        '''
        Process a style root node.
        Does nothing in execution mode; in test mode the subnodes are walked.
        '''
        if not self.flags['exec']:
            self.walk(node, out_file)
        return None

    def ctl_format(self, node, out_file):
        '''
        Process a format node.
        Get name, description and content attributes.
        The format becomes the start template; registration itself is done by ctl_template.
        This function does something only in test mode.
        '''
        if self.flags['exec']:
            return None
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEMPLATE_NO_NAME'])
            return None
        name = attrs['name']
        if name in self.templates:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_DUPLICATE_NAME'] % (name,))
            return None
        self.start_template_name = name
        self.ctl_template(node, out_file)
        return None

    def ctl_template(self, node, out_file):
        '''
        Process a template node.
        Get name, description and content attributes.
        Register name and store for later calls from template_ref.
        This function does something only in test mode.
        '''
        if self.flags['exec']:
            return None
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEMPLATE_NO_NAME'])
            return None
        name = attrs['name']
        if name in self.templates:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_DUPLICATE_NAME'] % (name,))
            return None
        self.templates[name] = {'node': node}
        # description and content type are optional: warn and fall back to defaults
        if 'description' in attrs:
            description = attrs['description']
        else:
            description = ''
            print(CFG_BIBFORMAT_BFX_WARNING_MESSAGES['WRN_BFX_TEMPLATE_NO_DESCRIPTION'])
        self.templates[name]['description'] = description
        if 'content' in attrs:
            content_type = attrs['content']
        else:
            content_type = 'text/xml'
            print(CFG_BIBFORMAT_BFX_WARNING_MESSAGES['WRN_BFX_TEMPLATE_NO_CONTENT'])
        self.templates[name]['content_type'] = content_type
        self.walk(node, out_file)
        return None

    def ctl_template_ref(self, node, out_file):
        '''
        Reference to a template registered from the same file.
        This function does something only in execution mode.
        '''
        if not self.flags['exec']:
            return None
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEMPLATE_REF_NO_NAME'])
            return None
        name = attrs['name']
        if name in self.templates:
            self.walk(self.templates[name]['node'], out_file)
        # TODO: a name that is not registered should be loaded from an external
        # template file; this is not implemented, so such references are ignored.
        return None

    def ctl_element(self, node, out_file):
        '''
        Call an external element (written in Python).
        The 'name' attribute selects the element; all other attributes are passed as parameters.
        '''
        if not self.flags['exec']:
            return None
        parameters = get_node_attributes(node)
        if 'name' not in parameters:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_ELEMENT_NO_NAME'])
            return None
        function_name = parameters['name']
        del parameters['name']
        if function_name:
            value = self.translator.call_function(function_name, parameters)
            self._write(out_file, xml_escape(value))
        return None

    def ctl_field(self, node, out_file):
        '''
        Get the value of a field by its name and write it XML-escaped.
        '''
        if not self.flags['exec']:
            return None
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_FIELD_NO_NAME'])
            return None
        display = ''
        if 'display' in attrs:
            display = attrs['display']
        var = attrs['name']
        if not self.translator.is_defined(var):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (var,))
            return None
        value = self.translator.get_value(var, display)
        self._write(out_file, xml_escape(value))
        return None

    def ctl_text(self, node, out_file):
        '''
        Output the 'value' attribute as text, without XML escaping.
        A literal backslash-n sequence in the value is turned into a newline.
        '''
        if not self.flags['exec']:
            return None
        attrs = get_node_attributes(node)
        if 'value' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEXT_NO_VALUE'])
            return None
        value = attrs['value'].replace(r'\n', '\n')
        if isinstance(value, type(u'')):
            value = value.encode('utf-8')
        self._write(out_file, value)
        return None

    def ctl_loop(self, node, out_file):
        '''
        Loop through a set of values: the subnodes are walked once for
        every item produced by the translator iterator of the 'object' name.
        '''
        if not self.flags['exec']:
            self.walk(node, out_file)
            return None
        attrs = get_node_attributes(node)
        if 'object' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_LOOP_NO_OBJECT'])
            return None
        name = attrs['object']
        if not self.translator.is_defined(name):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (name,))
            return None
        # the iterator changes the translator state; the yielded item itself is not needed
        for dummy in self.translator.iterator(name):
            self.walk(node, out_file)
        return None

    def _evaluate_condition(self, value, attrs):
        '''
        Evaluate the condition of an if/elif node.
        @param value the stripped value of the tested variable
        @param attrs the attributes of the if/elif node
        @return a true or false value
        @raise re.error when the 'like' attribute is not a valid regular expression
        '''
        for attr_name, compare in self._COMPARISONS:
            if attr_name in attrs:
                pattern = attrs[attr_name]
                if is_number(pattern) and is_number(value):
                    return compare(float(value), float(pattern))
                return compare(value, pattern)
        if 'in' in attrs:
            return value in attrs['in'].split()
        if 'nin' in attrs:
            return value not in attrs['nin'].split()
        if 'like' in attrs:
            return re.compile(attrs['like']).match(value)
        # simple form: True if non-empty, otherwise False
        return value

    def ctl_if(self, node, out_file):
        '''
        An if/then/elif/.../elif/else construct.
        'If' can have several forms:
        <if name="var"/> : True if var is non-empty, eval as string
        <if name="var" eq="value"/> : True if var=value, try to eval as num, else eval as string
        <if name="var" neq="value"/> : True if var!=value, try to eval as num, else eval as string
        <if name="var" lt="value"/> : True if var<value, try to eval as num, else eval as string
        <if name="var" gt="value"/> : True if var>value, try to eval as num, else eval as string
        <if name="var" le="value"/> : True if var<=value, try to eval as num, else eval as string
        <if name="var" ge="value"/> : True if var>=value, try to eval as num, else eval as string
        <if name="var" in="val1 val2"/> : True if var in [val1, val2], eval as string
        <if name="var" nin="val1 val2"/> : True if var not in [val1, val2], eval as string
        <if name="var" like="regexp"/> : Match against a regular expression
        An 'elif' is nested inside its 'if' and is evaluated like an 'if' of its own.
        Example:
        <if name="author" eq="Pauli">
          <then>Pauli</then>
          <elif name="author" eq="Einstein">
            <then>Einstein</then>
            <else>other</else>
          </elif>
        </if>
        '''
        if not self.flags['exec']:
            self.walk(node, out_file)
            return None
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_IF_NO_NAME'])
            return None
        var = attrs['name']
        if not self.translator.is_defined(var):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (var,))
            return None
        value = self.translator.get_value(var).strip()
        try:
            result = self._evaluate_condition(value, attrs)
        except re.error:
            # Without a valid pattern there is no result to branch on, so the
            # whole construct is skipped, as for the other errors above.
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_INVALID_RE'] % (attrs['like'],))
            return None
        # validate subnodes
        then_node = get_node_subelement(node, 'then', CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE)
        else_node = get_node_subelement(node, 'else', CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE)
        elif_node = get_node_subelement(node, 'elif', CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE)
        # having else and elif siblings at the same time is a syntax error
        if (else_node is not None) and (elif_node is not None):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_IF_WRONG_SYNTAX'])
            return None
        # now walk appropriate nodes, according to the result
        if result:
            # todo: add short form, without 'then', just elements within if statement
            # to walk on 'true' and no 'elif' or 'else' elements
            if then_node:
                self.walk(then_node, out_file)
        elif elif_node:
            self.ctl_if(elif_node, out_file)
        elif else_node:
            self.walk(else_node, out_file)
        return None

    def _ctl_misplaced_branch(self, node, out_file):
        '''
        Shared handler for then/else/elif nodes reached directly from walk.
        In test mode the subnodes are walked; in execution mode this is a syntax error.
        '''
        if not self.flags['exec']:
            self.walk(node, out_file)
            return None
        print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_IF_WRONG_SYNTAX'])
        return None

    def ctl_then(self, node, out_file):
        '''
        Calling 'then' directly from the walk function means a syntax error.
        '''
        return self._ctl_misplaced_branch(node, out_file)

    def ctl_else(self, node, out_file):
        '''
        Calling 'else' directly from the walk function means a syntax error.
        '''
        return self._ctl_misplaced_branch(node, out_file)

    def ctl_elif(self, node, out_file):
        '''
        Calling 'elif' directly from the walk function means a syntax error.
        '''
        return self._ctl_misplaced_branch(node, out_file)
class MARCTranslator:
    '''
    A translator answering the queries of the BFXParser from MARC records.
    State is kept in self.memory, a dictionary with one entry per defined name:
    memory[name]
    [name]['addresses'] - the set of rules for each of the defined names
    [name]['parent'] - the name of the parent; '' if none;
    [name]['children'] - a list with the name of the children of every variable
    [name]['object'] - stored state of object for performance efficiency
    The reserved name 'record' has an entry without 'addresses'.
    '''

    def __init__(self, labels=None):
        '''
        Create an instance of the translator and init with the list of the defined labels and their rules.
        @param labels a dictionary mapping every name to the list of its addresses;
               the dictionary and its lists are not modified
        '''
        if labels is None:
            labels = {}
        self.recIDs = []
        self.recID = 0
        self.recID_index = 0
        self.record = None
        # set here as well, because iterator('record') reads it even when
        # set_record_ids() has not been called
        self.preprocess = None
        self.memory = {}
        expr = re.compile(address_pattern)
        for name in labels.keys():
            self.memory[name] = {
                'object': None,
                'parent': '',
                'children': [],
                # A private copy: the addresses are rewritten below, and doing that
                # on the caller's lists would strip the parent names from the shared
                # label definitions for every translator created afterwards.
                'addresses': list(labels[name]),
            }
        for name in self.memory:
            addresses = self.memory[name]['addresses']
            for i, address in enumerate(addresses):
                match = expr.match(address)
                if not match:
                    print('Invalid address:  %s %s' % (name, address))
                    continue
                parent_name = match.group('parent')
                if not parent_name:
                    continue
                if parent_name not in self.memory:
                    print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (parent_name,))
                    continue
                self.memory[name]['parent'] = parent_name
                # now make parent aware of children
                children = self.memory[parent_name]['children']
                if name not in children:
                    children.append(name)
                # replace the parent name by as many slashes as the level of the parent
                level = self.determine_level(parent_name)
                addresses[i] = address.replace(parent_name, '/' * level)
        # special case 'record'
        self.memory['record'] = {'object': None, 'parent': '', 'children': []}

    def _reset_objects(self):
        '''
        Forget all cached objects; they are all derived from the current record.
        '''
        for entry in self.memory.values():
            entry['object'] = None

    def set_record_ids(self, recIDs, preprocess=None):
        '''
        Initialize the translator with the set of record IDs.
        The first record, if any, is loaded and becomes the current record.
        @param recIDs a list of the record IDs
        @param preprocess an optional function which acts on every record structure after creating it
        This can be used to enrich the record with fields not present in the record initially,
        verify the record data or whatever plausible.
        Another solution is to use external function elements.
        '''
        self.record = None
        self.recIDs = recIDs
        self.preprocess = preprocess
        # objects cached for a previously loaded record are no longer valid
        self._reset_objects()
        if self.recIDs:
            self.recID_index = 0
            self.recID = self.recIDs[self.recID_index]
            self.record = get_record(self.recID)
            if self.preprocess:
                self.preprocess(self.record)
        return None

    def determine_level(self, name):
        '''
        Determine the type of the variable, whether this is an instance or a subfield.
        This is done by observing the first provided address for the name.
        todo: define variable types in config file, remove this function, results in a clearer concept
        @param name the name of the variable
        @return 2 for a subfield, 1 for an instance, 0 when it cannot be determined
        '''
        level = 0  # default value
        if name not in self.memory:
            return level
        addresses = self.memory[name]['addresses']
        if not addresses:
            return level
        match = re.compile(address_pattern).match(addresses[0])
        if match:
            if match.group('reg') or match.group('code'):
                level = 2  # subfield
            elif match.group('tag'):
                level = 1  # instance
        return level

    #=========================================
    #API functions for querying the translator
    #=========================================
    def is_defined(self, name):
        '''
        Check whether a variable is defined.
        @param name the name of the variable
        '''
        return name in self.memory

    def get_num_elements(self, name):
        '''
        An API function to get the number of elements for a variable.
        Don't use this function to build loops, Use iterator instead.
        '''
        if name == 'record':
            return len(self.recIDs)
        num = 0
        for dummy in self.iterator(name):
            num = num + 1
        return num

    def get_value(self, name, display_type='value'):
        '''
        The API function for querying the translator for values of a certain variable.
        Called in a loop will result in a different value each time.
        Objects are cached in memory, so subsequent calls for the same variable take less time.
        @param name the name of the variable you want the value of
        @param display_type an optional value for the type of the desired output, one of: value, tag, ind1, ind2, code, fulltag;
        These can be easily added in the proper place of the code (display_record)
        '''
        if name == 'record':
            return ''
        return self.display_record(self.get_object(name), display_type)

    def iterator(self, name):
        '''
        An iterator over the values of a certain name.
        The iterator changes state of internal variables and objects.
        When calling get_value in a loop, this will result each time in a different value.
        '''
        if name == 'record':
            for rec_id in self.recIDs:
                self.recID = rec_id
                self.record = get_record(self.recID)
                if self.preprocess:
                    self.preprocess(self.record)
                # what was cached belongs to the previous record; without this
                # get_value() would keep returning values of that record
                self._reset_objects()
                yield str(self.recID)
        else:
            full_object = self.build_object(name)
            level = self.determine_level(name)
            for new_object in record_parts(full_object, level):
                self.memory[name]['object'] = new_object
                # parent has changed state; also set childs state to None;
                for children_name in self.memory[name]['children']:
                    self.memory[children_name]['object'] = None
                yield new_object
            # the result for a call of the same name after an iterator should be
            # the same as if there was no iterator called before
            self.memory[name]['object'] = None

    def call_function(self, function_name, parameters=None):
        '''
        Call an external element which is a Python file, using BibFormat
        @param function_name the name of the function to call
        @param parameters a dictionary of the parameters to pass as key=value pairs
        @return a string value, which is the result of the function call
        '''
        if parameters is None:
            parameters = {}
        bfo = BibFormatObject(self.recID)
        format_element = get_format_element(function_name)
        (value, dummy_errors) = eval_format_element(format_element, bfo, parameters)
        # to do: check errors from function call
        return value
    #========================================
    #end of API functions
    #========================================

    def get_object(self, name):
        '''
        Responsible for creating the desired object, corresponding to provided name.
        If object is not cached in memory, it is build again.
        Directly called by API function get_value.
        The result is then formatted by display_record according to display_type.
        '''
        cached = self.memory[name]['object']
        if cached is not None:
            return cached
        new_object = self.build_object(name)
        # if you have reached here you are not in an iterator; return first non-empty
        for part in record_parts(new_object, self.determine_level(name)):
            if part:
                new_object = part
                break
        self.memory[name]['object'] = new_object
        return new_object

    def build_object(self, name):
        '''
        Build the object from the list of addresses
        A slave function for get_object.
        Every address is applied to the object of the parent if the name has one,
        else to the current record; the results are merged.
        '''
        parent_name = self.memory[name]['parent']
        new_object = {}
        for address in self.memory[name]['addresses']:
            if parent_name:
                source = self.get_object(parent_name)  # already returns the parents instance
            else:
                source = self.record
            new_object = merge(new_object, copy(source, address))
        return new_object

    def display_record(self, record, display_type='value'):
        '''
        Decide what the final output value is according to the display_type.
        Only the first tag (in sorted order), its first instance and the first
        code (in sorted order) of that instance are looked at.
        @param record the record structure to display; this is most probably just a single subfield
        @param display_type a string specifying the desired output; can be one of: value, tag, ind1, ind2, code, fulltag
        @return a string to output
        '''
        tag, ind1, ind2, code, value = '', '', '', '', ''
        if record:
            tags = list(record.keys())
            tags.sort()
            if tags:
                fulltag = tags[0]
                tag, ind1, ind2 = fulltag[0:3], fulltag[3:4], fulltag[4:5]
                field_instances = record[fulltag]
                if field_instances:
                    field_instance = field_instances[0]
                    codes = list(field_instance.keys())
                    codes.sort()
                    if codes:
                        code = codes[0]
                        value = field_instance[code]
        if not display_type:
            display_type = 'value'
        output = ''
        if display_type == 'value':
            output = value
        elif display_type == 'tag':
            output = tag
        elif display_type == 'ind1':
            output = ind1.replace('_', '')
        elif display_type == 'ind2':
            output = ind2.replace('_', '')
        elif display_type == 'code':
            output = code
        elif display_type == 'fulltag':
            output = tag + ind1 + ind2
        else:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_INVALID_DISPLAY_TYPE'] % (display_type,))
        return output
'''
Functions for use with the structure representing a MARC record defined here.
This record structure differs from the one defined in bibrecord.
The reason is that we want a symmetry between controlfields and datafields.
In this format controlfields are represented internally as a subfield value with code ' ' of a datafield.
This allows for easier handling of the fields.
However, there is a restriction associated with this structure and it is that subfields cannot be repeated
in the same instance. If this is the case, the result will be incorrect.
The record structure has the form:
fields={field_tag:field_instances}
field_instances=[field_instance]
field_instance={field_code:field_value}
'''
def convert_record(old_record):
    '''
    Convert a record from the format defined in bibrecord to the format defined here.
    Controlfields become a single instance with the value stored under code ' ';
    datafields are stored under the tag extended by the two indicators ('_' when empty).
    @param old_record the record as returned from bibrecord.create_record()
    @return a record of the new form
    '''
    fields = {}
    old_tags = list(old_record.keys())
    old_tags.sort()
    for old_tag in old_tags:
        # NOTE(review): tags up to and including '010' are handled as controlfields,
        # and only their first instance is used - confirm this threshold is intended.
        if int(old_tag) < 11:
            fields[old_tag] = [{' ': old_record[old_tag][0][3]}]
            continue
        for old_field_instance in old_record[old_tag]:
            ind1 = old_field_instance[1] or '_'
            ind2 = old_field_instance[2] or '_'
            new_tag = old_tag + ind1 + ind2
            new_field_instance = {}
            for old_subfield in old_field_instance[0]:
                new_code, new_value = old_subfield[0], old_subfield[1]
                # a repeated code is reported, then the later value wins
                if new_code in new_field_instance:
                    print('Error: Repeating subfield codes in the same instance!')
                new_field_instance[new_code] = new_value
            fields.setdefault(new_tag, []).append(new_field_instance)
    return fields
def get_record(recID):
    '''
    Fetch the record with the given ID through a BibFormatObject and
    convert it to the record structure defined in this module.
    @param recID the ID of the record
    @return a record in the structure defined here
    '''
    return convert_record(BibFormatObject(recID).get_record())
def print_record(record):
    '''
    Print a record, one line per field instance, ordered by tag.
    @param record a record in the structure defined in this module
    '''
    sorted_tags = list(record.keys())
    sorted_tags.sort()
    for tag in sorted_tags:
        for field_instance in record[tag]:
            print('%s %s' % (tag, field_instance))
def record_fields_value(record, tag, subfield):
    '''
    Collect the values stored under a subfield code in all instances of a tag.
    Works on subfield level.
    @param record a record
    @param tag a 3 or 5 letter tag; required
    @param subfield a subfield code; required
    @return the list of values, empty when the tag or the code is not present
    '''
    if tag not in record:
        return []
    return [instance[subfield] for instance in record[tag] if subfield in instance]
def record_add_field_instance(record, tag, field_instance):
    '''
    Put a field_instance in front of the instances already stored for a tag.
    The record is changed in place; the tag is created when missing.
    @param record a record
    @param tag a 3 or 5 letter tag; required
    @param field_instance the field instance to add
    @return None
    '''
    record[tag] = [field_instance] + record.get(tag, [])
    return None
def record_num_parts(record, level):
    '''
    Count the number of instances or the number of subfields in the whole record.
    @param record
    @param level either 1 or 2
    level=1 - view record on instance level
    level=2 - view record on subfield level
    @return the number of parts
    '''
    num = 0
    for dummy in record_parts(record, level):
        num = num + 1
    # the count used to be computed and then dropped, so callers always got None
    return num


def record_parts(record, level):
    '''
    An iterator over the instances or subfields of a record.
    Tags are visited in sorted order; on subfield level the codes of an
    instance are visited in sorted order as well.
    @param record
    @param level either 1 or 2
    level=1 - iterate over instances
    level=2 - iterate over subfields
    Any other level yields nothing.
    @yield a record structure representing the part (instance or subfield)
    '''
    if level not in (1, 2):
        return
    names = list(record.keys())
    names.sort()
    for name in names:
        for old_field_instance in record[name]:
            if level == 1:
                # a copy, so the yielded part is independent of the record
                yield {name: [dict(old_field_instance)]}
            else:
                old_field_codes = list(old_field_instance.keys())
                old_field_codes.sort()
                for old_field_code in old_field_codes:
                    yield {name: [{old_field_code: old_field_instance[old_field_code]}]}
def copy(old_record, address=''):
    '''
    Copy a record, keeping only the parts of the old record selected by address.
    (A better name for the function is filter.)
    A '?' in the tag or code part of the address is a wildcard; a missing tag or
    code part selects everything. When the address has a regexp part, every
    kept subfield value is passed through pass_through_regexp.
    @param old_record the initial record
    @param address an address; for examples see bibformat_bfx_engine_config.
    If no address is specified, the whole initial record is copied.
    @return the filtered record; {} when old_record is empty
    '''
    if not old_record:
        return {}
    tag_pattern = code_pattern = reg_pattern = ''
    match = re.compile(address_pattern).match(address)
    if match:
        tag_pattern, code_pattern, reg_pattern = match.group('tag', 'code', 'reg')
    if tag_pattern:
        tag_pattern = tag_pattern.replace('?', r'[0-9_\w]')
    else:
        tag_pattern = r'.*'
    if code_pattern:
        code_pattern = code_pattern.replace('?', r'[\w ]')
    else:
        code_pattern = r'.*'
    tag_expr = re.compile(tag_pattern)
    code_expr = re.compile(code_pattern)
    new_record = {}
    for tag in old_record.keys():
        tag_match = tag_expr.match(tag)
        # the tag has to be matched completely, not only at its beginning
        if not tag_match or tag_match.end() != len(tag):
            continue
        kept_instances = []
        for old_field_instance in old_record[tag]:
            kept_instance = {}
            for field_code in old_field_instance.keys():
                if code_expr.match(field_code):
                    kept_instance[field_code] = old_field_instance[field_code]
            # instances and tags left without any subfield are dropped
            if kept_instance:
                kept_instances.append(kept_instance)
        if kept_instances:
            new_record[tag] = kept_instances
    # in new_record pass all subfields through regexp
    if reg_pattern:
        for tag in new_record:
            for field_instance in new_record[tag]:
                for field_code in list(field_instance.keys()):
                    field_instance[field_code] = pass_through_regexp(field_instance[field_code], reg_pattern)
    return new_record
def merge(record1, record2):
    '''
    Merge two records.
    The result starts as a copy of record1 (made with copy()).
    Datafields (tags of length 5) of record2 are appended to it.
    Controlfields (tags of length 3) of record2 are added only if the tag is
    not present yet: controlfields with the same tag in record2 as in record1
    are ignored.
    Tags of any other length and empty field instances of record2 are skipped.
    @param record1, record2 the records to merge; either may be empty or None
    @return the merged record
    '''
    new_record = {}
    if record1:
        new_record = copy(record1)
    if not record2:
        return new_record
    for tag in record2.keys():
        # copy every non-empty instance so that the merged record does not
        # share instance dictionaries with record2
        new_field_instances = [dict(old_field_instance)
                               for old_field_instance in record2[tag]
                               if old_field_instance]
        if not new_field_instances:
            continue
        if len(tag) == 3:
            #controlfield: if controlfields conflict, leave first
            if tag not in new_record:
                new_record[tag] = new_field_instances
        elif len(tag) == 5:
            #datafield: always append
            new_record.setdefault(tag, []).extend(new_field_instances)
    return new_record
#======================
#Help functions
#=====================
# Values for the element_type parameter of create_xml_element()
xmlopen = 1     # <element attr="attr_value">
xmlclose = 2    # </element>
xmlfull = 3     # <element attr="attr_value">value</element>
xmlempty = 4    # <element attr="attr_value"/>


def _to_native_string(text):
    '''
    Encode a unicode object in utf-8 where unicode and str are different types.
    Any other value, and unicode where str itself is the unicode type, is
    returned unchanged.
    '''
    if isinstance(text, type(u'')) and not isinstance(text, str):
        return text.encode('utf-8')
    return text


def _format_xml_attributes(attrs):
    '''
    Render a dictionary of attributes as a sequence of ' name="value"' pairs.
    Values are formatted with %s, so non-string values are accepted.
    '''
    output = ''
    for attrname in attrs.keys():
        output += ' %s="%s"' % (_to_native_string(attrname),
                                _to_native_string(attrs[attrname]))
    return output


def create_xml_element(name, value='', attrs=None, element_type=xmlfull, level=0):
    '''
    Create a XML element as string.
    Neither the value nor the attribute values are escaped here; escape them
    beforehand if needed.
    @param name the name of the element
    @param value the element value; default is ''. Only used for xmlfull.
    @param attrs a dictionary with the element attributes
    @param element_type a constant which defines the type of the output
                        xmlopen = 1 <element attr="attr_value">
                        xmlclose = 2 </element>
                        xmlfull = 3 <element attr="attr_value">value</element>
                        xmlempty = 4 <element attr="attr_value"/>
                        any other value produces no element text
    @param level the number of spaces to put in front of the element
    @return a formatted XML string; unicode is encoded in utf-8 where
            unicode and str are different types
    '''
    if attrs is None:
        attrs = {}
    # bring every piece to the same string type before concatenating, so that
    # unicode and utf-8 encoded byte strings are never mixed
    name = _to_native_string(name)
    output = ''
    if element_type == xmlempty:
        output = '<' + name + _format_xml_attributes(attrs) + ' />'
    elif element_type == xmlfull:
        output = '<' + name + _format_xml_attributes(attrs) + '>'
        output += _to_native_string(value)
        output += '</' + name + '>'
    elif element_type == xmlopen:
        # attributes go through the same formatting as for the other types
        output = '<' + name + _format_xml_attributes(attrs) + '>'
    elif element_type == xmlclose:
        output = '</' + name + '>'
    return _to_native_string(' ' * level + output)
def xml_escape(value):
    '''
    Escape a string value with saxutils.escape() before it is put into XML.
    No extra entities are passed, so quote characters are left as they are.
    @param value the string value to escape
    @return the escaped value
    '''
    escaped_value = saxutils.escape(value)
    return escaped_value
def xml_unescape(value):
    '''
    Reverse xml_escape(): turn the entities handled by saxutils.unescape()
    back into plain characters.
    @param value the string value to unescape
    @return the unescaped value
    '''
    unescaped_value = saxutils.unescape(value)
    return unescaped_value
def node_has_subelements(node):
    '''
    Tell whether a node has at least one child that is an element node
    or a text node. Children of other types are not counted.
    @param node the node whose childNodes are inspected
    @return True if such a childnode exists, False otherwise.
    '''
    counted_types = (Node.ELEMENT_NODE, Node.TEXT_NODE)
    for child in node.childNodes:
        if child.nodeType in counted_types:
            return True
    return False
def get_node_subelement(parent_node, name, namespace = None):
    '''
    Find the first child element with the given local name and namespace.
    @param parent_node the node whose childNodes are searched
    @param name the local name to search for
    @param namespace An optional namespace URI. This is usually a URL: http://cdsware.cern.ch/invenio/
                     With the default None only children whose namespaceURI is None match.
    @return the found node; None otherwise
    '''
    for child in parent_node.childNodes:
        if child.nodeType != Node.ELEMENT_NODE:
            continue
        if child.localName == name and child.namespaceURI == namespace:
            return child
    return None
def get_node_value(node):
    '''
    Read the nodeValue of a node and encode it. For use with text nodes.
    @param node a text node
    @return the nodeValue of the node encoded in utf-8
    '''
    node_value = node.nodeValue
    return node_value.encode('utf-8')
def get_node_namespace(node):
    '''
    Read the namespace URI of a node. For use with element nodes.
    @param node an element node
    @return the namespaceURI attribute of the node
    '''
    namespace_uri = node.namespaceURI
    return namespace_uri
def get_node_name(node):
    '''
    Read the node name of a node. For use with element nodes.
    @param node an element node
    @return the nodeName attribute of the node
    '''
    node_name = node.nodeName
    return node_name
def get_node_attributes(node):
    '''
    Collect the attributes of an element node in a plain dictionary.
    For use with element nodes.
    @param node an element node
    @return a dictionary of the attributes as name:value pairs, the value
            being the nodeValue of the attribute node
    '''
    attribute_map = node.attributes
    return dict([(key, attribute_map.get(key).nodeValue)
                 for key in attribute_map.keys()])
def pass_through_regexp(value, regexp):
    '''
    Extract a part of a value with a regular expression.
    The regexp is matched at the beginning of the value.
    @param value a string
    @param regexp a regexp with a group 'value' in it. No group named 'value' will result in an error.
    @return the named group 'value' if the string matches the regexp, '' otherwise
    '''
    found = re.match(regexp, value)
    if not found:
        return ''
    return found.group('value')
def is_number(value):
    '''
    Check if a value is a number, i.e. if float() can convert it.
    @param value the value to check; any object is accepted
    @return True or False
    '''
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: a string which does not spell a number
        # TypeError: an object float() cannot convert at all (None, a list, ...)
        return False
    return True

Event Timeline