Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F91355736
bibformat_bfx_engine.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Nov 10, 07:11
Size
49 KB
Mime Type
text/x-python
Expires
Tue, Nov 12, 07:11 (2 d)
Engine
blob
Format
Raw Data
Handle
22229992
Attached To
R3600 invenio-infoscience
bibformat_bfx_engine.py
View Options
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
BFX formatting engine.
For API: see format_with_bfx() docstring below.
"""
__revision__
=
"$Id$"
import
re
import
copy
as
p_copy
from
xml.dom
import
minidom
,
Node
from
xml.sax
import
saxutils
from
invenio.bibformat_engine
import
BibFormatObject
,
get_format_element
,
eval_format_element
from
invenio.bibformat_bfx_engine_config
import
CFG_BIBFORMAT_BFX_LABEL_DEFINITIONS
,
CFG_BIBFORMAT_BFX_TEMPLATES_PATH
from
invenio.bibformat_bfx_engine_config
import
CFG_BIBFORMAT_BFX_FORMAT_TEMPLATE_EXTENSION
,
CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE
from
invenio.bibformat_bfx_engine_config
import
CFG_BIBFORMAT_BFX_ERROR_MESSAGES
,
CFG_BIBFORMAT_BFX_WARNING_MESSAGES
# Regular expression used to split a label address into its components.
# Named groups, in order of appearance:
#   parent - lower-case letters and underscores, optionally followed by ':'
#   tag    - digits, word characters, '_' and '?', optionally preceded and followed by '/'
#   code   - at most one word character or '?'
#   reg    - everything that remains after an optional '#'
# Every component is optional.
address_pattern = r'(?P<parent>[a-z_]*):?/?(?P<tag>[0-9_?\w]*)/?(?P<code>[\w_?]?)#?(?P<reg>.*)'
def format_with_bfx(recIDs, out_file, template_name, preprocess=None):
    '''
    Format a set of records according to a BFX template.
    This is the main entry point to the BFX engine.

    If the template file does not yield a start template (load_template()
    returns None), nothing is written to out_file.

    @param recIDs a list of record IDs to format
    @param out_file an object to write in; this can be every object which has a 'write' method: file, req, StringIO
    @param template_name the file name of the BFX template without the path and the .bfx extension
    @param preprocess an optional function; every record is passed through this function for initial preprocessing before formatting
    @return None
    '''
    trans = MARCTranslator(CFG_BIBFORMAT_BFX_LABEL_DEFINITIONS)
    trans.set_record_ids(recIDs, preprocess)
    parser = BFXParser(trans)
    template_tree = parser.load_template(template_name)
    if template_tree is None:
        # No start template could be determined; walking None would fail
        # with an AttributeError on 'childNodes'.
        return None
    parser.walk(template_tree, out_file)
    return None
class BFXParser:
    '''
    A general-purpose parser for generating xml/xhtml/text output based on a template system.
    Must be initialised with a translator. A translator is like a blackbox that returns values, calls functions, etc...
    Works with every translator supporting the following simple interface:
    - is_defined(name)
    - get_value(name)
    - iterator(name)
    - call_function(func_name, list_of_parameters)
    Customized for MARC to XML conversion through the use of a MARCTranslator.
    Templates are strict XML files. They are built by combining any tags with the
    special BFX tags living in the http://cdsware.cern.ch/invenio/ namespace.
    Easily extensible by tags of your own.
    Defined tags:
    - template: defines a template
    - template_ref: a reference to a template
    - loop structure
    - if, then, elif, else structure
    - text: output text
    - field: query translator for field 'name'
    - element: call external functions
    '''

    # BFX operators that walk() dispatches to the method named 'ctl_' + operator.
    _HANDLED_OPERATORS = ('style', 'format', 'template', 'template_ref',
                          'text', 'field', 'element', 'loop',
                          'if', 'then', 'else', 'elif')

    # Comparison attributes of <if>/<elif>, in the order in which they are
    # looked up; the first attribute present on the node decides the test.
    _COMPARISONS = (
        ('eq', lambda left, right: left == right),
        ('neq', lambda left, right: left != right),
        ('lt', lambda left, right: left < right),
        ('gt', lambda left, right: left > right),
        ('le', lambda left, right: left <= right),
        ('ge', lambda left, right: left >= right),
    )

    def __init__(self, translator):
        '''
        Create an instance of the BFXParser class. Initialize with a translator.
        The BFXparser makes queries to the translator for the values of certain names.
        For the communication it uses the following translator methods:
        - is_defined(name)
        - iterator(name)
        - get_value(name, [display_specifier])
        - call_function(func_name, parameters)
        @param translator the translator used by the class instance
        '''
        self.translator = translator
        self.known_operators = ['style', 'format', 'template', 'template_ref',
                                'text', 'field', 'element', 'loop',
                                'if', 'then', 'else', 'elif']
        self.flags = {}  # store flags here; 'exec' selects test (False) or execution (True) mode
        self.templates = {}  # store templates and formats here
        # the name of the template from which the 'execution' starts;
        # this is usually a format or the only template found in a doc
        self.start_template_name = None

    def load_template(self, template_name, template_source=None):
        '''
        Load a BFX template file.
        A template file can have one of two forms:
        - it is a file with a single template. Root tag is 'template'.
        In an API call the single template element is 'executed'.
        - it is a 'style' file which contains exactly one format and zero or more templates. Root tag is 'style' with children 'format' and 'template'(s).
        In this case only the format code is 'executed'. Naturally, in it, it would have references to other templates in the document.
        Template can be given by name (in that case search path is in
        standard directory for bfx template) or directly using the template source.
        If given, template_source overrides template_name
        @param template_name the name of the BFX template, the same as the name of the filename without the extension
        @param template_source optional XML source of the template; when not None it is parsed instead of the file
        @return a DOM tree of the template, or None when no format was found and the number of templates is not exactly one
        '''
        if template_source is None:
            template_file_name = CFG_BIBFORMAT_BFX_TEMPLATES_PATH + '/' + template_name + '.' + CFG_BIBFORMAT_BFX_FORMAT_TEMPLATE_EXTENSION
            doc = minidom.parse(template_file_name)
        else:
            doc = minidom.parseString(template_source)
        # set exec flag to false and walk document to find templates and formats
        self.flags['exec'] = False
        self.walk(doc)
        # check found templates
        if not self.start_template_name:
            if len(self.templates) != 1:
                # no formats found, templates either zero or more than one
                if len(self.templates) > 1:
                    print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TOO_MANY_TEMPLATES'])
                return None
            # no format found, but there is exactly one template: use it as default
            self.start_template_name = list(self.templates)[0]
        start_template = self.templates[self.start_template_name]['node']
        self.flags['exec'] = True
        return start_template

    def parse_attribute(self, expression):
        '''
        A function to check if an expression is of the special form [!name:display].
        A short form for saying <bx:field name="name" display="tag">, used in element attributes.
        @param expression a string, usually taken from an attribute value
        @return if the string is special, parse it and return the corresponding value (XML-escaped); else return the initial expression
        '''
        match = re.match(r'\[!(?P<tmp>[\w_.:]*)\]', expression)
        if not match:
            return expression
        parts = match.group('tmp').split(':')
        var = parts[0]
        display = ''
        if len(parts) == 2:
            display = parts[1]
        return xml_escape(self.translator.get_value(var, display))

    def walk(self, parent, out_file=None):
        '''
        Walk a template DOM tree.
        The main function in the parser. It is recursively called until all the nodes are processed.
        This function is used in two different ways:
        - for initial loading of the template (and validation)
        - for 'execution' of a format/template
        The different behaviour is achieved through the use of flags, which can be set to True or False.
        @param parent a node to process; in an API call this is the root node
        @param out_file an object to write to; must have a 'write' method
        @return None
        '''
        for node in parent.childNodes:
            if node.nodeType == Node.TEXT_NODE:
                value = get_node_value(node).strip()
                if out_file:
                    out_file.write(value)
            elif node.nodeType == Node.ELEMENT_NODE:
                if get_node_namespace(node) != CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE:
                    self._walk_plain_element(node, out_file)
                else:
                    # name is a special name, must be one of the BFX operators
                    self._dispatch_operator(node, out_file)
        return None

    def _walk_plain_element(self, node, out_file):
        '''
        Copy an element outside the BFX namespace to the output and walk its children.
        '''
        name = get_node_name(node)
        attributes = get_node_attributes(node)
        # parse all the attributes, resolving the [!name:display] short form
        for key in list(attributes):
            attributes[key] = self.parse_attribute(attributes[key])
        if node_has_subelements(node):
            if out_file:
                out_file.write(create_xml_element(name=name, attrs=attributes, element_type=xmlopen))
            self.walk(node, out_file)  # walk subnodes
            if out_file:
                out_file.write(create_xml_element(name=name, element_type=xmlclose))
        elif out_file:
            out_file.write(create_xml_element(name=name, attrs=attributes, element_type=xmlempty))

    def _dispatch_operator(self, node, out_file):
        '''
        Hand a BFX-namespace element over to its ctl_<operator> method.
        '''
        local_name = node.localName
        if local_name in self._HANDLED_OPERATORS:
            getattr(self, 'ctl_' + local_name)(node, out_file)
        elif local_name in self.known_operators:
            print('Note for programmer: you haven\'t implemented operator %s.' % (get_node_name(node)))
        else:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_INVALID_OPERATOR_NAME'] % (get_node_name(node)))

    def ctl_style(self, node, out_file):
        '''
        Process a style root node.
        Does nothing in exec mode; in test mode the children are walked.
        '''
        if not self.flags['exec']:
            self.walk(node, out_file)
        return None

    def ctl_format(self, node, out_file):
        '''
        Process a format node.
        Get name, description and content attributes.
        Registers the format name as the start template and gives control to ctl_template.
        This function does its work only in test mode.
        '''
        # exec mode
        if self.flags['exec']:
            return None
        # test mode
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEMPLATE_NO_NAME'])
            return None
        name = attrs['name']
        if name in self.templates:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_DUPLICATE_NAME'] % (name))
            return None
        self.start_template_name = name
        self.ctl_template(node, out_file)
        return None

    def ctl_template(self, node, out_file):
        '''
        Process a template node.
        Get name, description and content attributes.
        Register name and store for later calls from template_ref.
        This function does its work only in test mode.
        '''
        # exec mode
        if self.flags['exec']:
            return None
        # test mode
        attrs = get_node_attributes(node)
        # get template name
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEMPLATE_NO_NAME'])
            return None
        name = attrs['name']
        if name in self.templates:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_DUPLICATE_NAME'] % (name))
            return None
        self.templates[name] = {'node': node}
        # get template description
        if 'description' in attrs:
            description = attrs['description']
        else:
            description = ''
            print(CFG_BIBFORMAT_BFX_WARNING_MESSAGES['WRN_BFX_TEMPLATE_NO_DESCRIPTION'])
        self.templates[name]['description'] = description
        # get content-type of resulting output
        if 'content' in attrs:
            content_type = attrs['content']
        else:
            content_type = 'text/xml'
            print(CFG_BIBFORMAT_BFX_WARNING_MESSAGES['WRN_BFX_TEMPLATE_NO_CONTENT'])
        self.templates[name]['content_type'] = content_type
        # walk node
        self.walk(node, out_file)
        return None

    def ctl_template_ref(self, node, out_file):
        '''
        Reference to an external template.
        This function does its work only in execution mode. Bad references appear as run-time errors.
        Only templates already registered in self.templates are resolved;
        a name that is not registered is ignored (loading from another file is not implemented).
        '''
        # test mode
        if not self.flags['exec']:
            return None
        # exec mode
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEMPLATE_REF_NO_NAME'])
            return None
        name = attrs['name']
        # check for a template in the same file, that is in the already cached templates
        if name in self.templates:
            self.walk(self.templates[name]['node'], out_file)
        return None

    def ctl_element(self, node, out_file):
        '''
        Call an external element (written in Python).
        The 'name' attribute selects the element; all other attributes are passed as parameters.
        The XML-escaped result is written to out_file.
        '''
        # test mode
        if not self.flags['exec']:
            return None
        # exec mode
        parameters = get_node_attributes(node)
        if 'name' not in parameters:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_ELEMENT_NO_NAME'])
            return None
        function_name = parameters['name']
        del parameters['name']
        # now run external bfe_name.py, with param attrs
        if function_name:
            value = self.translator.call_function(function_name, parameters)
            out_file.write(xml_escape(value))
        return None

    def ctl_field(self, node, out_file):
        '''
        Get the value of a field by its name.
        The optional 'display' attribute is passed to the translator as display specifier.
        The XML-escaped value is written to out_file.
        '''
        # test mode
        if not self.flags['exec']:
            return None
        # exec mode
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_FIELD_NO_NAME'])
            return None
        display = ''
        if 'display' in attrs:
            display = attrs['display']
        var = attrs['name']
        if not self.translator.is_defined(var):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (var))
            return None
        value = self.translator.get_value(var, display)
        out_file.write(xml_escape(value))
        return None

    def ctl_text(self, node, out_file):
        '''
        Output a text.
        The two-character sequence backslash + 'n' in the 'value' attribute is
        turned into a newline. The text is written without XML escaping.
        '''
        # test mode
        if not self.flags['exec']:
            return None
        # exec mode
        attrs = get_node_attributes(node)
        if 'value' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_TEXT_NO_VALUE'])
            return None
        value = attrs['value'].replace(r'\n', '\n')
        if not isinstance(value, str):
            # Python 2 unicode object: write it out as UTF-8 encoded bytes
            value = value.encode('utf-8')
        out_file.write(value)
        return None

    def ctl_loop(self, node, out_file):
        '''
        Loop through a set of values.
        The children of the node are walked once for every item produced by
        the translator's iterator for the name given in the 'object' attribute.
        '''
        # test mode
        if not self.flags['exec']:
            self.walk(node, out_file)
            return None
        # exec mode
        attrs = get_node_attributes(node)
        if 'object' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_LOOP_NO_OBJECT'])
            return None
        name = attrs['object']
        if not self.translator.is_defined(name):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (name))
            return None
        # the iterator changes the translator state; the yielded item itself is not needed
        for _unused in self.translator.iterator(name):
            self.walk(node, out_file)
        return None

    def _evaluate_condition(self, attrs, value):
        '''
        Evaluate the test described by the attributes of an if/elif node.
        @param attrs the attributes of the node
        @param value the stripped value of the tested variable
        @return a value whose truth decides between the 'then' and the 'elif'/'else' branch
        Raises re.error when the 'like' attribute is not a valid regular expression.
        '''
        for attr_name, compare in self._COMPARISONS:
            if attr_name in attrs:
                pattern = attrs[attr_name]
                if is_number(pattern) and is_number(value):
                    return compare(float(value), float(pattern))
                return compare(value, pattern)
        if 'in' in attrs:
            return value in attrs['in'].split()
        if 'nin' in attrs:
            return value not in attrs['nin'].split()
        if 'like' in attrs:
            # match against a regular expression
            return re.compile(attrs['like']).match(value)
        # simple form: True if non-empty, otherwise False
        return value

    def ctl_if(self, node, out_file):
        '''
        An if/then/elif/.../elif/else construct.
        'If' can have several forms:
        <if name="var"/> : True if var is non-empty, eval as string
        <if name="var" eq="value"/> : True if var=value, try to eval as num, else eval as string
        <if name="var" neq="value"/> : True if var!=value, try to eval as num, else eval as string
        <if name="var" lt="value"/> : True if var<value, try to eval as num, else eval as string
        <if name="var" gt="value"/> : True if var>value, try to eval as num, else eval as string
        <if name="var" le="value"/> : True if var<=value, try to eval as num, else eval as string
        <if name="var" ge="value"/> : True if var>=value, try to eval as num, else eval as string
        <if name="var" in="val1 val2"/> : True if var in [val1, val2], eval as string
        <if name="var" nin="val1 val2"/> : True if var not in [val1, val2], eval as string
        <if name="var" like="regexp"/> : Match against a regular expression
        When several test attributes are present, only the first one in the order above is used.
        An invalid regular expression is reported and no branch is walked.
        Example:
        <if name="author" eq="Pauli">
        <then>Pauli</then>
        <elif name="author" eq="Einstein">
        <then>Einstein</then>
        <else>other</else>
        </elif>
        </if>
        '''
        # test mode
        if not self.flags['exec']:
            self.walk(node, out_file)
            return None
        # exec mode
        attrs = get_node_attributes(node)
        if 'name' not in attrs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_IF_NO_NAME'])
            return None
        # determine result
        var = attrs['name']
        if not self.translator.is_defined(var):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (var))
            return None
        value = self.translator.get_value(var).strip()
        try:
            result = self._evaluate_condition(attrs, value)
        except re.error:
            # only the 'like' test compiles a regular expression
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_INVALID_RE'] % (attrs['like']))
            return None
        # validate subnodes
        then_node = get_node_subelement(node, 'then', CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE)
        else_node = get_node_subelement(node, 'else', CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE)
        elif_node = get_node_subelement(node, 'elif', CFG_BIBFORMAT_BFX_ELEMENT_NAMESPACE)
        # having else and elif siblings at the same time is a syntax error
        if (else_node is not None) and (elif_node is not None):
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_IF_WRONG_SYNTAX'])
            return None
        # now walk appropriate nodes, according to the result
        if result:
            if then_node:
                self.walk(then_node, out_file)
            # todo: add short form, without 'then', just elements within if statement to walk on 'true' and no 'elif' or 'else' elements
        elif elif_node:
            self.ctl_if(elif_node, out_file)
        elif else_node:
            self.walk(else_node, out_file)
        return None

    def _ctl_misplaced_branch(self, node, out_file):
        '''
        Shared body of ctl_then, ctl_else and ctl_elif: walk the children in
        test mode, report a syntax error in exec mode.
        '''
        if not self.flags['exec']:
            self.walk(node, out_file)
            return None
        print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_IF_WRONG_SYNTAX'])
        return None

    def ctl_then(self, node, out_file):
        '''
        Calling 'then' directly from the walk function means a syntax error.
        '''
        return self._ctl_misplaced_branch(node, out_file)

    def ctl_else(self, node, out_file):
        '''
        Calling 'else' directly from the walk function means a syntax error.
        '''
        return self._ctl_misplaced_branch(node, out_file)

    def ctl_elif(self, node, out_file):
        '''
        Calling 'elif' directly from the walk function means a syntax error.
        '''
        return self._ctl_misplaced_branch(node, out_file)
class MARCTranslator:
    '''
    Translator answering the queries of a BFXParser from MARC records.

    State is kept in self.memory, a dictionary indexed by variable name:
    memory[name]
    [name]['addresses'] - the set of rules for each of the defined names
    [name]['parent'] - the name of the parent; '' if none;
    [name]['children'] - a list with the name of the children of every variable
    [name]['object'] - stored state of object for performance efficiency
    The reserved name 'record' has no 'addresses' entry.
    '''

    def __init__(self, labels=None):
        '''
        Create an instance of the translator and init with the list of the defined labels and their rules.
        @param labels a dictionary mapping each name to its list of addresses; it is deep-copied, not modified
        '''
        if labels is None:
            labels = {}
        self.recIDs = []
        self.recID = 0
        self.recID_index = 0
        self.record = None
        # set here as well so that iterator('record') works before set_record_ids()
        self.preprocess = None
        self.memory = {}
        expr = re.compile(address_pattern)
        for name in labels:
            self.memory[name] = {
                'object': None,
                'parent': '',
                'children': [],
                'addresses': p_copy.deepcopy(labels[name]),
            }
        for name in self.memory:
            addresses = self.memory[name]['addresses']
            for i, address in enumerate(addresses):
                match = expr.match(address)
                if not match:
                    print('Invalid address:  %s %s' % (name, address))
                    continue
                parent_name = match.group('parent')
                if not parent_name:
                    continue
                if parent_name not in self.memory:
                    print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_NO_SUCH_FIELD'] % (parent_name))
                    continue
                self.memory[name]['parent'] = parent_name
                # now make parent aware of children
                if name not in self.memory[parent_name]['children']:
                    self.memory[parent_name]['children'].append(name)
                level = self.determine_level(parent_name)
                # The parent name is the prefix of the address; replace only
                # that prefix so that the same text occurring later in the
                # address (e.g. in the regexp part) is left untouched.
                addresses[i] = address.replace(parent_name, '/' * level, 1)
        # special case 'record'
        self.memory['record'] = {'object': None, 'parent': '', 'children': []}

    def set_record_ids(self, recIDs, preprocess=None):
        '''
        Initialize the translator with the set of record IDs.
        The first record, if any, is loaded (and preprocessed) immediately.
        @param recIDs a list of the record IDs
        @param preprocess an optional function which acts on every record structure after creating it
        This can be used to enrich the record with fields not present in the record initially,
        verify the record data or whatever plausible.
        Another solution is to use external function elements.
        @return None
        '''
        self.record = None
        self.recIDs = recIDs
        self.preprocess = preprocess
        if self.recIDs:
            self.recID_index = 0
            self.recID = self.recIDs[self.recID_index]
            self.record = get_record(self.recID)
            if self.preprocess:
                self.preprocess(self.record)
        return None

    def determine_level(self, name):
        '''
        Determine the type of the variable, whether this is an instance or a subfield.
        This is done by observing the first provided address for the name.
        todo: define variable types in config file, remove this function, results in a clearer concept
        @param name the name of the variable
        @return 2 for a subfield (address has a code or a regexp part), 1 for an instance (address has only a tag), 0 otherwise
        '''
        level = 0  # default value
        if name not in self.memory:
            return level
        # the 'record' entry has no 'addresses' key
        addresses = self.memory[name].get('addresses')
        if not addresses:
            return level
        match = re.match(address_pattern, addresses[0])
        if match:
            if match.group('reg') or match.group('code'):
                level = 2  # subfield
            elif match.group('tag'):
                level = 1  # instance
        return level

    #=========================================
    #API functions for querying the translator
    #=========================================
    def is_defined(self, name):
        '''
        Check whether a variable is defined.
        @param name the name of the variable
        @return True if the name is known to the translator, False otherwise
        '''
        return name in self.memory

    def get_num_elements(self, name):
        '''
        An API function to get the number of elements for a variable.
        Do not use this function to build loops, Use iterator instead.
        For any name other than 'record' the count is obtained by running the
        iterator to its end.
        '''
        if name == 'record':
            return len(self.recIDs)
        num = 0
        for _unused in self.iterator(name):
            num += 1
        return num

    def get_value(self, name, display_type='value'):
        '''
        The API function for querying the translator for values of a certain variable.
        Called in a loop will result in a different value each time.
        Objects are cached in memory, so subsequent calls for the same variable take less time.
        @param name the name of the variable you want the value of
        @param display_type an optional value for the type of the desired output, one of: value, tag, ind1, ind2, code, fulltag;
        These can be easily added in the proper place of the code (display_value)
        @return a string; always '' for the reserved name 'record'
        '''
        if name == 'record':
            return ''
        return self.display_record(self.get_object(name), display_type)

    def iterator(self, name):
        '''
        An iterator over the values of a certain name.
        The iterator changes state of internal variables and objects.
        When calling get_value in a loop, this will result each time in a different value.
        For 'record' the current record is replaced on every step and the record ID is yielded as a string.
        '''
        if name == 'record':
            for rec_id in self.recIDs:
                self.recID = rec_id
                self.record = get_record(self.recID)
                if self.preprocess:
                    self.preprocess(self.record)
                yield str(self.recID)
        else:
            full_object = self.build_object(name)
            level = self.determine_level(name)
            for new_object in record_parts(full_object, level):
                self.memory[name]['object'] = new_object
                # parent has changed state; also set childs state to None;
                for children_name in self.memory[name]['children']:
                    self.memory[children_name]['object'] = None
                yield new_object
            # the result for a call of the same name after an iterator should be the same as if there was no iterator called before
            self.memory[name]['object'] = None

    def call_function(self, function_name, parameters=None):
        '''
        Call an external element which is a Python file, using BibFormat
        @param function_name the name of the function to call
        @param parameters a dictionary of the parameters to pass as key=value pairs
        @return a string value, which is the result of the function call
        '''
        if parameters is None:
            parameters = {}
        bfo = BibFormatObject(self.recID)
        format_element = get_format_element(function_name)
        # to do: check errors from function call (the errors are currently ignored)
        (value, _errors) = eval_format_element(format_element, bfo, parameters)
        return value

    #========================================
    #end of API functions
    #========================================
    def get_object(self, name):
        '''
        Responsible for creating the desired object, corresponding to provided name.
        If object is not cached in memory, it is build again.
        Directly called by API function get_value.
        The result is then formatted by display_record according to display_type.
        '''
        cached = self.memory[name]['object']
        if cached is not None:
            return cached
        new_object = self.build_object(name)
        # if you have reached here you are not in an iterator; return first non-empty
        level = self.determine_level(name)
        for tmp_object in record_parts(new_object, level):
            if tmp_object:
                new_object = tmp_object
                break
        self.memory[name]['object'] = new_object
        return new_object

    def build_object(self, name):
        '''
        Build the object from the list of addresses
        A slave function for get_object.
        Every address is applied to the current record or, when the name has
        a parent, to the parent's current object; the results are merged.
        '''
        new_object = {}
        parent_name = self.memory[name]['parent']
        for address in self.memory[name]['addresses']:
            if parent_name:
                source = self.get_object(parent_name)  # already returns the parents instance
            else:
                source = self.record
            new_object = merge(new_object, copy(source, address))
        return new_object

    def display_record(self, record, display_type='value'):
        '''
        Decide what the final output value is according to the display_type.
        Only the first tag (in sorted order), its first instance and the first
        subfield code (in sorted order) of that instance are considered.
        @param record the record structure to display; this is most probably just a single subfield
        @param display_type a string specifying the desired output; can be one of: value, tag, ind1, ind2, code, fulltag; an empty value means 'value'
        @return a string to output; '' if the display type is invalid
        '''
        tag, ind1, ind2, code, value = '', '', '', '', ''
        if record:
            fulltag = sorted(record.keys())[0]
            tag, ind1, ind2 = fulltag[0:3], fulltag[3:4], fulltag[4:5]
            field_instances = record[fulltag]
            if field_instances:
                field_instance = field_instances[0]
                codes = sorted(field_instance.keys())
                if codes:
                    code = codes[0]
                    value = field_instance[code]
        if not display_type:
            display_type = 'value'
        outputs = {
            'value': value,
            'tag': tag,
            # an indicator stored as '_' is displayed as a blank
            'ind1': ind1.replace('_', ' '),
            'ind2': ind2.replace('_', ' '),
            'code': code,
            'fulltag': tag + ind1 + ind2,
        }
        if display_type not in outputs:
            print(CFG_BIBFORMAT_BFX_ERROR_MESSAGES['ERR_BFX_INVALID_DISPLAY_TYPE'] % (display_type))
            return ''
        return outputs[display_type]
'''
Functions for use with the structure representing a MARC record defined here.
This record structure differs from the one defined in bibrecord.
The reason is that we want a symmetry between controlfields and datafields.
In this format controlfields are represented internally as a subfield value with code ' ' of a datafield.
This allows for easier handling of the fields.
However, there is a restriction associated with this structure and it is that subfields cannot be repeated
in the same instance. If this is the case, the result will be incorrect.
The record structure has the form:
fields={field_tag:field_instances}
field_instances=[field_instance]
field_instance={field_code:field_value}
'''
def convert_record(old_record):
    '''
    Convert a record from the format defined in bibrecord to the format defined here
    Tags whose integer value is below 11 are treated as controlfields: only
    their first instance is kept and its value is stored under the code ' '.
    All other tags are treated as datafields; the new tag is the old tag
    followed by both indicators, where an empty or blank indicator becomes '_'.
    If a subfield code is repeated within one instance an error is printed
    and the last value wins.
    @param old_record the record as returned from bibrecord.create_record()
    @return a record of the new form
    '''
    fields = {}
    for old_tag in sorted(old_record.keys()):
        if int(old_tag) < 11:
            # controlfields
            fields[old_tag] = [{' ': old_record[old_tag][0][3]}]
            continue
        # datafields
        for old_field_instance in old_record[old_tag]:
            ind1 = old_field_instance[1]
            if not ind1 or ind1 == ' ':
                ind1 = '_'
            ind2 = old_field_instance[2]
            if not ind2 or ind2 == ' ':
                ind2 = '_'
            new_tag = old_tag + ind1 + ind2
            new_field_instance = {}
            for old_subfield in old_field_instance[0]:
                new_code = old_subfield[0]
                new_value = old_subfield[1]
                if new_code in new_field_instance:
                    print('Error: Repeating subfield codes in the same instance!')
                new_field_instance[new_code] = new_value
            if new_tag not in fields:
                fields[new_tag] = []
            fields[new_tag].append(new_field_instance)
    return fields
def get_record(recID):
    '''
    Fetch the record with the given ID and convert it to the local structure.
    The record is obtained from a BibFormatObject created for recID and then
    passed through convert_record().
    @param recID the ID of the record
    @return a record in the structure defined here
    '''
    bibrecord_structure = BibFormatObject(recID).get_record()
    return convert_record(bibrecord_structure)
def print_record(record):
    '''
    Print a record.
    One line is printed per field instance, in sorted tag order: the tag
    followed by the instance dictionary.
    @param record a record in the structure defined here
    '''
    for tag in sorted(record.keys()):
        for field_instance in record[tag]:
            print('%s %s' % (tag, field_instance))
def record_fields_value(record, tag, subfield):
    '''
    Return a list of all the fields with a certain tag and subfield code.
    Works on subfield level.
    @param record a record
    @param tag a 3 or 5 letter tag; required
    @param subfield a subfield code; required
    @return the list of values, in instance order; empty if the tag or the subfield is not present
    '''
    return [field_instance[subfield]
            for field_instance in record.get(tag, [])
            if subfield in field_instance]
def record_add_field_instance(record, tag, field_instance):
    '''
    Put a field_instance in front of the instances already stored for a tag.
    The tag is created if the record does not contain it yet. The list stored
    under the tag is replaced by a new list; the previous list object is not
    modified.
    @param record a record; modified in place
    @param tag a 3 or 5 letter tag; required
    @param field_instance the field instance to add
    @return None
    '''
    existing_instances = record.get(tag, [])
    record[tag] = [field_instance] + existing_instances
    return None
def record_num_parts(record, level):
    '''
    Count the number of instances or the number of subfields in the whole record.
    The counting is done by exhausting record_parts(record, level).
    @param record
    @param level either 1 or 2
    level=1 - view record on instance level
    level=2 - view record on subfield level
    @return the number of parts
    '''
    # The count used to be accumulated but never returned, so every caller
    # received None.
    return sum(1 for _ in record_parts(record, level))
def record_parts(record, level):
    '''
    Generate the instances or the subfields of a record, one at a time.
    Every yielded part is a freshly built record holding a single tag with a
    single instance, so modifying a part does not touch the original record.
    Tags are visited in sorted order; on level 2 the subfield codes of each
    instance are visited in sorted order as well.
    Any other level yields nothing.
    @param record
    @param level either 1 or 2
    level=1 - iterate over instances
    level=2 - iterate over subfields
    @yield a record structure representing the part (instance or subfield)
    '''
    if level == 1:
        for tag in sorted(record.keys()):
            for instance in record[tag]:
                # one part per instance, carrying a copy of all its subfields
                yield {tag: [dict(instance)]}
    elif level == 2:
        for tag in sorted(record.keys()):
            for instance in record[tag]:
                for code in sorted(instance.keys()):
                    # one part per subfield
                    yield {tag: [{code: instance[code]}]}
def copy(old_record, address=''):
    '''
    Copy a record, keeping only the parts of the old record selected by address.
    (A better name for the function is filter.)
    The address is parsed with the module-level address_pattern into a tag
    pattern, a subfield code pattern and a regular expression. In the tag and
    code patterns '?' stands for a single character; a missing pattern
    matches everything. A tag must be matched completely, a code only from
    its beginning. Instances and tags left without any subfield are dropped.
    If a regular expression is given, every retained value is replaced by the
    result of pass_through_regexp().
    @param old_record the initial record
    @param address an address; for examples see bibformat_bfx_engine_config.
    If no address is specified, return the initial record.
    @return the filtered record; {} if old_record is empty or None
    '''
    if not old_record:
        return {}

    tag_pattern = code_pattern = reg_pattern = ''
    parsed_address = re.compile(address_pattern).match(address)
    if parsed_address:
        tag_pattern = parsed_address.group('tag')
        code_pattern = parsed_address.group('code')
        reg_pattern = parsed_address.group('reg')

    # turn the '?' wildcards into regular expression character classes
    if tag_pattern:
        tag_pattern = tag_pattern.replace('?', r'[0-9_\w]')
    else:
        tag_pattern = r'.*'
    if code_pattern:
        code_pattern = code_pattern.replace('?', r'[\w ]')
    else:
        code_pattern = r'.*'
    tag_expr = re.compile(tag_pattern)
    code_expr = re.compile(code_pattern)

    new_record = {}
    for tag in old_record.keys():
        tag_match = tag_expr.match(tag)
        # the pattern has to cover the whole tag
        if not tag_match or tag_match.end() != len(tag):
            continue
        kept_instances = []
        for old_instance in old_record[tag]:
            kept_subfields = {}
            for code in old_instance.keys():
                if code_expr.match(code):
                    kept_subfields[code] = old_instance[code]
            if kept_subfields:
                kept_instances.append(kept_subfields)
        if kept_instances:
            new_record[tag] = kept_instances

    #in new_record pass all subfields through regexp
    if reg_pattern:
        for tag in new_record:
            for instance in new_record[tag]:
                for code in list(instance.keys()):
                    instance[code] = pass_through_regexp(instance[code], reg_pattern)
    return new_record
def merge(record1, record2):
    '''
    Merge two records into a new one.
    The result starts as a copy of record1; the instances of record2 are
    copied and added to it. Empty instances of record2 are skipped.
    Controlfields (3 letter tags) of record2 are only taken over when the tag
    is not present yet, i.e. controlfields with the same tag in record2 as in
    record1 are ignored. Datafields (5 letter tags) of record2 are appended
    after the instances already present. Tags of any other length are skipped.
    @param record1, record2
    @return the merged record
    '''
    merged = {}
    if record1:
        merged = copy(record1)
    if not record2:
        return merged

    for tag in record2.keys():
        # copy the non-empty instances so that the result does not share
        # dictionaries with record2
        incoming = [dict(instance) for instance in record2[tag] if instance]
        if not incoming:
            continue
        if len(tag) == 3:
            #controlfield: if controlfields conflict, leave first
            if tag not in merged:
                merged[tag] = incoming
        elif len(tag) == 5:
            #datafield: always append
            merged.setdefault(tag, []).extend(incoming)
    return merged
#======================
#Helper functions
#=====================
# Output modes for create_xml_element(element_type=...):
#   xmlopen  - opening tag only:       <element attr="attr_value">
#   xmlclose - closing tag only:       </element>
#   xmlfull  - complete element:       <element attr="attr_value">value</element>
#   xmlempty - empty element:          <element attr="attr_value"/>
xmlopen, xmlclose, xmlfull, xmlempty = 1, 2, 3, 4
def _xml_to_utf8(text):
    '''
    Return text encoded in utf-8 if it is a unicode object; any other
    object is returned unchanged.
    '''
    if isinstance(text, type(u'')):
        return text.encode('utf-8')
    return text


def _xml_format_attributes(attrs):
    '''
    Render a dictionary of attributes as a string of ' name="value"' pairs.
    '''
    return ''.join([' %s="%s"' % (_xml_to_utf8(attrname), _xml_to_utf8(attrvalue))
                    for attrname, attrvalue in attrs.items()])


def create_xml_element(name, value='', attrs=None, element_type=xmlfull, level=0):
    '''
    Create a XML element as string.
    Unicode names, values, attribute names and attribute values are encoded
    in utf-8 before the string is assembled, so that unicode and utf-8
    encoded byte strings can be mixed in one call. All element types render
    their attributes in the same way.
    Neither the value nor the attribute values are escaped here; they are
    inserted as given.
    @param name the name of the element
    @param value the element value; default is ''
    @param attrs a dictionary with the element attributes
    @param element_type a constant which defines the type of the output
    xmlopen = 1 <element attr="attr_value">
    xmlclose = 2 </element>
    xmlfull = 3 <element attr="attr_value">value</element>
    xmlempty = 4 <element attr="attr_value"/>
    Any other element_type produces an empty element string.
    @param level the number of spaces to put in front of the element
    @return a formatted XML string
    '''
    if attrs is None:
        attrs = {}
    name = _xml_to_utf8(name)

    output = ''
    if element_type == xmlempty:
        output = '<' + name + _xml_format_attributes(attrs) + ' />'
    elif element_type == xmlfull:
        output = '<' + name + _xml_format_attributes(attrs) + '>'
        output += _xml_to_utf8(value)
        output += '</' + name + '>'
    elif element_type == xmlopen:
        output = '<' + name + _xml_format_attributes(attrs) + '>'
    elif element_type == xmlclose:
        output = '</' + name + '>'

    # indent, and make sure that no unicode object leaves the function
    return _xml_to_utf8(' ' * level + output)
def xml_escape(value):
    '''
    Escape a string value so that it can be placed in XML content.
    The work is delegated to saxutils.escape() with its default entities.
    @param value the string value to escape
    @return escaped value
    '''
    escaped_value = saxutils.escape(value)
    return escaped_value
def xml_unescape(value):
    '''
    Reverse the escaping of a string value taken from XML content.
    The work is delegated to saxutils.unescape() with its default entities.
    @param value the string value to unescape
    @return unescaped value
    '''
    unescaped_value = saxutils.unescape(value)
    return unescaped_value
def node_has_subelements(node):
    '''
    Tell whether a node has at least one child that is an element node or
    a text node. Children of other types (e.g. comments) do not count.
    @param node the node to check
    @return True if such childnodes exist, False otherwise.
    '''
    relevant_types = (Node.ELEMENT_NODE, Node.TEXT_NODE)
    for child in node.childNodes:
        if child.nodeType in relevant_types:
            return True
    return False
def get_node_subelement(parent_node, name, namespace=None):
    '''
    Find the first child element with a given local name and namespace.
    Only direct children of parent_node are examined.
    @param parent_node the node to check
    @param name the name to search
    @param namespace An optional namespace URI. This is usually a URL: http://cdsware.cern.ch/invenio/
    @return the found node; None otherwise
    '''
    for child in parent_node.childNodes:
        if child.nodeType != Node.ELEMENT_NODE:
            continue
        if child.localName == name and child.namespaceURI == namespace:
            return child
    return None
def get_node_value(node):
    '''
    Return the value of a node, encoded in utf-8. For use with text nodes.
    @param node a text node
    @return a string of the nodevalue encoded in utf-8
    '''
    raw_value = node.nodeValue
    return raw_value.encode('utf-8')
def get_node_namespace(node):
    '''
    Return the namespace URI of a node. For use with element nodes.
    @param node an element node
    @return the namespace of the node
    '''
    namespace = node.namespaceURI
    return namespace
def get_node_name(node):
    '''
    Return the name of a node. For use with element nodes.
    @param node an element node
    @return a string of the node name
    '''
    name = node.nodeName
    return name
def get_node_attributes(node):
    '''
    Collect the attributes of an element node. For use with element nodes
    @param node an element node
    @return a dictionary of the attributes as key:value pairs
    '''
    attribute_map = node.attributes
    return dict((attrname, attribute_map.get(attrname).nodeValue)
                for attrname in attribute_map.keys())
def pass_through_regexp(value, regexp):
    '''
    Filter a value through a regular expression.
    The expression is matched against the beginning of the value.
    @param value a string
    @param regexp a regexp with a group 'value' in it. No group named 'value' will result in an error.
    @return if the string matches the regexp, return named group 'value', otherwise return ''
    '''
    found = re.match(regexp, value)
    if found is None:
        return ''
    return found.group('value')
def is_number(value):
    '''
    Check if a value is a number, i.e. whether float() accepts it.
    @param value the value to check
    @return True if float(value) succeeds; False otherwise, including for
    values such as None that float() rejects because of their type
    '''
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: a string that does not spell a number
        # TypeError: an object float() cannot convert at all (None, list, ...)
        return False
    return True
Event Timeline
Log In to Comment