Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F87624786
reader.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Oct 13, 18:43
Size
24 KB
Mime Type
text/x-python
Expires
Tue, Oct 15, 18:43 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
21585597
Attached To
R3600 invenio-infoscience
reader.py
View Options
# -*- coding: utf-8 -*-
##
## This file is part of Invenio.
## Copyright (C) 2013, 2014 CERN.
##
## Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""Default JSONAlchemy reader.
It provides the common functionality used by the readers.
Typically this class should be used as a factory to create the concrete
reader depending on the master format of the input.
>>> from invenio.modules.jsonalchemy.reader import Reader
>>> from invenio.modules.readers.api import Record
>>> record = Reader.translate(blob, 'marc', Record, model=['picture'])
"""
import
itertools
import
datetime
import
six
from
invenio.base.utils
import
try_to_eval
from
.errors
import
ReaderException
from
.parser
import
FieldParser
,
ModelParser
from
.registry
import
functions
,
readers
def split_blob(blob, master_format, slice_size=0, **kwargs):
    """Split a blob into records using the reader of ``master_format``.

    The actual splitting is delegated to the ``split_blob`` method of the
    reader registered in ``readers`` under ``master_format``.

    :param blob: incoming blob, handed untouched to the reader
    :param master_format: key used to look the reader up in ``readers``
    :param slice_size: when ``0`` the result of the reader is returned as
        is; otherwise the records are grouped in tuples holding at most
        ``slice_size`` items each
    :param kwargs: extra keyword arguments forwarded to the reader
    :returns: whatever the reader returns, or a generator of tuples when
        ``slice_size`` is not ``0``
    """
    def in_slices(records, size):
        """Yield consecutive tuples of at most ``size`` records."""
        source = iter(records)
        piece = tuple(itertools.islice(source, size))
        # An empty tuple means the underlying iterator is exhausted.
        while piece:
            yield piece
            piece = tuple(itertools.islice(source, size))

    records = readers[master_format].split_blob(blob, **kwargs)
    if slice_size == 0:
        return records
    return in_slices(records, slice_size)
class Reader(object):  # pylint: disable=R0921

    """Base reader.

    Holds the machinery shared by the master-format specific readers.
    Instantiating it works as a factory: the class of the created object
    is taken from the ``readers`` registry, using the master format stored
    in the json object.

    The base implementations of ``split_blob``, ``_prepare_blob`` and
    ``_get_elements_from_blob`` raise ``NotImplementedError`` and are
    meant to be provided by the concrete readers.
    """

    def __new__(cls, json, blob=None, **kwargs):  # pylint: disable=W0613
        """Create an instance of the reader registered for the json object.

        :param json: object exposing ``additional_info.master_format``
        :param blob: unused here, consumed by ``__init__``
        :raises KeyError: if a ``KeyError`` occurs while resolving the
            reader (typically no reader registered for the master format)
        """
        try:
            master_format = json.additional_info.master_format
            return super(Reader, cls).__new__(readers[master_format])
        except KeyError as e:
            # ``e.message`` only exists on Python 2; ``args`` is available
            # on both Python 2 and 3.
            missing = e.args[0] if e.args else e
            raise KeyError("Not reader found for '%s'" % (missing, ))

    def __init__(self, json, blob=None, **kwargs):
        """Store the json object and the blob to work with.

        :param json: json object the reader operates on
        :param blob: blob to read from; when ``None`` and ``no_blob`` is
            not set, ``json.get_blob()`` is used instead
        :param kwargs: ``no_blob=True`` keeps ``blob`` as given (even if
            ``None``) without calling ``json.get_blob()``
        """
        if blob is not None or kwargs.get('no_blob', False):
            self._blob = blob
        else:
            self._blob = json.get_blob()
        self._json = json
        # (json_id, field_name) pairs already handled by ``_unpack_rule``.
        self._parsed = []

    @staticmethod
    def split_blob(blob, schema=None, **kwargs):
        """Split a blob holding several records into single records.

        Must be implemented by the concrete readers.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    @classmethod
    def translate(cls, blob, json_class, master_format='json', **kwargs):
        """Transform the incoming blob into a ``json_class`` object.

        The rules described in the field and model definitions are used.

        :param blob: incoming blob (like MARC)
        :param json_class: Any subclass of
            :class:`~invenio.modules.jsonalchemy.wrappers.SmartJson`
        :param master_format: Master format of the input blob.
        :param kwargs: parameter to pass to json_class
        :return: New object of ``json_class`` type containing the result of
            the translation
        :raises ReaderException: if ``blob`` is ``None`` or ``json_class``
            is not a subclass of ``SmartJson``
        """
        from .wrappers import SmartJson

        if blob is None:
            raise ReaderException(
                "To perform a 'translate' operation a blob is needed")
        if not issubclass(json_class, SmartJson):
            raise ReaderException("The json class must be of type 'SmartJson'")

        json = json_class(master_format=master_format, **kwargs)
        # fill up with all possible fields
        fields = ModelParser.resolve_models(
            json.model_info.names,
            json.additional_info.namespace).get('fields')
        cls.add(json, fields, blob, fetch_model_info=True)
        return json

    @classmethod
    def add(cls, json, fields, blob=None, fetch_model_info=False):
        """Add the given fields to the json structure.

        :param json: Any ``SmartJson`` object
        :param fields: Dict of fields to be added to the json structure
            containing field_name:json_id. A single field name or a
            list/tuple of field names is also accepted; the json ids are
            then looked up in the fields of the current models (defaulting
            to the field name itself).
        :param blob: blob to read from, see ``__init__``
        :param fetch_model_info: if ``True`` the model information is
            processed before the fields are added
        """
        reader = cls(json, blob)
        reader._prepare_blob()

        if fetch_model_info:
            reader._process_model_info()

        if isinstance(fields, six.string_types):
            fields = (fields, )
        if isinstance(fields, (list, tuple)):
            model_fields = ModelParser.resolve_models(
                json.model_info.names,
                json.additional_info.namespace).get('fields')
            fields = dict(
                (field_name, model_fields.get(field_name, field_name))
                for field_name in fields)

        # The mapping is field_name -> json_id (see the docstring, the
        # branch above and ``set``); the previous unpacking had the two
        # names swapped.
        for field_name, json_id in six.iteritems(fields):
            reader._unpack_rule(json_id, field_name)

        reader._post_process_json()

    @classmethod
    def set(cls, json, field, value=None, set_default_value=False):
        """Attach the meta-metadata of a new field to the json object.

        When adding a new field to the json object finds as much
        information about it as possible and attaches it to the json object
        inside ``json['__meta_metadata__'][field]``.

        :param json: Any ``SmartJson`` object
        :param field: Name of the new field to be added
        :param value: New value for the field (if not ``None``)
        :param set_default_value: If set to ``True`` (and no value is
            given) looks for the default value if any and sets it.
        """
        def lookup_json_id():
            """Return the json id of ``field`` in the current models."""
            # ``model_info.names`` is what every other caller of
            # ``resolve_models`` in this class passes.
            model = ModelParser.resolve_models(
                json.model_info.names, json.additional_info.namespace)
            return model['fields'].get(field, field)

        reader = None
        json_id = None
        if field not in json.meta_metadata:
            # We don't have any meta_metadata, look for it.
            reader = cls(json=json, no_blob=True)
            json_id = lookup_json_id()
            json['__meta_metadata__'][field] = \
                reader._find_field_metadata(json_id, field)

        # Falsy values such as ``0`` or ``''`` are legitimate values; only
        # ``None`` means "no value given".
        if value is not None:
            json[field] = value
        elif set_default_value:
            if reader is None:
                reader = cls(json=json, no_blob=True)
                json_id = lookup_json_id()
            reader._set_default_value(json_id, field)
            reader._evaluate_after_decorators(field)

    @classmethod
    def update(cls, json, fields, blob=None, update_db=False):
        """Update the given fields of the json structure.

        :param json: Any ``SmartJson`` object
        :param fields: Fields to be updated, handed to ``_update``
        :param blob: incoming blob (like MARC), if falsy ``json.get_blob``
            is used to retrieve it.
        :param update_db: If set to ``True`` ``json.update()`` is called
            once the fields have been updated.
        """
        reader = cls(json=json, blob=blob if blob else json.get_blob())
        reader._update(fields)
        if update_db:
            json.update()

    @classmethod
    def process_model_info(cls, json):
        """Process the model information of ``json``.

        Fetches all the possible information about the current models and
        applies all the model extensions `evaluate` methods if any
        extension is used.
        """
        reader = cls(json, no_blob=True)
        reader._process_model_info()

    @classmethod
    def update_meta_metadata(cls, json, blob=None, fields=None, section=None,
                             keep_core_values=True, store_backup=True):
        """Update the meta-metadata for a given set of fields.

        Thin wrapper that builds a reader and delegates to
        ``_update_meta_metadata`` with the same arguments.
        """
        reader = cls(json, blob)
        reader._update_meta_metadata(fields, section, keep_core_values,
                                     store_backup)

    def _process_model_info(self):
        """Guess the model if needed and evaluate the model extensions.

        If the model names of the json are exactly ``['__default__']`` they
        are replaced by the result of ``_guess_model_from_input``. Then,
        for every key of the resolved model other than ``fields`` and
        ``bases``, the matching model parser extension is evaluated against
        the json.
        """
        if self._json.model_info.names == ['__default__']:
            self._json['__meta_metadata__']['__model_info__']['names'] = \
                self._guess_model_from_input()

        model = ModelParser.resolve_models(
            self._json.model_info.names,
            self._json.additional_info.namespace)
        for key, value in six.iteritems(model):
            if key in ('fields', 'bases'):
                continue
            ModelParser.parser_extensions()[key].evaluate(self._json, value)

    def _guess_model_from_input(self):
        """Dummy method to guess the model of a given input.

        Should be redefined in the dedicated readers.

        :return: List of models found in the blob, always
            ``['__default__']`` in this base class
        """
        return ['__default__']

    def _prepare_blob(self, *args, **kwargs):
        """Transform the blob before the translation begins.

        It should create a common structure that all the methods, specially
        ``_get_elements_from_blob`` understand.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def _post_process_json(self):
        """Transform the json structure after it is created.

        E.g. pruning the json to delete singletons. Does nothing in this
        base class.
        """
        pass

    def _get_elements_from_blob(self, regex_key):
        """Return the elements of the blob matching ``regex_key``.

        Like ``get`` for a normal python dictionary but in this case it
        should handle 'entire_record' and '*' as key.

        :param regex_key: key to access the intermediate structure, could
            be a plain string or a python regular expression.
        :return: List containing the values matching the regex_key
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()

    def _unpack_rule(self, json_id, field_name=None):
        """Apply the rules of a field definition to fill up the json.

        :param json_id: key to access the field description in
            ``FieldParser.field_definitions``
        :param field_name: future name of the field in the json structure,
            if ``None`` json_id will be used.
        :return: ``False`` if there is no definition for ``json_id`` (a
            continuable error is recorded); otherwise whether
            ``field_name`` is present in the json afterwards.
        """
        try:
            rule = FieldParser.field_definitions(
                self._json.additional_info.namespace)[json_id]
        except KeyError:
            self._json.continuable_errors.append(
                "Error - Unable to find '%s' field definition" % (json_id, ))
            return False

        if not field_name:
            field_name = json_id

        # Each (json_id, field_name) pair is processed only once.
        if (json_id, field_name) in self._parsed:
            return field_name in self._json

        self._parsed.append((json_id, field_name))

        # In these two method calls the decorators are never applied
        # because of default types, i.e. when keywords are evaluated the
        # first keyword which is parsed creates a string not a list,
        # therefore all the extensions and decorators that are expecting a
        # list will fail.
        self._apply_rules(json_id, field_name, rule)
        self._apply_virtual_rules(json_id, field_name, rule)

        self._set_default_value(json_id, field_name)
        self._set_default_type(json_id, field_name)
        self._evaluate_after_decorators(field_name)

        return field_name in self._json

    def _apply_rules(self, json_id, field_name, rule):
        """Try to apply a 'creator' rule.

        Any exception raised while evaluating a rule for one element is
        recorded in ``self._json.errors`` and the processing continues with
        the next element.

        :param json_id: Name of the json field in the configuration file.
        :param field_name: Final name of the field, taken from the model
            definition, if any, otherwise is equal to the `json_id`
        :param rule: Current rule for the `json_id`
        """
        master_format = self._json.additional_info.master_format
        for field_def in rule['rules'].get(master_format, []):
            if not self._evaluate_before_decorators(field_def):
                continue
            for elements in \
                    self._get_elements_from_blob(field_def['source_tags']):
                if not isinstance(elements, (list, tuple)):
                    elements = (elements, )
                for element in elements:
                    if not self._evaluate_on_decorators(field_def, element):
                        continue
                    try:
                        value = try_to_eval(
                            field_def['function'],
                            functions(self._json.additional_info.namespace),
                            value=element, self=self._json)
                        self._remove_none_values(value)
                        info = self._find_field_metadata(
                            json_id, field_name, 'creator', field_def)
                        self._json['__meta_metadata__'][field_name] = info
                        self._json.__setitem__(
                            field_name, value, extend=True,
                            exclude=['decorators', 'extensions'])
                    except Exception as e:
                        self._json.errors.append(
                            "Rule Error - Unable to apply rule for field "
                            "'%s' with value '%s'.\n%s"
                            % (field_name, element, str(e)),)

    def _apply_virtual_rules(self, json_id, field_name, rule):
        """Try to apply either a 'derived' or 'calculated' rule.

        'calculated' rules are processed before 'derived' ones. Any
        exception raised while evaluating a rule is recorded in
        ``self._json.errors``.

        :param json_id: Name of the json field in the configuration file.
        :param field_name: Final name of the field, taken from the model
            definition, if any, otherwise is equal to the `json_id`
        :param rule: Current rule for the `json_id`
        """
        field_defs = [
            ('calculated', rule['rules'].get('calculated', [])),
            ('derived', rule['rules'].get('derived', [])),
        ]
        for (field_type, _field_def) in field_defs:
            for field_def in _field_def:
                if not self._evaluate_before_decorators(field_def):
                    continue
                try:
                    value = try_to_eval(
                        field_def['function'],
                        functions(self._json.additional_info.namespace),
                        self=self._json)
                    self._remove_none_values(value)
                    info = self._find_field_metadata(
                        json_id, field_name, field_type, field_def)
                    self._json['__meta_metadata__'][field_name] = info
                    self._json.__setitem__(
                        field_name, value, extend=False,
                        exclude=['decorators', 'extensions'])
                except Exception as e:
                    self._json.errors.append(
                        "Rule Error - Unable to apply rule for virtual "
                        "field '%s'.\n%s" % (field_name, str(e)),)

    def _set_default_value(self, json_id, field_name):
        """Find the default value inside the schema, if any, and set it.

        If the field already has a value, the default is set first and the
        old value is then merged on top of it (``update``), or simply
        restored when the stored default has no ``update`` method.
        """
        def set_default_value(field, schema):
            """Helper function to allow subfield default values."""
            if 'default' in schema:
                return schema['default']()
            elif 'schema' in schema:
                default = dict()
                for key, value in six.iteritems(schema['schema']):
                    default[key] = set_default_value(key, value)
                return default
            return None

        exclude = ['decorators', 'extensions']
        field_schema = FieldParser.field_definitions(
            self._json.additional_info.namespace)[json_id] \
            .get('schema', {}).get(json_id, {})
        value = set_default_value(field_name, field_schema)
        if value is None:
            return

        if field_name not in self._json._dict_bson:
            # NOTE(review): the current value of the field is passed in
            # the ``field_type`` position of ``_find_field_metadata`` --
            # confirm this is intended.
            info = self._find_field_metadata(
                json_id, field_name, self._json.get(field_name))
            self._json['__meta_metadata__'][field_name] = info
            self._json.__setitem__(field_name, value, extend=False,
                                   exclude=list(exclude))
        else:
            old_value = self._json.__getitem__(field_name,
                                               exclude=list(exclude))
            self._json.__setitem__(field_name, value, extend=False,
                                   exclude=list(exclude))
            try:
                self._json._dict_bson[field_name].update(old_value)
            except AttributeError:
                # The default is not a mapping: keep the previous value.
                self._json.__setitem__(field_name, old_value, extend=False,
                                       exclude=list(exclude))

    def _set_default_type(self, json_id, field_name):
        """Find the default type inside the schema, if `force` is used."""
        from .validator import Validator

        def set_default_type(field, schema):
            """Helper function to allow subfield default values."""
            if 'type' in schema and schema.get('force', False):
                # NOTE(review): ``field`` (the dotted sub-field path built
                # by the recursion below) is not used here; the top-level
                # ``field_name`` is always passed -- confirm intended.
                Validator.force_type(
                    self._json._dict_bson, field_name, schema['type'])
            elif 'schema' in schema:
                for key, value in six.iteritems(schema['schema']):
                    set_default_type('%s.%s' % (field, key), value)

        # Nothing to force if the field has no value.
        if field_name not in self._json._dict_bson:
            return

        schema = FieldParser.field_definitions(
            self._json.additional_info.namespace)[json_id] \
            .get('schema', {}).get(json_id, {})
        set_default_type(field_name, schema)

    def _remove_none_values(self, obj):
        """Handy method to remove recursively None values from obj.

        Dictionaries and lists are modified in place; any other kind of
        object is left untouched.
        """
        if isinstance(obj, dict):
            for key in list(obj.keys()):
                if obj[key] is None:
                    del obj[key]
                else:
                    self._remove_none_values(obj[key])
        elif isinstance(obj, list):
            # Filter in one go: removing items while iterating over the
            # same list skips the element following every removal.
            obj[:] = [item for item in obj if item is not None]
            for item in obj:
                self._remove_none_values(item)

    def _update(self, fields):
        """From the list of field names it tries to update their content.

        :raises NotImplementedError: always, not implemented yet
        """
        # TODO
        raise NotImplementedError('Missing implementation in current version')

    def _find_field_metadata(self, json_id, field_name,
                             field_type=None, field_def=None):
        """Build the meta-metadata of a field.

        Given one field definition fills up the parallel dictionary with
        the needed meta-metadata, including field extensions and after
        decorators.

        If the information regarding the field definition is not present,
        the first one available will be used: first ``creator`` rules for
        the master format of the json, then ``derived`` and finally
        ``calculated``. For each of them if more than one definition is
        present the first one will be used.

        If no rule is found the field info will be tagged as ``UNKNOWN``.
        The aliases of the rule are also registered in
        ``__meta_metadata__['__aliases__']`` of the json.

        :return: dictionary with the keys ``json_id``, ``timestamp``,
            ``pid``, ``type``, ``function``, ``after`` and ``ext``
        """
        try:
            rule = FieldParser.field_definitions(
                self._json.additional_info.namespace)[json_id]
        except KeyError:
            self._json.continuable_errors.append(
                "Adding a new field '%s' ('%s') without definition"
                % (field_name, json_id))
            rule = {}
            field_def = {}
            field_type = 'UNKNOWN'

        if field_def is None:
            rules = rule.get('rules', {})
            master_format = self._json.additional_info.master_format
            if master_format in rules:
                field_def = rules[master_format][0]
                field_type = 'creator'
            elif 'derived' in rules:
                field_def = rules['derived'][0]
                field_type = 'derived'
            elif 'calculated' in rules:
                field_def = rules['calculated'][0]
                field_type = 'calculated'
            else:
                field_def = {}
                field_type = 'UNKNOWN'

        for alias in rule.get('aliases', []):
            self._json['__meta_metadata__']['__aliases__'][alias] = field_name

        info = {}
        info['json_id'] = json_id
        info['timestamp'] = datetime.datetime.now().isoformat()
        info['pid'] = rule.get('pid', None)
        info['type'] = field_type
        if field_type in ('calculated', 'derived'):
            # Path to the function inside the field definitions.
            info['function'] = (json_id, 'rules', field_type, 0, 'function')
        elif field_type == 'UNKNOWN':
            info['function'] = 'UNKNOWN'
        else:
            info['function'] = field_def['source_tags']

        # Decorator extensions
        info['after'] = dict()
        for name, parser in \
                six.iteritems(FieldParser.decorator_after_extensions()):
            try:
                ext = parser.add_info_to_field(
                    json_id, info,
                    field_def['decorators']['after'].get(name))
                if ext is not None:
                    info['after'][name] = ext
            except KeyError as e:
                # Only re-raise if the error is different from the
                # KeyError 'decorators' (field definition without
                # decorators). A bare ``raise`` keeps the traceback.
                if not e.args[0] == 'decorators':
                    raise

        # Field extensions
        info['ext'] = dict()
        for name, parser in six.iteritems(FieldParser.field_extensions()):
            try:
                ext = parser.add_info_to_field(json_id, rule)
                if ext is not None:
                    info['ext'][name] = ext
            except NotImplementedError:
                # Maybe your extension doesn't have anything to add to the
                # field
                pass

        return info

    def _update_meta_metadata(self, fields=None, section=None,
                              keep_core_values=True, store_backup=True):
        """Update the meta-metadata of the given fields.

        Given one field definition fills up the parallel dictionary with
        the needed meta-metadata, including field extensions and after
        decorators. If there is some information about this field in the
        json structure it will keep some core information like the source
        format.

        :raises NotImplementedError: always, not implemented yet
        """
        # TODO
        raise NotImplementedError('Missing implementation on this version')

    def _evaluate_before_decorators(self, field_def):
        """Evaluate all the before decorators (they must return a boolean).

        :return: ``False`` as soon as one decorator evaluates to a falsy
            value, ``True`` otherwise
        """
        for name, content in six.iteritems(field_def['decorators']['before']):
            if not FieldParser.decorator_before_extensions()[name] \
                    .evaluate(self, content):
                return False
        return True

    def _evaluate_on_decorators(self, field_def, master_value):
        """Evaluate all the on decorators (they must return a boolean).

        :param master_value: element of the blob the rule is applied to
        :return: ``False`` as soon as one decorator evaluates to a falsy
            value, ``True`` otherwise
        """
        for name, content in six.iteritems(field_def['decorators']['on']):
            if not FieldParser.decorator_on_extensions()[name] \
                    .evaluate(master_value,
                              self._json.additional_info.namespace,
                              content):
                return False
        return True

    def _evaluate_after_decorators(self, field_name):
        """Evaluate all the after decorators and field extensions.

        Nothing is done if the field has no value in the json.
        """
        if field_name not in self._json._dict_bson:
            return

        field_info = self._json.meta_metadata[field_name]
        for ext, args in six.iteritems(field_info['after']):
            FieldParser.decorator_after_extensions()[ext] \
                .evaluate(self._json, field_name, 'set', args)

        for ext, args in six.iteritems(field_info['ext']):
            FieldParser.field_extensions()[ext] \
                .evaluate(self._json, field_name, 'set', args)
Event Timeline
Log In to Comment